libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/* Lookup table with MAX_NEG_CROP extra entries on each side of a 256-entry
 * core; code in this file indexes it through "ff_cropTbl + MAX_NEG_CROP".
 * Zero-initialized here; presumably filled by an init routine that is not
 * part of this section (TODO confirm). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table accessed in this file as "ff_squareTbl + 256" and indexed with
 * pixel values or pixel differences. Zero-initialized here; presumably filled
 * elsewhere (TODO confirm). */
uint32_t ff_squareTbl[512] = {0, };
  45
/* Include dsputil_template.c once per bit depth (9, 10, then 8).
 * The 8-bit include comes last and is not followed by an #undef, so
 * BIT_DEPTH remains defined as 8 for the rest of this file. */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
  56
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// ~0UL/255 evaluates to 0x0101...01 for the width of unsigned long, so the
// multiplication replicates the byte constant into every byte of the word.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  60
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  71
/* Zigzag scan specific to the 2-4-8 IDCT. NOTE: unlike the
   specification, the two fields are interleaved here. Entry i is the
   raster index of the i-th coefficient in scan order. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  84
/* Inverse of ff_zigzag_direct, not permuted, with 1 added to every entry;
 * intended for the MMX quantizer. Only declared here (16-byte aligned); the
 * code that fills it is not part of this section. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
/* Alternate scan order that favours horizontal runs: entry i is the raster
 * index (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  98
/* Alternate scan order that favours vertical runs: entry i is the raster
 * index (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 109
/* Input coefficient permutation for simple_idct_mmx: 64 entries, each a
 * position in the range 0x00..0x3F. The code that consumes this table is
 * not part of this section. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 121
/* 8-entry permutation of the values 0..7; judging by the name it is the
 * row order used by the SSE2 IDCT (its user is outside this section). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 148 static int pix_sum_c(uint8_t * pix, int line_size)
 149 {
 150     int s, i, j;
 151
 152     s = 0;
 153     for (i = 0; i < 16; i++) {
 154         for (j = 0; j < 16; j += 8) {
 155             s += pix[0];
 156             s += pix[1];
 157             s += pix[2];
 158             s += pix[3];
 159             s += pix[4];
 160             s += pix[5];
 161             s += pix[6];
 162             s += pix[7];
 163             pix += 8;
 164         }
 165         pix += line_size - 16;
 166     }
 167     return s;
 168 }
 169
 170 static int pix_norm1_c(uint8_t * pix, int line_size)
 171 {
 172     int s, i, j;
 173     uint32_t *sq = ff_squareTbl + 256;
 174
 175     s = 0;
 176     for (i = 0; i < 16; i++) {
 177         for (j = 0; j < 16; j += 8) {
 178 #if 0
 179             s += sq[pix[0]];
 180             s += sq[pix[1]];
 181             s += sq[pix[2]];
 182             s += sq[pix[3]];
 183             s += sq[pix[4]];
 184             s += sq[pix[5]];
 185             s += sq[pix[6]];
 186             s += sq[pix[7]];
 187 #else
 188 #if LONG_MAX > 2147483647
 189             register uint64_t x=*(uint64_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             s += sq[(x>>32)&0xff];
 195             s += sq[(x>>40)&0xff];
 196             s += sq[(x>>48)&0xff];
 197             s += sq[(x>>56)&0xff];
 198 #else
 199             register uint32_t x=*(uint32_t*)pix;
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204             x=*(uint32_t*)(pix+4);
 205             s += sq[x&0xff];
 206             s += sq[(x>>8)&0xff];
 207             s += sq[(x>>16)&0xff];
 208             s += sq[(x>>24)&0xff];
 209 #endif
 210 #endif
 211             pix += 8;
 212         }
 213         pix += line_size - 16;
 214     }
 215     return s;
 216 }
 217
 218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 219     int i;
 220
 221     for(i=0; i+8<=w; i+=8){
 222         dst[i+0]= av_bswap32(src[i+0]);
 223         dst[i+1]= av_bswap32(src[i+1]);
 224         dst[i+2]= av_bswap32(src[i+2]);
 225         dst[i+3]= av_bswap32(src[i+3]);
 226         dst[i+4]= av_bswap32(src[i+4]);
 227         dst[i+5]= av_bswap32(src[i+5]);
 228         dst[i+6]= av_bswap32(src[i+6]);
 229         dst[i+7]= av_bswap32(src[i+7]);
 230     }
 231     for(;i<w; i++){
 232         dst[i+0]= av_bswap32(src[i+0]);
 233     }
 234 }
 235
 236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 237 {
 238     while (len--)
 239         *dst++ = av_bswap16(*src++);
 240 }
 241
 242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         pix1 += line_size;
 254         pix2 += line_size;
 255     }
 256     return s;
 257 }
 258
 259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         s += sq[pix1[4] - pix2[4]];
 271         s += sq[pix1[5] - pix2[5]];
 272         s += sq[pix1[6] - pix2[6]];
 273         s += sq[pix1[7] - pix2[7]];
 274         pix1 += line_size;
 275         pix2 += line_size;
 276     }
 277     return s;
 278 }
 279
 280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 281 {
 282     int s, i;
 283     uint32_t *sq = ff_squareTbl + 256;
 284
 285     s = 0;
 286     for (i = 0; i < h; i++) {
 287         s += sq[pix1[ 0] - pix2[ 0]];
 288         s += sq[pix1[ 1] - pix2[ 1]];
 289         s += sq[pix1[ 2] - pix2[ 2]];
 290         s += sq[pix1[ 3] - pix2[ 3]];
 291         s += sq[pix1[ 4] - pix2[ 4]];
 292         s += sq[pix1[ 5] - pix2[ 5]];
 293         s += sq[pix1[ 6] - pix2[ 6]];
 294         s += sq[pix1[ 7] - pix2[ 7]];
 295         s += sq[pix1[ 8] - pix2[ 8]];
 296         s += sq[pix1[ 9] - pix2[ 9]];
 297         s += sq[pix1[10] - pix2[10]];
 298         s += sq[pix1[11] - pix2[11]];
 299         s += sq[pix1[12] - pix2[12]];
 300         s += sq[pix1[13] - pix2[13]];
 301         s += sq[pix1[14] - pix2[14]];
 302         s += sq[pix1[15] - pix2[15]];
 303
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307     return s;
 308 }
 309
 310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 311 {
 312     int i;
 313
 314     /* read the pixels */
 315     for(i=0;i<8;i++) {
 316         block[0] = pixels[0];
 317         block[1] = pixels[1];
 318         block[2] = pixels[2];
 319         block[3] = pixels[3];
 320         block[4] = pixels[4];
 321         block[5] = pixels[5];
 322         block[6] = pixels[6];
 323         block[7] = pixels[7];
 324         pixels += line_size;
 325         block += 8;
 326     }
 327 }
 328
 329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 330                           const uint8_t *s2, int stride){
 331     int i;
 332
 333     /* read the pixels */
 334     for(i=0;i<8;i++) {
 335         block[0] = s1[0] - s2[0];
 336         block[1] = s1[1] - s2[1];
 337         block[2] = s1[2] - s2[2];
 338         block[3] = s1[3] - s2[3];
 339         block[4] = s1[4] - s2[4];
 340         block[5] = s1[5] - s2[5];
 341         block[6] = s1[6] - s2[6];
 342         block[7] = s1[7] - s2[7];
 343         s1 += stride;
 344         s2 += stride;
 345         block += 8;
 346     }
 347 }
 348
 349
 350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 351                              int line_size)
 352 {
 353     int i;
 354     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 355
 356     /* read the pixels */
 357     for(i=0;i<8;i++) {
 358         pixels[0] = cm[block[0]];
 359         pixels[1] = cm[block[1]];
 360         pixels[2] = cm[block[2]];
 361         pixels[3] = cm[block[3]];
 362         pixels[4] = cm[block[4]];
 363         pixels[5] = cm[block[5]];
 364         pixels[6] = cm[block[6]];
 365         pixels[7] = cm[block[7]];
 366
 367         pixels += line_size;
 368         block += 8;
 369     }
 370 }
 371
 372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 373                                  int line_size)
 374 {
 375     int i;
 376     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 377
 378     /* read the pixels */
 379     for(i=0;i<4;i++) {
 380         pixels[0] = cm[block[0]];
 381         pixels[1] = cm[block[1]];
 382         pixels[2] = cm[block[2]];
 383         pixels[3] = cm[block[3]];
 384
 385         pixels += line_size;
 386         block += 8;
 387     }
 388 }
 389
 390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 391                                  int line_size)
 392 {
 393     int i;
 394     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 395
 396     /* read the pixels */
 397     for(i=0;i<2;i++) {
 398         pixels[0] = cm[block[0]];
 399         pixels[1] = cm[block[1]];
 400
 401         pixels += line_size;
 402         block += 8;
 403     }
 404 }
 405
 406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 407                                     uint8_t *restrict pixels,
 408                                     int line_size)
 409 {
 410     int i, j;
 411
 412     for (i = 0; i < 8; i++) {
 413         for (j = 0; j < 8; j++) {
 414             if (*block < -128)
 415                 *pixels = 0;
 416             else if (*block > 127)
 417                 *pixels = 255;
 418             else
 419                 *pixels = (uint8_t)(*block + 128);
 420             block++;
 421             pixels++;
 422         }
 423         pixels += (line_size - 8);
 424     }
 425 }
 426
 427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 428                                     int line_size)
 429 {
 430     int i;
 431
 432     /* read the pixels */
 433     for(i=0;i<8;i++) {
 434         pixels[0] = block[0];
 435         pixels[1] = block[1];
 436         pixels[2] = block[2];
 437         pixels[3] = block[3];
 438         pixels[4] = block[4];
 439         pixels[5] = block[5];
 440         pixels[6] = block[6];
 441         pixels[7] = block[7];
 442
 443         pixels += line_size;
 444         block += 8;
 445     }
 446 }
 447
 448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 449                              int line_size)
 450 {
 451     int i;
 452     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 453
 454     /* read the pixels */
 455     for(i=0;i<8;i++) {
 456         pixels[0] = cm[pixels[0] + block[0]];
 457         pixels[1] = cm[pixels[1] + block[1]];
 458         pixels[2] = cm[pixels[2] + block[2]];
 459         pixels[3] = cm[pixels[3] + block[3]];
 460         pixels[4] = cm[pixels[4] + block[4]];
 461         pixels[5] = cm[pixels[5] + block[5]];
 462         pixels[6] = cm[pixels[6] + block[6]];
 463         pixels[7] = cm[pixels[7] + block[7]];
 464         pixels += line_size;
 465         block += 8;
 466     }
 467 }
 468
 469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 470                           int line_size)
 471 {
 472     int i;
 473     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 474
 475     /* read the pixels */
 476     for(i=0;i<4;i++) {
 477         pixels[0] = cm[pixels[0] + block[0]];
 478         pixels[1] = cm[pixels[1] + block[1]];
 479         pixels[2] = cm[pixels[2] + block[2]];
 480         pixels[3] = cm[pixels[3] + block[3]];
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 487                           int line_size)
 488 {
 489     int i;
 490     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 491
 492     /* read the pixels */
 493     for(i=0;i<2;i++) {
 494         pixels[0] = cm[pixels[0] + block[0]];
 495         pixels[1] = cm[pixels[1] + block[1]];
 496         pixels += line_size;
 497         block += 8;
 498     }
 499 }
 500
 501 static int sum_abs_dctelem_c(DCTELEM *block)
 502 {
 503     int sum=0, i;
 504     for(i=0; i<64; i++)
 505         sum+= FFABS(block[i]);
 506     return sum;
 507 }
 508
 509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 510 {
 511     int i;
 512
 513     for (i = 0; i < h; i++) {
 514         memset(block, value, 16);
 515         block += line_size;
 516     }
 517 }
 518
 519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 520 {
 521     int i;
 522
 523     for (i = 0; i < h; i++) {
 524         memset(block, value, 8);
 525         block += line_size;
 526     }
 527 }
 528
 529 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 530 {
 531     int i, j;
 532     uint16_t *dst1 = (uint16_t *) dst;
 533     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 534
 535     for (j = 0; j < 8; j++) {
 536         for (i = 0; i < 8; i++) {
 537             dst1[i] = dst2[i] = src[i] * 0x0101;
 538         }
 539         src  += 8;
 540         dst1 += linesize;
 541         dst2 += linesize;
 542     }
 543 }
 544
 545 #define avg2(a,b) ((a+b+1)>>1)
 546 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 547
 548 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 549 {
 550     const int A=(16-x16)*(16-y16);
 551     const int B=(   x16)*(16-y16);
 552     const int C=(16-x16)*(   y16);
 553     const int D=(   x16)*(   y16);
 554     int i;
 555
 556     for(i=0; i<h; i++)
 557     {
 558         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 559         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 560         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 561         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 562         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 563         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 564         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 565         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 566         dst+= stride;
 567         src+= stride;
 568     }
 569 }
 570
 571 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 572                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 573 {
 574     int y, vx, vy;
 575     const int s= 1<<shift;
 576
 577     width--;
 578     height--;
 579
 580     for(y=0; y<h; y++){
 581         int x;
 582
 583         vx= ox;
 584         vy= oy;
 585         for(x=0; x<8; x++){ //XXX FIXME optimize
 586             int src_x, src_y, frac_x, frac_y, index;
 587
 588             src_x= vx>>16;
 589             src_y= vy>>16;
 590             frac_x= src_x&(s-1);
 591             frac_y= src_y&(s-1);
 592             src_x>>=shift;
 593             src_y>>=shift;
 594
 595             if((unsigned)src_x < width){
 596                 if((unsigned)src_y < height){
 597                     index= src_x + src_y*stride;
 598                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 599                                            + src[index       +1]*   frac_x )*(s-frac_y)
 600                                         + (  src[index+stride  ]*(s-frac_x)
 601                                            + src[index+stride+1]*   frac_x )*   frac_y
 602                                         + r)>>(shift*2);
 603                 }else{
 604                     index= src_x + av_clip(src_y, 0, height)*stride;
 605                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 606                                           + src[index       +1]*   frac_x )*s
 607                                         + r)>>(shift*2);
 608                 }
 609             }else{
 610                 if((unsigned)src_y < height){
 611                     index= av_clip(src_x, 0, width) + src_y*stride;
 612                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 613                                            + src[index+stride  ]*   frac_y )*s
 614                                         + r)>>(shift*2);
 615                 }else{
 616                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 617                     dst[y*stride + x]=    src[index         ];
 618                 }
 619             }
 620
 621             vx+= dxx;
 622             vy+= dyx;
 623         }
 624         ox += dxy;
 625         oy += dyy;
 626     }
 627 }
 628
 629 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 630     switch(width){
 631     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 632     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 633     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 634     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 635     }
 636 }
 637
 638 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 639     int i,j;
 640     for (i=0; i < height; i++) {
 641       for (j=0; j < width; j++) {
 642         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 643       }
 644       src += stride;
 645       dst += stride;
 646     }
 647 }
 648
 649 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 650     int i,j;
 651     for (i=0; i < height; i++) {
 652       for (j=0; j < width; j++) {
 653         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 654       }
 655       src += stride;
 656       dst += stride;
 657     }
 658 }
 659
 660 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 661     int i,j;
 662     for (i=0; i < height; i++) {
 663       for (j=0; j < width; j++) {
 664         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 665       }
 666       src += stride;
 667       dst += stride;
 668     }
 669 }
 670
 671 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 672     int i,j;
 673     for (i=0; i < height; i++) {
 674       for (j=0; j < width; j++) {
 675         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 676       }
 677       src += stride;
 678       dst += stride;
 679     }
 680 }
 681
 682 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 683     int i,j;
 684     for (i=0; i < height; i++) {
 685       for (j=0; j < width; j++) {
 686         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 687       }
 688       src += stride;
 689       dst += stride;
 690     }
 691 }
 692
 693 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 694     int i,j;
 695     for (i=0; i < height; i++) {
 696       for (j=0; j < width; j++) {
 697         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 698       }
 699       src += stride;
 700       dst += stride;
 701     }
 702 }
 703
 704 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 705     int i,j;
 706     for (i=0; i < height; i++) {
 707       for (j=0; j < width; j++) {
 708         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 709       }
 710       src += stride;
 711       dst += stride;
 712     }
 713 }
 714
 715 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 716     int i,j;
 717     for (i=0; i < height; i++) {
 718       for (j=0; j < width; j++) {
 719         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 720       }
 721       src += stride;
 722       dst += stride;
 723     }
 724 }
 725
 726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 727     switch(width){
 728     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 729     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 730     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 731     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 732     }
 733 }
 734
 735 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 736     int i,j;
 737     for (i=0; i < height; i++) {
 738       for (j=0; j < width; j++) {
 739         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 740       }
 741       src += stride;
 742       dst += stride;
 743     }
 744 }
 745
 746 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 747     int i,j;
 748     for (i=0; i < height; i++) {
 749       for (j=0; j < width; j++) {
 750         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 751       }
 752       src += stride;
 753       dst += stride;
 754     }
 755 }
 756
 757 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 758     int i,j;
 759     for (i=0; i < height; i++) {
 760       for (j=0; j < width; j++) {
 761         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 762       }
 763       src += stride;
 764       dst += stride;
 765     }
 766 }
 767
 768 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 769     int i,j;
 770     for (i=0; i < height; i++) {
 771       for (j=0; j < width; j++) {
 772         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 773       }
 774       src += stride;
 775       dst += stride;
 776     }
 777 }
 778
 779 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 780     int i,j;
 781     for (i=0; i < height; i++) {
 782       for (j=0; j < width; j++) {
 783         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 784       }
 785       src += stride;
 786       dst += stride;
 787     }
 788 }
 789
 790 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 791     int i,j;
 792     for (i=0; i < height; i++) {
 793       for (j=0; j < width; j++) {
 794         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 795       }
 796       src += stride;
 797       dst += stride;
 798     }
 799 }
 800
 801 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 802     int i,j;
 803     for (i=0; i < height; i++) {
 804       for (j=0; j < width; j++) {
 805         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 806       }
 807       src += stride;
 808       dst += stride;
 809     }
 810 }
 811
 812 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 813     int i,j;
 814     for (i=0; i < height; i++) {
 815       for (j=0; j < width; j++) {
 816         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 817       }
 818       src += stride;
 819       dst += stride;
 820     }
 821 }
 822
 823 #define QPEL_MC(r, OPNAME, RND, OP) \
 824 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 825     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 826     int i;\
 827     for(i=0; i<h; i++)\
 828     {\
 829         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 830         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 831         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 832         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 833         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 834         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 835         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 836         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 837         dst+=dstStride;\
 838         src+=srcStride;\
 839     }\
 840 }\
 841 \
 842 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 843     const int w=8;\
 844     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 845     int i;\
 846     for(i=0; i<w; i++)\
 847     {\
 848         const int src0= src[0*srcStride];\
 849         const int src1= src[1*srcStride];\
 850         const int src2= src[2*srcStride];\
 851         const int src3= src[3*srcStride];\
 852         const int src4= src[4*srcStride];\
 853         const int src5= src[5*srcStride];\
 854         const int src6= src[6*srcStride];\
 855         const int src7= src[7*srcStride];\
 856         const int src8= src[8*srcStride];\
 857         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 858         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 859         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 860         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 861         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 862         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 863         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 864         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 865         dst++;\
 866         src++;\
 867     }\
 868 }\
 869 \
 870 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 871     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 872     int i;\
 873     \
 874     for(i=0; i<h; i++)\
 875     {\
 876         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 877         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 878         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 879         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 880         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 881         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 882         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 883         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 884         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 885         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 886         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 887         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 888         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 889         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 890         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 891         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 892         dst+=dstStride;\
 893         src+=srcStride;\
 894     }\
 895 }\
 896 \
 897 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 898     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 899     int i;\
 900     const int w=16;\
 901     for(i=0; i<w; i++)\
 902     {\
 903         const int src0= src[0*srcStride];\
 904         const int src1= src[1*srcStride];\
 905         const int src2= src[2*srcStride];\
 906         const int src3= src[3*srcStride];\
 907         const int src4= src[4*srcStride];\
 908         const int src5= src[5*srcStride];\
 909         const int src6= src[6*srcStride];\
 910         const int src7= src[7*srcStride];\
 911         const int src8= src[8*srcStride];\
 912         const int src9= src[9*srcStride];\
 913         const int src10= src[10*srcStride];\
 914         const int src11= src[11*srcStride];\
 915         const int src12= src[12*srcStride];\
 916         const int src13= src[13*srcStride];\
 917         const int src14= src[14*srcStride];\
 918         const int src15= src[15*srcStride];\
 919         const int src16= src[16*srcStride];\
 920         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 921         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 922         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 923         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 924         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 925         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 926         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 927         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 928         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 929         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 930         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 931         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 932         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 933         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 934         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 935         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 936         dst++;\
 937         src++;\
 938     }\
 939 }\
 940 \
 941 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 942     uint8_t half[64];\
 943     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 944     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 945 }\
 946 \
 947 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 948     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 949 }\
 950 \
 951 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 952     uint8_t half[64];\
 953     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 954     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 955 }\
 956 \
 957 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 958     uint8_t full[16*9];\
 959     uint8_t half[64];\
 960     copy_block9(full, src, 16, stride, 9);\
 961     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 962     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 963 }\
 964 \
 965 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 966     uint8_t full[16*9];\
 967     copy_block9(full, src, 16, stride, 9);\
 968     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 969 }\
 970 \
 971 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 972     uint8_t full[16*9];\
 973     uint8_t half[64];\
 974     copy_block9(full, src, 16, stride, 9);\
 975     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 976     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 977 }\
 978 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 979     uint8_t full[16*9];\
 980     uint8_t halfH[72];\
 981     uint8_t halfV[64];\
 982     uint8_t halfHV[64];\
 983     copy_block9(full, src, 16, stride, 9);\
 984     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 985     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 986     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 987     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 988 }\
 989 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 990     uint8_t full[16*9];\
 991     uint8_t halfH[72];\
 992     uint8_t halfHV[64];\
 993     copy_block9(full, src, 16, stride, 9);\
 994     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 995     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 996     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 997     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 998 }\
 999 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000     uint8_t full[16*9];\
1001     uint8_t halfH[72];\
1002     uint8_t halfV[64];\
1003     uint8_t halfHV[64];\
1004     copy_block9(full, src, 16, stride, 9);\
1005     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1007     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 }\
1010 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1011     uint8_t full[16*9];\
1012     uint8_t halfH[72];\
1013     uint8_t halfHV[64];\
1014     copy_block9(full, src, 16, stride, 9);\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 }\
1020 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021     uint8_t full[16*9];\
1022     uint8_t halfH[72];\
1023     uint8_t halfV[64];\
1024     uint8_t halfHV[64];\
1025     copy_block9(full, src, 16, stride, 9);\
1026     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 }\
1031 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1032     uint8_t full[16*9];\
1033     uint8_t halfH[72];\
1034     uint8_t halfHV[64];\
1035     copy_block9(full, src, 16, stride, 9);\
1036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1040 }\
1041 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042     uint8_t full[16*9];\
1043     uint8_t halfH[72];\
1044     uint8_t halfV[64];\
1045     uint8_t halfHV[64];\
1046     copy_block9(full, src, 16, stride, 9);\
1047     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1048     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1049     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 }\
1052 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1053     uint8_t full[16*9];\
1054     uint8_t halfH[72];\
1055     uint8_t halfHV[64];\
1056     copy_block9(full, src, 16, stride, 9);\
1057     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1059     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 }\
1062 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1063     uint8_t halfH[72];\
1064     uint8_t halfHV[64];\
1065     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1066     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1067     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1068 }\
1069 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1070     uint8_t halfH[72];\
1071     uint8_t halfHV[64];\
1072     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1073     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1075 }\
1076 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1077     uint8_t full[16*9];\
1078     uint8_t halfH[72];\
1079     uint8_t halfV[64];\
1080     uint8_t halfHV[64];\
1081     copy_block9(full, src, 16, stride, 9);\
1082     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1083     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1084     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1085     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1086 }\
1087 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1088     uint8_t full[16*9];\
1089     uint8_t halfH[72];\
1090     copy_block9(full, src, 16, stride, 9);\
1091     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1092     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1093     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1094 }\
1095 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1096     uint8_t full[16*9];\
1097     uint8_t halfH[72];\
1098     uint8_t halfV[64];\
1099     uint8_t halfHV[64];\
1100     copy_block9(full, src, 16, stride, 9);\
1101     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1102     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1103     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1104     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1105 }\
1106 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1107     uint8_t full[16*9];\
1108     uint8_t halfH[72];\
1109     copy_block9(full, src, 16, stride, 9);\
1110     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1111     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1112     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1113 }\
1114 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1115     uint8_t halfH[72];\
1116     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1117     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1118 }\
1119 \
1120 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1121     uint8_t half[256];\
1122     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1124 }\
1125 \
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1127     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1128 }\
1129 \
1130 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1131     uint8_t half[256];\
1132     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1133     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1134 }\
1135 \
1136 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1137     uint8_t full[24*17];\
1138     uint8_t half[256];\
1139     copy_block17(full, src, 24, stride, 17);\
1140     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1142 }\
1143 \
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1145     uint8_t full[24*17];\
1146     copy_block17(full, src, 24, stride, 17);\
1147     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1148 }\
1149 \
1150 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1151     uint8_t full[24*17];\
1152     uint8_t half[256];\
1153     copy_block17(full, src, 24, stride, 17);\
1154     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1155     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1156 }\
1157 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158     uint8_t full[24*17];\
1159     uint8_t halfH[272];\
1160     uint8_t halfV[256];\
1161     uint8_t halfHV[256];\
1162     copy_block17(full, src, 24, stride, 17);\
1163     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1165     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1167 }\
1168 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1169     uint8_t full[24*17];\
1170     uint8_t halfH[272];\
1171     uint8_t halfHV[256];\
1172     copy_block17(full, src, 24, stride, 17);\
1173     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1175     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1177 }\
1178 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179     uint8_t full[24*17];\
1180     uint8_t halfH[272];\
1181     uint8_t halfV[256];\
1182     uint8_t halfHV[256];\
1183     copy_block17(full, src, 24, stride, 17);\
1184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1190     uint8_t full[24*17];\
1191     uint8_t halfH[272];\
1192     uint8_t halfHV[256];\
1193     copy_block17(full, src, 24, stride, 17);\
1194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 }\
1199 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200     uint8_t full[24*17];\
1201     uint8_t halfH[272];\
1202     uint8_t halfV[256];\
1203     uint8_t halfHV[256];\
1204     copy_block17(full, src, 24, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 }\
1210 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1211     uint8_t full[24*17];\
1212     uint8_t halfH[272];\
1213     uint8_t halfHV[256];\
1214     copy_block17(full, src, 24, stride, 17);\
1215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1219 }\
1220 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221     uint8_t full[24*17];\
1222     uint8_t halfH[272];\
1223     uint8_t halfV[256];\
1224     uint8_t halfHV[256];\
1225     copy_block17(full, src, 24, stride, 17);\
1226     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1227     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1228     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 }\
1231 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1232     uint8_t full[24*17];\
1233     uint8_t halfH[272];\
1234     uint8_t halfHV[256];\
1235     copy_block17(full, src, 24, stride, 17);\
1236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1238     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 }\
1241 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1242     uint8_t halfH[272];\
1243     uint8_t halfHV[256];\
1244     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1245     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1246     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1247 }\
1248 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1249     uint8_t halfH[272];\
1250     uint8_t halfHV[256];\
1251     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1252     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1254 }\
1255 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1256     uint8_t full[24*17];\
1257     uint8_t halfH[272];\
1258     uint8_t halfV[256];\
1259     uint8_t halfHV[256];\
1260     copy_block17(full, src, 24, stride, 17);\
1261     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1262     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1263     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1264     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1265 }\
1266 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1267     uint8_t full[24*17];\
1268     uint8_t halfH[272];\
1269     copy_block17(full, src, 24, stride, 17);\
1270     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1271     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1272     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1273 }\
1274 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1275     uint8_t full[24*17];\
1276     uint8_t halfH[272];\
1277     uint8_t halfV[256];\
1278     uint8_t halfHV[256];\
1279     copy_block17(full, src, 24, stride, 17);\
1280     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1281     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1282     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1283     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1284 }\
1285 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1286     uint8_t full[24*17];\
1287     uint8_t halfH[272];\
1288     copy_block17(full, src, 24, stride, 17);\
1289     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1290     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1291     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1292 }\
1293 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1294     uint8_t halfH[272];\
1295     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1296     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1297 }
1298
/*
 * Store operators handed to QPEL_MC as its last argument.
 * `a` is the destination pixel and `b` the raw filter sum. The sum is biased,
 * shifted right by 5 and looked up through `cm`, which every generated
 * function declares as ff_cropTbl + MAX_NEG_CROP.
 *   op_put         bias 16, result replaces `a`
 *   op_put_no_rnd  bias 15, result replaces `a`
 *   op_avg         bias 16, then (a + result + 1) >> 1
 *   op_avg_no_rnd  bias 15, then (a + result) >> 1; not used by any
 *                  instantiation below
 * These expand to a bare assignment, so they are only usable as statements.
 */
1299 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1300 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1301 #define op_put(a, b) a = cm[((b) + 16)>>5]
1302 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1303
/*
 * Generate the put_, put_no_rnd_ and avg_ qpel function families.
 * The second argument is the name prefix, the third selects which put helper
 * is used for intermediates, the fourth is the store operator.
 * NOTE(review): the meaning of the first argument (0/1) is defined in the
 * QPEL_MC header, which is not visible here -- confirm before changing it.
 */
1304 QPEL_MC(0, put_       , _       , op_put)
1305 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1306 QPEL_MC(0, avg_       , _       , op_avg)
1307 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meaningful inside the instantiations above. */
1308 #undef op_avg
1309 #undef op_avg_no_rnd
1310 #undef op_put
1311 #undef op_put_no_rnd
1312
/*
 * Alias the mc00 names of each qpel family to existing 8x8 / 16x16 block
 * put and avg routines (presumably plain full-pel copy / average -- their
 * definitions are outside this part of the file).
 * NOTE(review): the last alias targets ff_put_pixels16x16_8_c while the
 * other 16x16 alias targets ff_put_pixels16x16_c; confirm this is intended.
 */
1313 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
1314 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1315 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1316 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1317 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1318 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1319
1320 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1321     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1322     int i;
1323
1324     for(i=0; i<h; i++){
1325         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1326         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1327         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1328         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1329         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1330         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1331         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1332         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1333         dst+=dstStride;
1334         src+=srcStride;
1335     }
1336 }
1337
1338 #if CONFIG_RV40_DECODER
/* RV40 16x16 "mc33" put: forwards to put_pixels16_xy2_8_c for 16 rows. */
1339 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1340     put_pixels16_xy2_8_c(dst, src, stride, 16);
1341 }
/* RV40 16x16 "mc33" avg: forwards to avg_pixels16_xy2_8_c for 16 rows. */
1342 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1343     avg_pixels16_xy2_8_c(dst, src, stride, 16);
1344 }
/* RV40 8x8 "mc33" put: forwards to put_pixels8_xy2_8_c for 8 rows. */
1345 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1346     put_pixels8_xy2_8_c(dst, src, stride, 8);
1347 }
/* RV40 8x8 "mc33" avg: forwards to avg_pixels8_xy2_8_c for 8 rows. */
1348 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1349     avg_pixels8_xy2_8_c(dst, src, stride, 8);
1350 }
1351 #endif /* CONFIG_RV40_DECODER */
1352
1353 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1354     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1355     int i;
1356
1357     for(i=0; i<w; i++){
1358         const int src_1= src[ -srcStride];
1359         const int src0 = src[0          ];
1360         const int src1 = src[  srcStride];
1361         const int src2 = src[2*srcStride];
1362         const int src3 = src[3*srcStride];
1363         const int src4 = src[4*srcStride];
1364         const int src5 = src[5*srcStride];
1365         const int src6 = src[6*srcStride];
1366         const int src7 = src[7*srcStride];
1367         const int src8 = src[8*srcStride];
1368         const int src9 = src[9*srcStride];
1369         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1370         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1371         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1372         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1373         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1374         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1375         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1376         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1377         src++;
1378         dst++;
1379     }
1380 }
1381
/**
 * mspel 8x8 "mc10" put.
 * Runs the horizontal low-pass into a temporary 8x8 block, then hands src
 * and that block to put_pixels8_l2_8 (presumably a two-source average --
 * defined outside this part of the file).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1387
/* mspel 8x8 "mc20" put: horizontal low-pass of src written straight to dst, 8 rows. */
1388 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1389     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1390 }
1391
/**
 * mspel 8x8 "mc30" put.
 * Same as the mc10 variant except that the unfiltered input handed to
 * put_pixels8_l2_8 is taken one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1397
/* mspel 8x8 "mc02" put: vertical low-pass of src written straight to dst, 8 columns. */
1398 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1399     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1400 }
1401
/**
 * mspel 8x8 "mc12" put.
 * Builds two 8x8 intermediates and passes both to put_pixels8_l2_8:
 *  - the vertical low-pass of src, and
 *  - the vertical low-pass of the horizontally low-passed rows.
 * The horizontal pass starts one row above src and covers 11 rows, which is
 * what the vertical filter needs (one row above, two rows below the block).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11 * 8];   /* rows -1 .. 9 of the horizontal pass */
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* skip the leading row so the vertical filter sees it as row -1 */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "mc32" put.
 * Identical to the mc12 variant except that the purely vertical intermediate
 * is computed from src + 1 (one pixel to the right).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11 * 8];   /* rows -1 .. 9 of the horizontal pass */
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    /* skip the leading row so the vertical filter sees it as row -1 */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "mc22" put.
 * Horizontal low-pass over 11 rows (starting one row above src) into a
 * temporary, followed by a vertical low-pass of that temporary into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11 * 8];   /* rows -1 .. 9 of the horizontal pass */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* skip the leading row so the vertical filter sees it as row -1 */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1425
1426 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1427     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1428     int x;
1429     const int strength= ff_h263_loop_filter_strength[qscale];
1430
1431     for(x=0; x<8; x++){
1432         int d1, d2, ad1;
1433         int p0= src[x-2*stride];
1434         int p1= src[x-1*stride];
1435         int p2= src[x+0*stride];
1436         int p3= src[x+1*stride];
1437         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1438
1439         if     (d<-2*strength) d1= 0;
1440         else if(d<-  strength) d1=-2*strength - d;
1441         else if(d<   strength) d1= d;
1442         else if(d< 2*strength) d1= 2*strength - d;
1443         else                   d1= 0;
1444
1445         p1 += d1;
1446         p2 -= d1;
1447         if(p1&256) p1= ~(p1>>31);
1448         if(p2&256) p2= ~(p2>>31);
1449
1450         src[x-1*stride] = p1;
1451         src[x+0*stride] = p2;
1452
1453         ad1= FFABS(d1)>>1;
1454
1455         d2= av_clip((p0-p3)/4, -ad1, ad1);
1456
1457         src[x-2*stride] = p0 - d2;
1458         src[x+  stride] = p3 + d2;
1459     }
1460     }
1461 }
1462
1463 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1464     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1465     int y;
1466     const int strength= ff_h263_loop_filter_strength[qscale];
1467
1468     for(y=0; y<8; y++){
1469         int d1, d2, ad1;
1470         int p0= src[y*stride-2];
1471         int p1= src[y*stride-1];
1472         int p2= src[y*stride+0];
1473         int p3= src[y*stride+1];
1474         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1475
1476         if     (d<-2*strength) d1= 0;
1477         else if(d<-  strength) d1=-2*strength - d;
1478         else if(d<   strength) d1= d;
1479         else if(d< 2*strength) d1= 2*strength - d;
1480         else                   d1= 0;
1481
1482         p1 += d1;
1483         p2 -= d1;
1484         if(p1&256) p1= ~(p1>>31);
1485         if(p2&256) p2= ~(p2>>31);
1486
1487         src[y*stride-1] = p1;
1488         src[y*stride+0] = p2;
1489
1490         ad1= FFABS(d1)>>1;
1491
1492         d2= av_clip((p0-p3)/4, -ad1, ad1);
1493
1494         src[y*stride-2] = p0 - d2;
1495         src[y*stride+1] = p3 + d2;
1496     }
1497     }
1498 }
1499
/**
 * H.261 loop filter: in-place separable 1-2-1 smoothing of an 8x8 block.
 *
 * A vertical pass fills a 64-entry scratch buffer, then a horizontal pass
 * writes the result back. The first/last row skip the vertical pass and the
 * first/last column skip the horizontal pass; they are scaled by 4 instead
 * so that every sample carries the same gain before the final shift.
 *
 * @param src    top-left sample of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* top and bottom rows: no vertical neighbours are used */
    for (col = 0; col < 8; col++) {
        vert[col]      = 4 * src[col];
        vert[56 + col] = 4 * src[7 * stride + col];
    }

    /* interior rows: vertical 1-2-1 */
    for (row = 1; row < 7; row++) {
        const uint8_t *line = src + row * stride;
        int *out = vert + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = line[col - stride] + 2 * line[col] + line[col + stride];
    }

    /* horizontal pass, writing back with rounding */
    for (row = 0; row < 8; row++) {
        const int *in = vert + row * 8;
        uint8_t *line = src + row * stride;

        /* edge columns only carry the vertical gain of 4 */
        line[0] = (in[0] + 2) >> 2;
        line[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
1526
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows to compare
 * @return sum over all rows and the 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1554
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * horizontal two-sample combination avg2(pix2[x], pix2[x+1]) of a reference.
 * avg2 is defined elsewhere in this file (presumably the rounded mean).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 17 samples per row are read (pix2[0..16])
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1582
/**
 * 16-pixel-wide SAD of pix1 against the avg2() of each pix2 pixel and the
 * pixel one row (line_size) below it.
 *
 * Reads h + 1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the block being matched
 * @param pix2      top-left pixel of the reference area
 * @param line_size offset added to the pointers to step to the next row
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1612
/**
 * 16-pixel-wide SAD of pix1 against the avg4() of the 2x2 group of pix2
 * pixels starting at the same position (pixel, right, below, below-right).
 *
 * Reads 17 pixels per row and h + 1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the block being matched
 * @param pix2      top-left pixel of the reference area
 * @param line_size offset added to the pointers to step to the next row
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1642
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size offset added to both pointers to step to the next row
 * @param h         number of rows to compare
 * @return sum of |pix1 - pix2| over all 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1662
/**
 * 8-pixel-wide SAD of pix1 against the avg2() of each pix2 pixel and its
 * right-hand neighbour.
 *
 * Reads 9 pixels per row from pix2 (columns 0..8).
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the block being matched
 * @param pix2      top-left pixel of the reference area
 * @param line_size offset added to both pointers to step to the next row
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1682
/**
 * 8-pixel-wide SAD of pix1 against the avg2() of each pix2 pixel and the
 * pixel one row (line_size) below it.
 *
 * Reads h + 1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the block being matched
 * @param pix2      top-left pixel of the reference area
 * @param line_size offset added to the pointers to step to the next row
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1704
/**
 * 8-pixel-wide SAD of pix1 against the avg4() of the 2x2 group of pix2
 * pixels starting at the same position (pixel, right, below, below-right).
 *
 * Reads 9 pixels per row and h + 1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      top-left pixel of the block being matched
 * @param pix2      top-left pixel of the reference area
 * @param line_size offset added to the pointers to step to the next row
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1726
1727 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1728     MpegEncContext *c = v;
1729     int score1=0;
1730     int score2=0;
1731     int x,y;
1732
1733     for(y=0; y<h; y++){
1734         for(x=0; x<16; x++){
1735             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1736         }
1737         if(y+1<h){
1738             for(x=0; x<15; x++){
1739                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1740                              - s1[x+1] + s1[x+1+stride])
1741                         -FFABS(  s2[x  ] - s2[x  +stride]
1742                              - s2[x+1] + s2[x+1+stride]);
1743             }
1744         }
1745         s1+= stride;
1746         s2+= stride;
1747     }
1748
1749     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1750     else  return score1 + FFABS(score2)*8;
1751 }
1752
1753 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1754     MpegEncContext *c = v;
1755     int score1=0;
1756     int score2=0;
1757     int x,y;
1758
1759     for(y=0; y<h; y++){
1760         for(x=0; x<8; x++){
1761             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1762         }
1763         if(y+1<h){
1764             for(x=0; x<7; x++){
1765                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1766                              - s1[x+1] + s1[x+1+stride])
1767                         -FFABS(  s2[x  ] - s2[x  +stride]
1768                              - s2[x+1] + s2[x+1+stride]);
1769             }
1770         }
1771         s1+= stride;
1772         s2+= stride;
1773     }
1774
1775     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1776     else  return score1 + FFABS(score2)*8;
1777 }
1778
1779 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1780     int i;
1781     unsigned int sum=0;
1782
1783     for(i=0; i<8*8; i++){
1784         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1785         int w= weight[i];
1786         b>>= RECON_SHIFT;
1787         assert(-512<b && b<512);
1788
1789         sum += (w*b)*(w*b)>>4;
1790     }
1791     return sum>>2;
1792 }
1793
1794 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1795     int i;
1796
1797     for(i=0; i<8*8; i++){
1798         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1799     }
1800 }
1801
1802 /**
1803  * permutes an 8x8 block.
1804  * @param block the block which will be permuted according to the given permutation vector
1805  * @param permutation the permutation vector
1806  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1807  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1808  *                  (inverse) permutated to scantable order!
1809  */
1810 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1811 {
1812     int i;
1813     DCTELEM temp[64];
1814
1815     if(last<=0) return;
1816     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1817
1818     for(i=0; i<=last; i++){
1819         const int j= scantable[i];
1820         temp[j]= block[j];
1821         block[j]=0;
1822     }
1823
1824     for(i=0; i<=last; i++){
1825         const int j= scantable[i];
1826         const int perm_j= permutation[j];
1827         block[perm_j]= temp[j];
1828     }
1829 }
1830
/**
 * Comparison function that ignores all of its arguments and always scores 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1834
1835 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1836     int i;
1837
1838     memset(cmp, 0, sizeof(void*)*6);
1839
1840     for(i=0; i<6; i++){
1841         switch(type&0xFF){
1842         case FF_CMP_SAD:
1843             cmp[i]= c->sad[i];
1844             break;
1845         case FF_CMP_SATD:
1846             cmp[i]= c->hadamard8_diff[i];
1847             break;
1848         case FF_CMP_SSE:
1849             cmp[i]= c->sse[i];
1850             break;
1851         case FF_CMP_DCT:
1852             cmp[i]= c->dct_sad[i];
1853             break;
1854         case FF_CMP_DCT264:
1855             cmp[i]= c->dct264_sad[i];
1856             break;
1857         case FF_CMP_DCTMAX:
1858             cmp[i]= c->dct_max[i];
1859             break;
1860         case FF_CMP_PSNR:
1861             cmp[i]= c->quant_psnr[i];
1862             break;
1863         case FF_CMP_BIT:
1864             cmp[i]= c->bit[i];
1865             break;
1866         case FF_CMP_RD:
1867             cmp[i]= c->rd[i];
1868             break;
1869         case FF_CMP_VSAD:
1870             cmp[i]= c->vsad[i];
1871             break;
1872         case FF_CMP_VSSE:
1873             cmp[i]= c->vsse[i];
1874             break;
1875         case FF_CMP_ZERO:
1876             cmp[i]= zero_cmp;
1877             break;
1878         case FF_CMP_NSSE:
1879             cmp[i]= c->nsse[i];
1880             break;
1881 #if CONFIG_DWT
1882         case FF_CMP_W53:
1883             cmp[i]= c->w53[i];
1884             break;
1885         case FF_CMP_W97:
1886             cmp[i]= c->w97[i];
1887             break;
1888 #endif
1889         default:
1890             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1891         }
1892     }
1893 }
1894
/**
 * Add src to dst byte by byte, in place: dst[i] = (dst[i] + src[i]) mod 256.
 *
 * Whole longs are processed first: the low 7 bits of every byte are added
 * without carrying into the neighbouring byte, and the top bit of each byte
 * is then fixed up with an XOR. Remaining bytes are handled one at a time.
 *
 * @param dst destination, also the first operand
 * @param src second operand
 * @param w   number of bytes to process; values <= 0 do nothing
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* 0x7f and 0x80 replicated into every byte of a long (same values as the
     * file-level pb_7f / pb_80) */
    const unsigned long low7  = ~0UL/255 * 0x7f;
    const unsigned long high1 = ~0UL/255 * 0x80;
    long i;

    /* sizeof yields an unsigned type: without the cast, w < sizeof(long)
     * wraps the bound to a huge value and the loop runs out of bounds */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&low7) + (b&low7)) ^ ((a^b)&high1);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1905
/**
 * Byte-wise sum of two buffers: dst[i] = (src1[i] + src2[i]) mod 256.
 *
 * Whole longs are processed first: the low 7 bits of every byte are added
 * without carrying into the neighbouring byte, and the top bit of each byte
 * is then fixed up with an XOR. Remaining bytes are handled one at a time.
 *
 * @param dst  destination
 * @param src1 first operand
 * @param src2 second operand
 * @param w    number of bytes to process; values <= 0 do nothing
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f and 0x80 replicated into every byte of a long (same values as the
     * file-level pb_7f / pb_80) */
    const unsigned long low7  = ~0UL/255 * 0x7f;
    const unsigned long high1 = ~0UL/255 * 0x80;
    long i;

    /* sizeof yields an unsigned type: without the cast, w < sizeof(long)
     * wraps the bound to a huge value and the loop runs out of bounds */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&low7) + (b&low7)) ^ ((a^b)&high1);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
1916
/**
 * Byte-wise difference of two buffers: dst[i] = (src1[i] - src2[i]) mod 256.
 *
 * When HAVE_FAST_UNALIGNED is not set and src2 is not long-aligned, the bulk
 * is processed eight bytes at a time with plain byte operations. Otherwise
 * whole longs are processed: the top bit of every src1 byte is forced on and
 * only the low 7 bits of src2 are subtracted, so no borrow crosses a byte
 * boundary; the top bit of each byte is then fixed up with an XOR.
 * Remaining bytes are handled one at a time in both cases.
 *
 * @param dst  destination
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes to process; values <= 0 do nothing
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f and 0x80 replicated into every byte of a long (same values as the
     * file-level pb_7f / pb_80) */
    const unsigned long low7  = ~0UL/255 * 0x7f;
    const unsigned long high1 = ~0UL/255 * 0x80;
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* sizeof yields an unsigned type: without the cast, w < sizeof(long)
     * wraps the bound to a huge value and the loop runs out of bounds */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|high1) - (b&low7)) ^ ((a^b^high1)&high1);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1941
/**
 * Reconstruct a row from residuals using median prediction.
 *
 * For every position the predictor is mid_pred() of the previous output
 * (left), src1[i] and ((left + src1[i] - left_top) & 0xFF); diff[i] is added
 * to it and the result, truncated to 8 bits, is both stored in dst[i] and
 * carried on as the next left value. src1[i] becomes the next left_top.
 *
 * @param dst      output row
 * @param src1     reference row supplying the second predictor input
 * @param diff     residuals to add
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed sample
 * @param left_top in: initial left_top value; out: last src1 sample read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top = src1[x];

        cur     = mid_pred(cur, top, (cur + top - topleft)&0xFF) + diff[x];
        topleft = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = topleft;
}
1958
/**
 * Compute median-prediction residuals for a row.
 *
 * For every position the predictor is mid_pred() of the previous src2 sample
 * (left), src1[i] and ((left + src1[i] - left_top) & 0xFF); dst[i] receives
 * src2[i] minus that predictor. src2[i] becomes the next left value and
 * src1[i] the next left_top.
 *
 * @param dst      output residuals
 * @param src1     reference row supplying the second predictor input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 sample read
 * @param left_top in: initial left_top value; out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int     pred = mid_pred(cur, top, (cur + top - topleft)&0xFF);

        topleft = top;
        cur     = src2[x];
        dst[x]  = cur - pred;
    }

    *left     = cur;
    *left_top = topleft;
}
1976
/**
 * Running sum (left prediction) over a row of bytes.
 *
 * Each src[i] is added to the accumulator and the accumulator, truncated to
 * 8 bits, is stored in dst[i].
 *
 * @param dst output row
 * @param src input deltas
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the final accumulator value (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1995
/* Byte offset of each channel inside a 4-byte pixel, by host endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running sum (left prediction) over a row of 4-byte pixels, one accumulator
 * per channel.
 *
 * For every pixel each channel byte of src is added to its accumulator and
 * the accumulator, truncated to 8 bits, is stored in the same channel of dst.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   input deltas, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value (not truncated)
 * @param green in/out accumulator for green
 * @param blue  in/out accumulator for blue
 * @param alpha in/out accumulator for alpha
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2036
/* o1 = i1 + i2 and o2 = i1 - i2.
 * Expands to two statements and evaluates both inputs twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1 = (i1) + (i2);\
    o2 = (i1) - (i2);

/* In place: x becomes x + y and y becomes the old x minus y.
 * The arguments must be lvalues that do not mention the names a or b. */
#define BUTTERFLY1(x,y) \
{\
    const int a = (x);\
    const int b = (y);\
    (x) = a + b;\
    (y) = a - b;\
}

/* |x + y| + |x - y|, without storing the sum and difference. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2051
/**
 * Sum of absolute transformed differences of an 8x8 block.
 *
 * The pixel difference src - dst is run through three stages of add/subtract
 * butterflies along each row and then along each column; the last column
 * stage is folded into BUTTERFLYA, which adds up the absolute values.
 *
 * @param s      not used by this implementation
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8 (asserted)
 * @return sum of the absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass: one row of differences per iteration */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        const uint8_t *const p = src + stride*i;
        const uint8_t *const q = dst + stride*i;
        int *const row = temp + 8*i;

        BUTTERFLY2(row[0], row[1], p[0]-q[0], p[1]-q[1]);
        BUTTERFLY2(row[2], row[3], p[2]-q[2], p[3]-q[3]);
        BUTTERFLY2(row[4], row[5], p[4]-q[4], p[5]-q[5]);
        BUTTERFLY2(row[6], row[7], p[6]-q[6], p[7]-q[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: one column per iteration, entries are 8 ints apart */
    for (i = 0; i < 8; i++) {
        int *const col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
2096
/**
 * Sum of absolute transform coefficients of a single 8x8 block, with the
 * absolute value of the first (mean) coefficient removed.
 *
 * The pixels are run through three stages of add/subtract butterflies along
 * each row and then along each column; the last column stage is folded into
 * BUTTERFLYA, which adds up the absolute values.
 *
 * @param s      not used by this implementation
 * @param src    block to transform
 * @param dummy  not used by this implementation
 * @param stride offset between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of the absolute coefficients minus |temp[0] + temp[32]|
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass: one row of pixels per iteration */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        const uint8_t *const p = src + stride*i;
        int *const row = temp + 8*i;

        BUTTERFLY2(row[0], row[1], p[0], p[1]);
        BUTTERFLY2(row[2], row[3], p[2], p[3]);
        BUTTERFLY2(row[4], row[5], p[4], p[5]);
        BUTTERFLY2(row[6], row[7], p[6], p[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: one column per iteration, entries are 8 ints apart */
    for (i = 0; i < 8; i++) {
        int *const col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2144
2145 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2146     MpegEncContext * const s= (MpegEncContext *)c;
2147     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2148
2149     assert(h==8);
2150
2151     s->dsp.diff_pixels(temp, src1, src2, stride);
2152     s->dsp.fdct(temp);
2153     return s->dsp.sum_abs_dctelem(temp);
2154 }
2155
#if CONFIG_GPL
/*
 * One 8-point 1-D transform step built from adds, subtracts and shifts only.
 * The user supplies SRC(x) to read input x and DST(x, v) to consume output x;
 * all eight inputs are read into locals before any DST() is evaluated, so the
 * transform can be applied in place.
 * NOTE(review): presumably the H.264-style 8x8 integer transform, judging by
 * the name of the function below - confirm.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute DCT8_1D transform coefficients of the difference of two
 * 8x8 blocks.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride row offset passed on to diff_pixels
 * @param h      not used by this implementation
 * @return sum of the absolute values of all 64 transformed coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform every column, but instead of storing the
     * outputs accumulate their absolute values */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2208
2209 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2210     MpegEncContext * const s= (MpegEncContext *)c;
2211     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2212     int sum=0, i;
2213
2214     assert(h==8);
2215
2216     s->dsp.diff_pixels(temp, src1, src2, stride);
2217     s->dsp.fdct(temp);
2218
2219     for(i=0; i<64; i++)
2220         sum= FFMAX(sum, FFABS(temp[i]));
2221
2222     return sum;
2223 }
2224
2225 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2226     MpegEncContext * const s= (MpegEncContext *)c;
2227     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2228     DCTELEM * const bak = temp+64;
2229     int sum=0, i;
2230
2231     assert(h==8);
2232     s->mb_intra=0;
2233
2234     s->dsp.diff_pixels(temp, src1, src2, stride);
2235
2236     memcpy(bak, temp, 64*sizeof(DCTELEM));
2237
2238     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2239     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2240     ff_simple_idct(temp); //FIXME
2241
2242     for(i=0; i<64; i++)
2243         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2244
2245     return sum;
2246 }
2247
/**
 * Rate-distortion score of coding the difference of two 8x8 blocks.
 *
 * The difference is quantised, the bits needed for its coefficients are
 * counted from the context's VLC length tables, and the block is then
 * dequantised and added back onto a copy of src2 to measure the squared
 * error against a copy of src1.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext
 * @param src1   first block
 * @param src2   second block
 * @param stride row offset of src1 and src2
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    // work on local copies of both blocks; everything below addresses them with a row offset of 8
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    // the value written through &i is not used; i is reassigned before it is read
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    // intra blocks account for the first coefficient separately and start the scan at index 1
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        // every coefficient before the last one: count zero runs, look up non-zero levels
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                // bias by 64 so that levels in -64..63 map to 0..127; anything else costs esc_length
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        // the last coefficient uses the separate "last" length table
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    // reconstruct onto the copy of src2 and compare with the copy of src1
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2323
/**
 * Number of bits needed to code the quantised difference of two 8x8 blocks,
 * counted from the context's VLC length tables.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext
 * @param src1   first block
 * @param src2   second block
 * @param stride row offset passed on to diff_pixels
 * @param h      must be 8 (asserted)
 * @return accumulated bit count
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    // the value written through &i is not used; i is reassigned before it is read
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    // intra blocks account for the first coefficient separately and start the scan at index 1
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        // every coefficient before the last one: count zero runs, look up non-zero levels
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                // bias by 64 so that levels in -64..63 map to 0..127; anything else costs esc_length
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        // the last coefficient uses the separate "last" length table
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2382
/*
 * Defines vsad_intra<size>_c(): the sum, over rows 1..h-1 and columns
 * 0..size-1, of the absolute difference between each pixel and the pixel one
 * row (stride) below it. The c and dummy arguments are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                  \
    int x, y;                                       \
                                                    \
    for (y = 1; y < h; y++) {                       \
        const uint8_t *below = s + stride;          \
        for (x = 0; x < size; x++)                  \
            score += FFABS(s[x] - below[x]);        \
        s += stride;                                \
    }                                               \
                                                    \
    return score;                                   \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2400
/**
 * Vertical SAD of the difference between two 16-wide blocks.
 *
 * For rows 1..h-1 and 16 columns, accumulates the absolute value of
 * (s1 - s2) at a pixel minus (s1 - s2) at the pixel one row below.
 *
 * @param c      not used by this implementation
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows
 * @return accumulated score (0 when h <= 1)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        const uint8_t *next1 = s1 + stride;
        const uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - next1[col] + next2[col];
            score += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
2415
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Defines vsse_intra<size>_c(): the sum, over rows 1..h-1 and columns
 * 0..size-1, of the squared difference between each pixel and the pixel one
 * row (stride) below it. The c and dummy arguments are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                  \
    int x, y;                                       \
                                                    \
    for (y = 1; y < h; y++) {                       \
        const uint8_t *below = s + stride;          \
        for (x = 0; x < size; x++)                  \
            score += SQ(s[x] - below[x]);           \
        s += stride;                                \
    }                                               \
                                                    \
    return score;                                   \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2434
/**
 * Vertical SSE of the difference between two 16-wide blocks.
 *
 * For rows 1..h-1 and 16 columns, accumulates the square of
 * (s1 - s2) at a pixel minus (s1 - s2) at the pixel one row below.
 *
 * @param c      not used by this implementation
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows
 * @return accumulated score (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        const uint8_t *next1 = s1 + stride;
        const uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - next1[col] + next2[col];
            score += SQ(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
2449
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first array (8-bit signed samples)
 * @param pix2 second array (16-bit signed samples)
 * @param size number of samples to compare
 * @return sum over all samples of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int ssd = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        ssd += d * d;
    }
    return ssd;
}
2458
/*
 * Instantiate WRAPPER8_16_SQ once per 8x8 comparison function defined above;
 * the second argument names the 16-pixel counterpart being generated.
 * NOTE(review): the macro is defined outside this part of the file;
 * presumably it builds the larger function by calling the 8x8 one on each
 * 8x8 sub-block and summing the results - confirm against its definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2469
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src0[n] * src1[n];
        n++;
    }
}
2475
/**
 * Element-wise product of src0 with src1 read back to front.
 *
 * @param dst  output, dst[i] = src0[i] * src1[len - 1 - i]
 * @param src0 first input vector
 * @param src1 second input vector, traversed in reverse
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;
        dst[k] = src0[k] * src1[mirrored];
    }
}
2482
/**
 * Element-wise multiply-add of float vectors.
 *
 * @param dst  output, dst[i] = src0[i] * src1[i] + src2[i]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int pos = 0;

    for (; pos < len; pos++) {
        /* kept as one expression so the compiler sees the same operation */
        dst[pos] = src0[pos] * src1[pos] + src2[pos];
    }
}
2488
/**
 * Windowed overlap of two float vectors.
 *
 * With i running over [0, len) and j = 2*len - 1 - i, the function writes
 *   dst[i] = src0[i] * win[j] - src1[len - 1 - i] * win[i]
 *   dst[j] = src0[i] * win[i] + src1[len - 1 - i] * win[j]
 * so dst and win are accessed over 2*len elements, src0 and src1 over len.
 *
 * @param dst  output of 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements (read back to front)
 * @param win  window of 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    /* all three are addressed relative to their midpoint */
    float *const dst_mid        = dst  + len;
    const float *const win_mid  = win  + len;
    const float *const src0_end = src0 + len;
    int lo = -len;
    int hi = len - 1;

    while (lo < 0) {
        const float a    = src0_end[lo];
        const float b    = src1[hi];
        const float w_lo = win_mid[lo];
        const float w_hi = win_mid[hi];

        dst_mid[lo] = a*w_hi - b*w_lo;
        dst_mid[hi] = a*w_lo + b*w_hi;

        lo++;
        hi--;
    }
}
2505
/**
 * Multiply a float vector by a scalar.
 *
 * @param dst output, dst[i] = src[i] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
2513
/**
 * Multiply src by a sequence of 2-element vectors and by a scalar.
 *
 * Each consecutive pair of outputs uses the next pointer from sv:
 *   dst[2k + c] = src[2k + c] * sv[k][c] * mul, c in {0, 1}.
 * Two elements are written per step, so len is expected to be even.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos, vec;

    for (pos = 0, vec = 0; pos < len; pos += 2, vec++) {
        dst[pos    ] = src[pos    ] * sv[vec][0] * mul;
        dst[pos + 1] = src[pos + 1] * sv[vec][1] * mul;
    }
}
2523
/**
 * Multiply src by a sequence of 4-element vectors and by a scalar.
 *
 * Each consecutive group of four outputs uses the next pointer from sv:
 *   dst[4k + c] = src[4k + c] * sv[k][c] * mul, c in 0..3.
 * Four elements are written per step, so len is expected to be a multiple
 * of 4.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos, vec;

    for (pos = 0, vec = 0; pos < len; pos += 4, vec++) {
        dst[pos    ] = src[pos    ] * sv[vec][0] * mul;
        dst[pos + 1] = src[pos + 1] * sv[vec][1] * mul;
        dst[pos + 2] = src[pos + 2] * sv[vec][2] * mul;
        dst[pos + 3] = src[pos + 3] * sv[vec][3] * mul;
    }
}
2535
/**
 * Scale a sequence of 2-element vectors by a scalar into a flat output.
 *
 *   dst[2k + c] = sv[k][c] * mul, c in {0, 1}.
 * Two elements are written per step, so len is expected to be even.
 *
 * @param dst output vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of output elements
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos, vec;

    for (pos = 0, vec = 0; pos < len; pos += 2, vec++) {
        dst[pos    ] = sv[vec][0] * mul;
        dst[pos + 1] = sv[vec][1] * mul;
    }
}
2545
/**
 * Scale a sequence of 4-element vectors by a scalar into a flat output.
 *
 *   dst[4k + c] = sv[k][c] * mul, c in 0..3.
 * Four elements are written per step, so len is expected to be a multiple
 * of 4.
 *
 * @param dst output vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of output elements
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos, vec;

    for (pos = 0, vec = 0; pos < len; pos += 4, vec++) {
        dst[pos    ] = sv[vec][0] * mul;
        dst[pos + 1] = sv[vec][1] * mul;
        dst[pos + 2] = sv[vec][2] * mul;
        dst[pos + 3] = sv[vec][3] * mul;
    }
}
2557
/**
 * In-place butterfly of two non-overlapping float vectors.
 *
 * After the call v1[i] holds the old v1[i] + v2[i] and v2[i] holds the old
 * v1[i] - v2[i].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float diff = v1[n] - v2[n];

        v1[n] += v2[n];
        v2[n]  = diff;
    }
}
2568
/**
 * Dot product of two float vectors, accumulated in single precision in
 * index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[i] * v2[i]; 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n = 0;

    while (n < len) {
        acc += v1[n] * v2[n];
        n++;
    }

    return acc;
}
2579
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] where
 * min < 0 < max, using integer comparisons only.
 *
 * Assumes the IEEE-754 binary32 layout (sign in bit 31): negative values then
 * compare, as unsigned integers, above every positive value and grow with
 * their magnitude, so "a > mini" is true exactly for values below min.
 * Flipping the sign bit gives positive values the same property, so
 * "(a ^ sign) > maxisign" is true exactly for values above max.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of min (negative)
 * @param maxi     bit pattern of max (positive)
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max.
 *
 * The float bit patterns are moved to and from uint32_t with memcpy() rather
 * than by dereferencing casted pointers, which keeps the integer trick in
 * clipf_c_one() free of strict-aliasing violations.
 *
 * @param dst output vector (may be the same as src)
 * @param src input vector
 * @param min pointer to the lower bound, must be negative
 * @param max pointer to the upper bound, must be positive
 * @param len number of elements; processed in groups of 8, so it must be a
 *            multiple of 8
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        for(k=0; k<8; k++) {
            uint32_t bits;

            memcpy(&bits, &src[i + k], sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(&dst[i + k], &bits, sizeof(bits));
        }
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs the integer-based
 * vector_clipf_c_opposite_sign() is used; otherwise each element goes
 * through av_clipf().
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; processed in groups of 8, so it must be a
 *            multiple of 8
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2624
/**
 * Dot product of two int16_t vectors, with every product shifted right
 * before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each individual product
 * @return sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int sum = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        sum += (v1[k] * v2[k]) >> shift;

    return sum;
}
2634
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[i] += mul * v3[i].
 *
 * The product for element i uses the value of v1[i] from before its update.
 *
 * @param v1    first vector, modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2644
/**
 * Apply a symmetric window to an int16_t signal.
 *
 * Only the first len/2 window coefficients are read; coefficient i is applied
 * to both input[i] and input[len - 1 - i]. Each product is rounded by adding
 * 1 << 14 and shifting right by 15. For odd len the middle output element is
 * not written.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window first half of the window
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int k;

    for (k = 0; k < half; k++) {
        const unsigned int mirror = len - k - 1;
        const int16_t coef        = window[k];

        output[k]      = (MUL16(input[k],      coef) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coef) + (1 << 14)) >> 15;
    }
}
2657
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are handled in groups of 8 and the first group is processed
 * before len is tested. Because len is unsigned and reduced by 8 per group,
 * it must be a nonzero multiple of 8 for the loop to stop at the end of the
 * data.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements (nonzero multiple of 8)
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2673
/* Fixed-point cosine factors for the WMV2 IDCT routines that follow:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)); W0 is the plain scale 2048. */
2674 #define W0 2048
2675 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2676 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2677 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2678 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2679 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2680 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2681 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2682
2683 static void wmv2_idct_row(short * b)
2684 {
2685     int s1,s2;
2686     int a0,a1,a2,a3,a4,a5,a6,a7;
2687     /*step 1*/
2688     a1 = W1*b[1]+W7*b[7];
2689     a7 = W7*b[1]-W1*b[7];
2690     a5 = W5*b[5]+W3*b[3];
2691     a3 = W3*b[5]-W5*b[3];
2692     a2 = W2*b[2]+W6*b[6];
2693     a6 = W6*b[2]-W2*b[6];
2694     a0 = W0*b[0]+W0*b[4];
2695     a4 = W0*b[0]-W0*b[4];
2696     /*step 2*/
2697     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2698     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2699     /*step 3*/
2700     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2701     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2702     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2703     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2704     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2705     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2706     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2707     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2708 }
2709 static void wmv2_idct_col(short * b)
2710 {
2711     int s1,s2;
2712     int a0,a1,a2,a3,a4,a5,a6,a7;
2713     /*step 1, with extended precision*/
2714     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2715     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2716     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2717     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2718     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2719     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2720     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2721     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2722     /*step 2*/
2723     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2724     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2725     /*step 3*/
2726     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2727     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2728     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2729     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2730
2731     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2732     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2733     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2734     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2735 }
/**
 * In-place WMV2 inverse DCT of an 8x8 block: all eight rows are transformed
 * first, then all eight columns.
 *
 * @param block 64 coefficients, 8 per row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2746 /* XXX: these functions should be removed as soon as all IDCTs are
2747  converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * ff_put_pixels_clamped_c() together with dest and line_size.
 */
2748 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2749 {
2750     ff_wmv2_idct_c(block);
2751     ff_put_pixels_clamped_c(block, dest, line_size);
2752 }
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * ff_add_pixels_clamped_c() together with dest and line_size.
 */
2753 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2754 {
2755     ff_wmv2_idct_c(block);
2756     ff_add_pixels_clamped_c(block, dest, line_size);
2757 }
/**
 * Apply j_rev_dct() to block, then pass it to ff_put_pixels_clamped_c()
 * together with dest and line_size.
 */
2758 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2759 {
2760     j_rev_dct (block);
2761     ff_put_pixels_clamped_c(block, dest, line_size);
2762 }
/**
 * Apply j_rev_dct() to block, then pass it to ff_add_pixels_clamped_c()
 * together with dest and line_size.
 */
2763 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2764 {
2765     j_rev_dct (block);
2766     ff_add_pixels_clamped_c(block, dest, line_size);
2767 }
2768
/**
 * Apply j_rev_dct4() to block, then pass it to put_pixels_clamped4_c()
 * together with dest and line_size. Installed by dsputil_init() for
 * lowres == 1.
 */
2769 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2770 {
2771     j_rev_dct4 (block);
2772     put_pixels_clamped4_c(block, dest, line_size);
2773 }
/**
 * Apply j_rev_dct4() to block, then pass it to add_pixels_clamped4_c()
 * together with dest and line_size. Installed by dsputil_init() for
 * lowres == 1.
 */
2774 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2775 {
2776     j_rev_dct4 (block);
2777     add_pixels_clamped4_c(block, dest, line_size);
2778 }
2779
/**
 * Apply j_rev_dct2() to block, then pass it to put_pixels_clamped2_c()
 * together with dest and line_size. Installed by dsputil_init() for
 * lowres == 2.
 */
2780 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2781 {
2782     j_rev_dct2 (block);
2783     put_pixels_clamped2_c(block, dest, line_size);
2784 }
/**
 * Apply j_rev_dct2() to block, then pass it to add_pixels_clamped2_c()
 * together with dest and line_size. Installed by dsputil_init() for
 * lowres == 2.
 */
2785 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2786 {
2787     j_rev_dct2 (block);
2788     add_pixels_clamped2_c(block, dest, line_size);
2789 }
2790
2791 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2792 {
2793     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2794
2795     dest[0] = cm[(block[0] + 4)>>3];
2796 }
2797 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2798 {
2799     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2800
2801     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2802 }
2803
/* No-op that ignores all of its arguments; dsputil_init() installs it as the
 * default c->prefetch. */
2804 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2805
2806 /* init static data */
2807 av_cold void dsputil_static_init(void)
2808 {
2809     int i;
2810
2811     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2812     for(i=0;i<MAX_NEG_CROP;i++) {
2813         ff_cropTbl[i] = 0;
2814         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2815     }
2816
2817     for(i=0;i<512;i++) {
2818         ff_squareTbl[i] = (i - 256) * (i - 256);
2819     }
2820
2821     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2822 }
2823
2824 int ff_check_alignment(void){
2825     static int did_fail=0;
2826     LOCAL_ALIGNED_16(int, aligned);
2827
2828     if((intptr_t)&aligned & 15){
2829         if(!did_fail){
2830 #if HAVE_MMX || HAVE_ALTIVEC
2831             av_log(NULL, AV_LOG_ERROR,
2832                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2833                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2834                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2835                 "Do not report crashes to Libav developers.\n");
2836 #endif
2837             did_fail=1;
2838         }
2839         return -1;
2840     }
2841     return 0;
2842 }
2843
/**
 * Fill a DSPContext with function pointers.
 *
 * The steps, in order:
 *  - forward DCT selection from avctx->dct_algo (only with CONFIG_ENCODERS),
 *  - inverse DCT selection from avctx->lowres and avctx->idct_algo,
 *  - installation of the C implementations for all other entries,
 *  - selection of the bit-depth dependent functions (9/10-bit only for
 *    H.264, from avctx->bits_per_raw_sample),
 *  - calls to the platform-specific initializers, which receive the same
 *    context (presumably to replace entries with optimized versions),
 *  - fallbacks for the 2-tap qpel, rv30 and rv40 tables,
 *  - construction of c->idct_permutation from c->idct_permutation_type.
 *
 * @param c     context to fill
 * @param avctx codec context providing the selection parameters
 */
2844 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2845 {
2846     int i;
2847
2848     ff_check_alignment();
2849
         /* Forward DCT selection (encoder builds only). */
2850 #if CONFIG_ENCODERS
2851     if(avctx->dct_algo==FF_DCT_FASTINT) {
2852         c->fdct = fdct_ifast;
2853         c->fdct248 = fdct_ifast248;
2854     }
2855     else if(avctx->dct_algo==FF_DCT_FAAN) {
2856         c->fdct = ff_faandct;
2857         c->fdct248 = ff_faandct248;
2858     }
2859     else {
2860         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2861         c->fdct248 = ff_fdct248_islow;
2862     }
2863 #endif //CONFIG_ENCODERS
2864
         /* Inverse DCT selection: reduced-size transforms for lowres 1..3,
          * otherwise chosen by idct_algo. */
2865     if(avctx->lowres==1){
2866         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2867             c->idct_put= ff_jref_idct4_put;
2868             c->idct_add= ff_jref_idct4_add;
2869         }else{
2870             if (avctx->codec_id != CODEC_ID_H264) {
2871                 c->idct_put= ff_h264_lowres_idct_put_8_c;
2872                 c->idct_add= ff_h264_lowres_idct_add_8_c;
2873             } else {
2874                 switch (avctx->bits_per_raw_sample) {
2875                     case 9:
2876                         c->idct_put= ff_h264_lowres_idct_put_9_c;
2877                         c->idct_add= ff_h264_lowres_idct_add_9_c;
2878                         break;
2879                     case 10:
2880                         c->idct_put= ff_h264_lowres_idct_put_10_c;
2881                         c->idct_add= ff_h264_lowres_idct_add_10_c;
2882                         break;
2883                     default:
2884                         c->idct_put= ff_h264_lowres_idct_put_8_c;
2885                         c->idct_add= ff_h264_lowres_idct_add_8_c;
2886                 }
2887             }
2888         }
2889         c->idct    = j_rev_dct4;
2890         c->idct_permutation_type= FF_NO_IDCT_PERM;
2891     }else if(avctx->lowres==2){
2892         c->idct_put= ff_jref_idct2_put;
2893         c->idct_add= ff_jref_idct2_add;
2894         c->idct    = j_rev_dct2;
2895         c->idct_permutation_type= FF_NO_IDCT_PERM;
2896     }else if(avctx->lowres==3){
2897         c->idct_put= ff_jref_idct1_put;
2898         c->idct_add= ff_jref_idct1_add;
2899         c->idct    = j_rev_dct1;
2900         c->idct_permutation_type= FF_NO_IDCT_PERM;
2901     }else{
2902         if(avctx->idct_algo==FF_IDCT_INT){
2903             c->idct_put= ff_jref_idct_put;
2904             c->idct_add= ff_jref_idct_add;
2905             c->idct    = j_rev_dct;
2906             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2907         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2908                 avctx->idct_algo==FF_IDCT_VP3){
2909             c->idct_put= ff_vp3_idct_put_c;
2910             c->idct_add= ff_vp3_idct_add_c;
2911             c->idct    = ff_vp3_idct_c;
2912             c->idct_permutation_type= FF_NO_IDCT_PERM;
2913         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2914             c->idct_put= ff_wmv2_idct_put_c;
2915             c->idct_add= ff_wmv2_idct_add_c;
2916             c->idct    = ff_wmv2_idct_c;
2917             c->idct_permutation_type= FF_NO_IDCT_PERM;
2918         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2919             c->idct_put= ff_faanidct_put;
2920             c->idct_add= ff_faanidct_add;
2921             c->idct    = ff_faanidct;
2922             c->idct_permutation_type= FF_NO_IDCT_PERM;
2923         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
                 /* NOTE(review): only idct_put is set in this branch; idct and
                  * idct_add are left untouched -- confirm this is intended. */
2924             c->idct_put= ff_ea_idct_put_c;
2925             c->idct_permutation_type= FF_NO_IDCT_PERM;
2926         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2927             c->idct     = ff_bink_idct_c;
2928             c->idct_add = ff_bink_idct_add_c;
2929             c->idct_put = ff_bink_idct_put_c;
2930             c->idct_permutation_type = FF_NO_IDCT_PERM;
2931         }else{ //accurate/default
2932             c->idct_put= ff_simple_idct_put;
2933             c->idct_add= ff_simple_idct_add;
2934             c->idct    = ff_simple_idct;
2935             c->idct_permutation_type= FF_NO_IDCT_PERM;
2936         }
2937     }
2938
         /* Generic C implementations. */
2939     c->get_pixels = get_pixels_c;
2940     c->diff_pixels = diff_pixels_c;
2941     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2942     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2943     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2944     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2945     c->sum_abs_dctelem = sum_abs_dctelem_c;
2946     c->gmc1 = gmc1_c;
2947     c->gmc = ff_gmc_c;
2948     c->pix_sum = pix_sum_c;
2949     c->pix_norm1 = pix_norm1_c;
2950
2951     c->fill_block_tab[0] = fill_block16_c;
2952     c->fill_block_tab[1] = fill_block8_c;
2953     c->scale_block = scale_block_c;
2954
2955     /* TODO [0] 16  [1] 8 */
2956     c->pix_abs[0][0] = pix_abs16_c;
2957     c->pix_abs[0][1] = pix_abs16_x2_c;
2958     c->pix_abs[0][2] = pix_abs16_y2_c;
2959     c->pix_abs[0][3] = pix_abs16_xy2_c;
2960     c->pix_abs[1][0] = pix_abs8_c;
2961     c->pix_abs[1][1] = pix_abs8_x2_c;
2962     c->pix_abs[1][2] = pix_abs8_y2_c;
2963     c->pix_abs[1][3] = pix_abs8_xy2_c;
2964
         /* Third-pel tables: only indices 0-2, 4-6 and 8-10 are assigned. */
2965     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2966     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2967     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2968     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2969     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2970     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2971     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2972     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2973     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2974
2975     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2976     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2977     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2978     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2979     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2980     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2981     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2982     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2983     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2984
         /* Fill one 16-entry quarter-pel table; the function with suffix
          * mcXY goes to index X + 4*Y. */
2985 #define dspfunc(PFX, IDX, NUM) \
2986     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2987     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2988     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2989     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2990     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2991     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2992     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2993     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2994     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2995     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2996     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2997     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2998     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2999     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3000     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3001     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3002
3003     dspfunc(put_qpel, 0, 16);
3004     dspfunc(put_no_rnd_qpel, 0, 16);
3005
3006     dspfunc(avg_qpel, 0, 16);
3007     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3008
3009     dspfunc(put_qpel, 1, 8);
3010     dspfunc(put_no_rnd_qpel, 1, 8);
3011
3012     dspfunc(avg_qpel, 1, 8);
3013     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3014
3015 #undef dspfunc
3016
         /* Codec-specific DSP initializers, compiled in per configuration. */
3017 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3018     ff_mlp_init(c, avctx);
3019 #endif
3020 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3021     ff_intrax8dsp_init(c,avctx);
3022 #endif
3023 #if CONFIG_RV30_DECODER
3024     ff_rv30dsp_init(c,avctx);
3025 #endif
3026 #if CONFIG_RV40_DECODER
3027     ff_rv40dsp_init(c,avctx);
3028     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3029     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3030     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3031     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3032 #endif
3033
3034     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3035     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3036     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3037     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3038     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3039     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3040     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3041     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3042
         /* Install the <name>16_c function at index 0 and the <name>8x8_c
          * function at index 1 of a comparison-function table. */
3043 #define SET_CMP_FUNC(name) \
3044     c->name[0]= name ## 16_c;\
3045     c->name[1]= name ## 8x8_c;
3046
3047     SET_CMP_FUNC(hadamard8_diff)
3048     c->hadamard8_diff[4]= hadamard8_intra16_c;
3049     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3050     SET_CMP_FUNC(dct_sad)
3051     SET_CMP_FUNC(dct_max)
3052 #if CONFIG_GPL
3053     SET_CMP_FUNC(dct264_sad)
3054 #endif
3055     c->sad[0]= pix_abs16_c;
3056     c->sad[1]= pix_abs8_c;
3057     c->sse[0]= sse16_c;
3058     c->sse[1]= sse8_c;
3059     c->sse[2]= sse4_c;
3060     SET_CMP_FUNC(quant_psnr)
3061     SET_CMP_FUNC(rd)
3062     SET_CMP_FUNC(bit)
3063     c->vsad[0]= vsad16_c;
3064     c->vsad[4]= vsad_intra16_c;
3065     c->vsad[5]= vsad_intra8_c;
3066     c->vsse[0]= vsse16_c;
3067     c->vsse[4]= vsse_intra16_c;
3068     c->vsse[5]= vsse_intra8_c;
3069     c->nsse[0]= nsse16_c;
3070     c->nsse[1]= nsse8_c;
3071 #if CONFIG_DWT
3072     ff_dsputil_init_dwt(c);
3073 #endif
3074
3075     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3076
3077     c->add_bytes= add_bytes_c;
3078     c->add_bytes_l2= add_bytes_l2_c;
3079     c->diff_bytes= diff_bytes_c;
3080     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3081     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3082     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3083     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3084     c->bswap_buf= bswap_buf;
3085     c->bswap16_buf = bswap16_buf;
3086 #if CONFIG_PNG_DECODER
3087     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3088 #endif
3089
3090     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3091         c->h263_h_loop_filter= h263_h_loop_filter_c;
3092         c->h263_v_loop_filter= h263_v_loop_filter_c;
3093     }
3094
3095     if (CONFIG_VP3_DECODER) {
3096         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3097         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3098         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3099     }
3100
3101     c->h261_loop_filter= h261_loop_filter_c;
3102
3103     c->try_8x8basis= try_8x8basis_c;
3104     c->add_8x8basis= add_8x8basis_c;
3105
3106 #if CONFIG_VORBIS_DECODER
3107     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3108 #endif
3109 #if CONFIG_AC3_DECODER
3110     c->ac3_downmix = ff_ac3_downmix_c;
3111 #endif
         /* Float and integer vector helpers. */
3112     c->vector_fmul = vector_fmul_c;
3113     c->vector_fmul_reverse = vector_fmul_reverse_c;
3114     c->vector_fmul_add = vector_fmul_add_c;
3115     c->vector_fmul_window = vector_fmul_window_c;
3116     c->vector_clipf = vector_clipf_c;
3117     c->scalarproduct_int16 = scalarproduct_int16_c;
3118     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3119     c->apply_window_int16 = apply_window_int16_c;
3120     c->vector_clip_int32 = vector_clip_int32_c;
3121     c->scalarproduct_float = scalarproduct_float_c;
3122     c->butterflies_float = butterflies_float_c;
3123     c->vector_fmul_scalar = vector_fmul_scalar_c;
3124
3125     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3126     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3127
3128     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3129     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3130
3131     c->shrink[0]= av_image_copy_plane;
3132     c->shrink[1]= ff_shrink22;
3133     c->shrink[2]= ff_shrink44;
3134     c->shrink[3]= ff_shrink88;
3135
3136     c->prefetch= just_return;
3137
         /* Cleared here so that the fallback loop further down can detect
          * entries nobody has set. */
3138     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3139     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3140
         /* Name-pasting helpers: FUNC(f, d) -> f_d, FUNCC(f, d) -> f_d_c. */
3141 #undef FUNC
3142 #undef FUNCC
3143 #define FUNC(f, depth) f ## _ ## depth
3144 #define FUNCC(f, depth) f ## _ ## depth ## _c
3145
3146 #define dspfunc1(PFX, IDX, NUM, depth)\
3147     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3148     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3149     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3150     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3151
3152 #define dspfunc2(PFX, IDX, NUM, depth)\
3153     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3154     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3155     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3156     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3157     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3158     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3159     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3160     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3161     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3162     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3163     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3164     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3165     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3166     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3167     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3168     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3169
3170
         /* Installs every function that exists once per bit depth. The macro
          * ends with a semicolon, so it is invoked without one. */
3171 #define BIT_DEPTH_FUNCS(depth)\
3172     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3173     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3174     c->clear_block                   = FUNCC(clear_block           , depth);\
3175     c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
3176     c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
3177     c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
3178     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3179     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3180 \
3181     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3182     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3183     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3184     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3185     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3186     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3187 \
3188     dspfunc1(put       , 0, 16, depth);\
3189     dspfunc1(put       , 1,  8, depth);\
3190     dspfunc1(put       , 2,  4, depth);\
3191     dspfunc1(put       , 3,  2, depth);\
3192     dspfunc1(put_no_rnd, 0, 16, depth);\
3193     dspfunc1(put_no_rnd, 1,  8, depth);\
3194     dspfunc1(avg       , 0, 16, depth);\
3195     dspfunc1(avg       , 1,  8, depth);\
3196     dspfunc1(avg       , 2,  4, depth);\
3197     dspfunc1(avg       , 3,  2, depth);\
3198     dspfunc1(avg_no_rnd, 0, 16, depth);\
3199     dspfunc1(avg_no_rnd, 1,  8, depth);\
3200 \
3201     dspfunc2(put_h264_qpel, 0, 16, depth);\
3202     dspfunc2(put_h264_qpel, 1,  8, depth);\
3203     dspfunc2(put_h264_qpel, 2,  4, depth);\
3204     dspfunc2(put_h264_qpel, 3,  2, depth);\
3205     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3206     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3207     dspfunc2(avg_h264_qpel, 2,  4, depth);
3208
         /* Only H.264 selects the 9/10-bit variants; every other codec, and
          * any unsupported depth, gets the 8-bit functions. */
3209     if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3210         BIT_DEPTH_FUNCS(8)
3211     } else {
3212         switch (avctx->bits_per_raw_sample) {
3213             case 9:
3214                 BIT_DEPTH_FUNCS(9)
3215                 break;
3216             case 10:
3217                 BIT_DEPTH_FUNCS(10)
3218                 break;
3219             default:
3220                 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3221                 BIT_DEPTH_FUNCS(8)
3222                 break;
3223         }
3224     }
3225
3226
         /* Platform-specific initializers, called after the C defaults above
          * have been installed. */
3227     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3228     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3229     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3230     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3231     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3232     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3233     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3234     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3235     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3236
         /* Any 2-tap qpel entry that is still zero falls back to the h264
          * qpel function at the same position.
          * NOTE(review): [0][i] with i up to 63 walks the two-dimensional
          * tables as flat arrays -- confirm all four tables really hold at
          * least 64 entries. */
3237     for(i=0; i<64; i++){
3238         if(!c->put_2tap_qpel_pixels_tab[0][i])
3239             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3240         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3241             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3242     }
3243
         /* The first entry of the rv30/rv40 tables reuses the h264 qpel
          * function at the same position. */
3244     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3245     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3246     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3247     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3248
3249     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3250     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3251     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3252     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3253
         /* Build the 64-entry coefficient permutation for the permutation
          * type selected above (or changed by a platform initializer). */
3254     switch(c->idct_permutation_type){
3255     case FF_NO_IDCT_PERM:
3256         for(i=0; i<64; i++)
3257             c->idct_permutation[i]= i;
3258         break;
3259     case FF_LIBMPEG2_IDCT_PERM:
3260         for(i=0; i<64; i++)
3261             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3262         break;
3263     case FF_SIMPLE_IDCT_PERM:
3264         for(i=0; i<64; i++)
3265             c->idct_permutation[i]= simple_mmx_permutation[i];
3266         break;
3267     case FF_TRANSPOSE_IDCT_PERM:
             /* swap the row (bits 3-5) and column (bits 0-2) indices */
3268         for(i=0; i<64; i++)
3269             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3270         break;
3271     case FF_PARTTRANS_IDCT_PERM:
3272         for(i=0; i<64; i++)
3273             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3274         break;
3275     case FF_SSE2_IDCT_PERM:
3276         for(i=0; i<64; i++)
3277             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3278         break;
3279     default:
3280         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3281     }
3282 }
3283