libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
  61 const uint8_t ff_zigzag_direct[64] = {
  62     0,   1,  8, 16,  9,  2,  3, 10,
  63     17, 24, 32, 25, 18, 11,  4,  5,
  64     12, 19, 26, 33, 40, 48, 41, 34,
  65     27, 20, 13,  6,  7, 14, 21, 28,
  66     35, 42, 49, 56, 57, 50, 43, 36,
  67     29, 22, 15, 23, 30, 37, 44, 51,
  68     58, 59, 52, 45, 38, 31, 39, 46,
  69     53, 60, 61, 54, 47, 55, 62, 63
  70 };
  71
  72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  73    specification, we interleave the fields */
  74 const uint8_t ff_zigzag248_direct[64] = {
  75      0,  8,  1,  9, 16, 24,  2, 10,
  76     17, 25, 32, 40, 48, 56, 33, 41,
  77     18, 26,  3, 11,  4, 12, 19, 27,
  78     34, 42, 49, 57, 50, 58, 35, 43,
  79     20, 28,  5, 13,  6, 14, 21, 29,
  80     36, 44, 51, 59, 52, 60, 37, 45,
  81     22, 30,  7, 15, 23, 31, 38, 46,
  82     53, 61, 54, 62, 39, 47, 55, 63,
  83 };
  84
  85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
  88 const uint8_t ff_alternate_horizontal_scan[64] = {
  89     0,  1,   2,  3,  8,  9, 16, 17,
  90     10, 11,  4,  5,  6,  7, 15, 14,
  91     13, 12, 19, 18, 24, 25, 32, 33,
  92     26, 27, 20, 21, 22, 23, 28, 29,
  93     30, 31, 34, 35, 40, 41, 48, 49,
  94     42, 43, 36, 37, 38, 39, 44, 45,
  95     46, 47, 50, 51, 56, 57, 58, 59,
  96     52, 53, 54, 55, 60, 61, 62, 63,
  97 };
  98
  99 const uint8_t ff_alternate_vertical_scan[64] = {
 100     0,  8,  16, 24,  1,  9,  2, 10,
 101     17, 25, 32, 40, 48, 56, 57, 49,
 102     41, 33, 26, 18,  3, 11,  4, 12,
 103     19, 27, 34, 42, 50, 58, 35, 43,
 104     51, 59, 20, 28,  5, 13,  6, 14,
 105     21, 29, 36, 44, 52, 60, 37, 45,
 106     53, 61, 22, 30,  7, 15, 23, 31,
 107     38, 46, 54, 62, 39, 47, 55, 63,
 108 };
 109
 110 /* Input permutation for the simple_idct_mmx */
 111 static const uint8_t simple_mmx_permutation[64]={
 112         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 113         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 114         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 115         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 116         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 117         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 118         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 119         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 120 };
 121
 122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 148 static int pix_sum_c(uint8_t * pix, int line_size)
 149 {
 150     int s, i, j;
 151
 152     s = 0;
 153     for (i = 0; i < 16; i++) {
 154         for (j = 0; j < 16; j += 8) {
 155             s += pix[0];
 156             s += pix[1];
 157             s += pix[2];
 158             s += pix[3];
 159             s += pix[4];
 160             s += pix[5];
 161             s += pix[6];
 162             s += pix[7];
 163             pix += 8;
 164         }
 165         pix += line_size - 16;
 166     }
 167     return s;
 168 }
 169
 170 static int pix_norm1_c(uint8_t * pix, int line_size)
 171 {
 172     int s, i, j;
 173     uint32_t *sq = ff_squareTbl + 256;
 174
 175     s = 0;
 176     for (i = 0; i < 16; i++) {
 177         for (j = 0; j < 16; j += 8) {
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 226 {
 227     while (len--)
 228         *dst++ = av_bswap16(*src++);
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 300 {
 301     int i;
 302
 303     /* read the pixels */
 304     for(i=0;i<8;i++) {
 305         block[0] = pixels[0];
 306         block[1] = pixels[1];
 307         block[2] = pixels[2];
 308         block[3] = pixels[3];
 309         block[4] = pixels[4];
 310         block[5] = pixels[5];
 311         block[6] = pixels[6];
 312         block[7] = pixels[7];
 313         pixels += line_size;
 314         block += 8;
 315     }
 316 }
 317
 318 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 319                           const uint8_t *s2, int stride){
 320     int i;
 321
 322     /* read the pixels */
 323     for(i=0;i<8;i++) {
 324         block[0] = s1[0] - s2[0];
 325         block[1] = s1[1] - s2[1];
 326         block[2] = s1[2] - s2[2];
 327         block[3] = s1[3] - s2[3];
 328         block[4] = s1[4] - s2[4];
 329         block[5] = s1[5] - s2[5];
 330         block[6] = s1[6] - s2[6];
 331         block[7] = s1[7] - s2[7];
 332         s1 += stride;
 333         s2 += stride;
 334         block += 8;
 335     }
 336 }
 337
 338
 339 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 340                              int line_size)
 341 {
 342     int i;
 343     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 344
 345     /* read the pixels */
 346     for(i=0;i<8;i++) {
 347         pixels[0] = cm[block[0]];
 348         pixels[1] = cm[block[1]];
 349         pixels[2] = cm[block[2]];
 350         pixels[3] = cm[block[3]];
 351         pixels[4] = cm[block[4]];
 352         pixels[5] = cm[block[5]];
 353         pixels[6] = cm[block[6]];
 354         pixels[7] = cm[block[7]];
 355
 356         pixels += line_size;
 357         block += 8;
 358     }
 359 }
 360
 361 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 362                                  int line_size)
 363 {
 364     int i;
 365     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 366
 367     /* read the pixels */
 368     for(i=0;i<4;i++) {
 369         pixels[0] = cm[block[0]];
 370         pixels[1] = cm[block[1]];
 371         pixels[2] = cm[block[2]];
 372         pixels[3] = cm[block[3]];
 373
 374         pixels += line_size;
 375         block += 8;
 376     }
 377 }
 378
 379 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 380                                  int line_size)
 381 {
 382     int i;
 383     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 384
 385     /* read the pixels */
 386     for(i=0;i<2;i++) {
 387         pixels[0] = cm[block[0]];
 388         pixels[1] = cm[block[1]];
 389
 390         pixels += line_size;
 391         block += 8;
 392     }
 393 }
 394
 395 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 396                                     uint8_t *restrict pixels,
 397                                     int line_size)
 398 {
 399     int i, j;
 400
 401     for (i = 0; i < 8; i++) {
 402         for (j = 0; j < 8; j++) {
 403             if (*block < -128)
 404                 *pixels = 0;
 405             else if (*block > 127)
 406                 *pixels = 255;
 407             else
 408                 *pixels = (uint8_t)(*block + 128);
 409             block++;
 410             pixels++;
 411         }
 412         pixels += (line_size - 8);
 413     }
 414 }
 415
 416 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 417                                     int line_size)
 418 {
 419     int i;
 420
 421     /* read the pixels */
 422     for(i=0;i<8;i++) {
 423         pixels[0] = block[0];
 424         pixels[1] = block[1];
 425         pixels[2] = block[2];
 426         pixels[3] = block[3];
 427         pixels[4] = block[4];
 428         pixels[5] = block[5];
 429         pixels[6] = block[6];
 430         pixels[7] = block[7];
 431
 432         pixels += line_size;
 433         block += 8;
 434     }
 435 }
 436
 437 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 438                              int line_size)
 439 {
 440     int i;
 441     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 442
 443     /* read the pixels */
 444     for(i=0;i<8;i++) {
 445         pixels[0] = cm[pixels[0] + block[0]];
 446         pixels[1] = cm[pixels[1] + block[1]];
 447         pixels[2] = cm[pixels[2] + block[2]];
 448         pixels[3] = cm[pixels[3] + block[3]];
 449         pixels[4] = cm[pixels[4] + block[4]];
 450         pixels[5] = cm[pixels[5] + block[5]];
 451         pixels[6] = cm[pixels[6] + block[6]];
 452         pixels[7] = cm[pixels[7] + block[7]];
 453         pixels += line_size;
 454         block += 8;
 455     }
 456 }
 457
 458 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 459                           int line_size)
 460 {
 461     int i;
 462     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 463
 464     /* read the pixels */
 465     for(i=0;i<4;i++) {
 466         pixels[0] = cm[pixels[0] + block[0]];
 467         pixels[1] = cm[pixels[1] + block[1]];
 468         pixels[2] = cm[pixels[2] + block[2]];
 469         pixels[3] = cm[pixels[3] + block[3]];
 470         pixels += line_size;
 471         block += 8;
 472     }
 473 }
 474
 475 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 476                           int line_size)
 477 {
 478     int i;
 479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 480
 481     /* read the pixels */
 482     for(i=0;i<2;i++) {
 483         pixels[0] = cm[pixels[0] + block[0]];
 484         pixels[1] = cm[pixels[1] + block[1]];
 485         pixels += line_size;
 486         block += 8;
 487     }
 488 }
 489
 490 static int sum_abs_dctelem_c(DCTELEM *block)
 491 {
 492     int sum=0, i;
 493     for(i=0; i<64; i++)
 494         sum+= FFABS(block[i]);
 495     return sum;
 496 }
 497
 498 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 499 {
 500     int i;
 501
 502     for (i = 0; i < h; i++) {
 503         memset(block, value, 16);
 504         block += line_size;
 505     }
 506 }
 507
 508 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 509 {
 510     int i;
 511
 512     for (i = 0; i < h; i++) {
 513         memset(block, value, 8);
 514         block += line_size;
 515     }
 516 }
 517
 518 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 519 {
 520     int i, j;
 521     uint16_t *dst1 = (uint16_t *) dst;
 522     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 523
 524     for (j = 0; j < 8; j++) {
 525         for (i = 0; i < 8; i++) {
 526             dst1[i] = dst2[i] = src[i] * 0x0101;
 527         }
 528         src  += 8;
 529         dst1 += linesize;
 530         dst2 += linesize;
 531     }
 532 }
 533
 534 #define avg2(a,b) ((a+b+1)>>1)
 535 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 536
 537 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 538 {
 539     const int A=(16-x16)*(16-y16);
 540     const int B=(   x16)*(16-y16);
 541     const int C=(16-x16)*(   y16);
 542     const int D=(   x16)*(   y16);
 543     int i;
 544
 545     for(i=0; i<h; i++)
 546     {
 547         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 548         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 549         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 550         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 551         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 552         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 553         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 554         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 555         dst+= stride;
 556         src+= stride;
 557     }
 558 }
 559
 560 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 561                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 562 {
 563     int y, vx, vy;
 564     const int s= 1<<shift;
 565
 566     width--;
 567     height--;
 568
 569     for(y=0; y<h; y++){
 570         int x;
 571
 572         vx= ox;
 573         vy= oy;
 574         for(x=0; x<8; x++){ //XXX FIXME optimize
 575             int src_x, src_y, frac_x, frac_y, index;
 576
 577             src_x= vx>>16;
 578             src_y= vy>>16;
 579             frac_x= src_x&(s-1);
 580             frac_y= src_y&(s-1);
 581             src_x>>=shift;
 582             src_y>>=shift;
 583
 584             if((unsigned)src_x < width){
 585                 if((unsigned)src_y < height){
 586                     index= src_x + src_y*stride;
 587                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 588                                            + src[index       +1]*   frac_x )*(s-frac_y)
 589                                         + (  src[index+stride  ]*(s-frac_x)
 590                                            + src[index+stride+1]*   frac_x )*   frac_y
 591                                         + r)>>(shift*2);
 592                 }else{
 593                     index= src_x + av_clip(src_y, 0, height)*stride;
 594                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 595                                           + src[index       +1]*   frac_x )*s
 596                                         + r)>>(shift*2);
 597                 }
 598             }else{
 599                 if((unsigned)src_y < height){
 600                     index= av_clip(src_x, 0, width) + src_y*stride;
 601                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 602                                            + src[index+stride  ]*   frac_y )*s
 603                                         + r)>>(shift*2);
 604                 }else{
 605                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 606                     dst[y*stride + x]=    src[index         ];
 607                 }
 608             }
 609
 610             vx+= dxx;
 611             vy+= dyx;
 612         }
 613         ox += dxy;
 614         oy += dyy;
 615     }
 616 }
 617
 618 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 619     switch(width){
 620     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 621     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 622     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 623     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 624     }
 625 }
 626
 627 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 628     int i,j;
 629     for (i=0; i < height; i++) {
 630       for (j=0; j < width; j++) {
 631         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 632       }
 633       src += stride;
 634       dst += stride;
 635     }
 636 }
 637
 638 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 639     int i,j;
 640     for (i=0; i < height; i++) {
 641       for (j=0; j < width; j++) {
 642         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 643       }
 644       src += stride;
 645       dst += stride;
 646     }
 647 }
 648
 649 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 650     int i,j;
 651     for (i=0; i < height; i++) {
 652       for (j=0; j < width; j++) {
 653         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 654       }
 655       src += stride;
 656       dst += stride;
 657     }
 658 }
 659
 660 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 661     int i,j;
 662     for (i=0; i < height; i++) {
 663       for (j=0; j < width; j++) {
 664         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 665       }
 666       src += stride;
 667       dst += stride;
 668     }
 669 }
 670
 671 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 672     int i,j;
 673     for (i=0; i < height; i++) {
 674       for (j=0; j < width; j++) {
 675         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 676       }
 677       src += stride;
 678       dst += stride;
 679     }
 680 }
 681
 682 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 683     int i,j;
 684     for (i=0; i < height; i++) {
 685       for (j=0; j < width; j++) {
 686         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 687       }
 688       src += stride;
 689       dst += stride;
 690     }
 691 }
 692
 693 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 694     int i,j;
 695     for (i=0; i < height; i++) {
 696       for (j=0; j < width; j++) {
 697         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 698       }
 699       src += stride;
 700       dst += stride;
 701     }
 702 }
 703
 704 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 705     int i,j;
 706     for (i=0; i < height; i++) {
 707       for (j=0; j < width; j++) {
 708         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 709       }
 710       src += stride;
 711       dst += stride;
 712     }
 713 }
 714
 715 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 716     switch(width){
 717     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 718     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 719     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 720     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 721     }
 722 }
 723
 724 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 725     int i,j;
 726     for (i=0; i < height; i++) {
 727       for (j=0; j < width; j++) {
 728         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 729       }
 730       src += stride;
 731       dst += stride;
 732     }
 733 }
 734
 735 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 736     int i,j;
 737     for (i=0; i < height; i++) {
 738       for (j=0; j < width; j++) {
 739         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 740       }
 741       src += stride;
 742       dst += stride;
 743     }
 744 }
 745
 746 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 747     int i,j;
 748     for (i=0; i < height; i++) {
 749       for (j=0; j < width; j++) {
 750         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 751       }
 752       src += stride;
 753       dst += stride;
 754     }
 755 }
 756
 757 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 758     int i,j;
 759     for (i=0; i < height; i++) {
 760       for (j=0; j < width; j++) {
 761         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 762       }
 763       src += stride;
 764       dst += stride;
 765     }
 766 }
 767
 768 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 769     int i,j;
 770     for (i=0; i < height; i++) {
 771       for (j=0; j < width; j++) {
 772         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 773       }
 774       src += stride;
 775       dst += stride;
 776     }
 777 }
 778
 779 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 780     int i,j;
 781     for (i=0; i < height; i++) {
 782       for (j=0; j < width; j++) {
 783         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 784       }
 785       src += stride;
 786       dst += stride;
 787     }
 788 }
 789
 790 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 791     int i,j;
 792     for (i=0; i < height; i++) {
 793       for (j=0; j < width; j++) {
 794         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 795       }
 796       src += stride;
 797       dst += stride;
 798     }
 799 }
 800
 801 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 802     int i,j;
 803     for (i=0; i < height; i++) {
 804       for (j=0; j < width; j++) {
 805         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 806       }
 807       src += stride;
 808       dst += stride;
 809     }
 810 }
 811
 812 #define QPEL_MC(r, OPNAME, RND, OP) \
 813 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 814     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 815     int i;\
 816     for(i=0; i<h; i++)\
 817     {\
 818         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 819         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 820         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 821         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 822         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 823         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 824         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 825         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 826         dst+=dstStride;\
 827         src+=srcStride;\
 828     }\
 829 }\
 830 \
 831 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 832     const int w=8;\
 833     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 834     int i;\
 835     for(i=0; i<w; i++)\
 836     {\
 837         const int src0= src[0*srcStride];\
 838         const int src1= src[1*srcStride];\
 839         const int src2= src[2*srcStride];\
 840         const int src3= src[3*srcStride];\
 841         const int src4= src[4*srcStride];\
 842         const int src5= src[5*srcStride];\
 843         const int src6= src[6*srcStride];\
 844         const int src7= src[7*srcStride];\
 845         const int src8= src[8*srcStride];\
 846         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 847         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 848         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 849         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 850         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 851         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 852         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 853         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 854         dst++;\
 855         src++;\
 856     }\
 857 }\
 858 \
/* Horizontal lowpass over h rows; each row reads src[0..16] and emits 16 values.\
 * Output i sits between src[i] and src[i+1] and weights the surrounding sample\
 * pairs by 20, -6, 3 and -1 (moving outwards). Near both row ends the taps that\
 * would fall outside src[0..16] are mirrored back inside, which is why the\
 * first and last three lines repeat indices. The raw sum is handed to OP; the\
 * op_* macros defined after QPEL_MC add 15 or 16, shift right by 5 and index cm. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* only referenced through the OP expansion */\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
 885 \
/* Vertical counterpart of the 16-wide horizontal lowpass: walks 16 columns,\
 * loads the 17 samples src[0..16*srcStride] of the column into locals first,\
 * then emits 16 outputs down the column with the same 20/-6/3/-1 pair weights\
 * and the same mirroring of taps at the top and bottom ends. Each raw sum is\
 * passed to OP, which is expected to scale it and store it into dst. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* only referenced through the OP expansion */\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
 929 \
/* qpel8 mc10: run the put/RND horizontal lowpass over 8 rows of src into a\
 * packed 8x8 scratch block, then let OPNAME pixels8_l2_8 merge src with that\
 * block into dst (presumably a two-source average; helper not visible here). */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
}\
 935 \
/* qpel8 mc20: horizontal lowpass of 8 rows written directly to dst using the\
 * OPNAME flavour of the filter (so OPNAME's store operator is applied). */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
 939 \
/* qpel8 mc30: same scratch block as mc10, but the merge partner is src+1,\
 * i.e. the samples one position to the right of src. */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
}\
 945 \
/* qpel8 mc01: copy_block9 stages 9 rows of src in `full` (row stride 16), the\
 * put/RND vertical lowpass turns that into the packed 8x8 `half`, and\
 * OPNAME pixels8_l2_8 merges `full` (from its first row) with `half` into dst. */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
}\
 953 \
/* qpel8 mc02: stage 9 rows of src in `full` (row stride 16) and run the OPNAME\
 * vertical lowpass from that copy straight into dst. */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
 959 \
/* qpel8 mc03: like mc01, except the merge partner starts at full+16, which is\
 * the second row of the staged copy (row stride is 16). */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
}\
/* Non-static "old" mc11 variant. Builds three filtered planes from the staged\
 * copy: halfH (horizontal lowpass, 9 rows), halfV (vertical lowpass of full)\
 * and halfHV (vertical lowpass of halfH), then hands full, halfH, halfV and\
 * halfHV to OPNAME pixels8_l4_8 (presumably a four-source average). */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* qpel8 mc11: halfH (9 rows of horizontal lowpass) is first merged in place\
 * with the staged source `full`, then filtered vertically into halfHV; dst is\
 * the OPNAME merge of the updated halfH (from row 0) with halfHV. */\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* Non-static "old" mc31 variant: same three planes as mc11_old, but halfV is\
 * filtered from full+1 and the four-source merge also reads full+1 (the staged\
 * copy shifted one sample to the right). */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* qpel8 mc31: as mc11, but the in-place merge of halfH uses full+1 (staged\
 * source shifted one sample right) before the vertical lowpass. */\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* Non-static "old" mc13 variant: planes are built as in mc11_old, but the\
 * four-source merge reads full+16 and halfH+8, i.e. both start one row lower\
 * (row strides 16 and 8 respectively). */\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* qpel8 mc13: as mc11, but the final merge takes halfH+8, the second row of\
 * the in-place-merged horizontal plane, as its partner for halfHV. */\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* Non-static "old" mc33 variant: halfV comes from full+1, and the four-source\
 * merge reads full+17 (one row down and one sample right in the stride-16\
 * copy) together with halfH+8 (one row down in halfH). */\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* qpel8 mc33: halfH is merged in place with full+1, filtered vertically into\
 * halfHV, and dst is the OPNAME merge of halfH+8 (second row onward) with halfHV. */\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* qpel8 mc21: filter 9 rows of src horizontally into halfH (no staged copy),\
 * filter halfH vertically into halfHV, then OPNAME-merge halfH (from row 0)\
 * with halfHV into dst. */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* qpel8 mc23: identical pipeline to mc21, except the merge partner is halfH+8,\
 * i.e. the horizontal plane starting at its second row. */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* Non-static "old" mc12 variant: builds halfV (vertical lowpass of the staged\
 * copy) and halfHV (vertical lowpass of the horizontal plane halfH), then\
 * OPNAME-merges just those two into dst; halfH itself only feeds halfHV. */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
/* qpel8 mc12: halfH (9 rows of horizontal lowpass) is merged in place with the\
 * staged source, and the OPNAME vertical lowpass of that plane is written\
 * directly to dst. */\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* Non-static "old" mc32 variant: as mc12_old, but halfV is the vertical lowpass\
 * of full+1 (staged copy shifted one sample right). */\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
/* qpel8 mc32: as mc12, but the in-place merge of halfH uses full+1 before the\
 * OPNAME vertical lowpass writes dst. */\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* qpel8 mc22: horizontal lowpass of 9 source rows into halfH, followed by the\
 * OPNAME vertical lowpass of halfH straight into dst. */\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
1108 \
/* qpel16 mc10: horizontal lowpass of 16 rows into a packed 16x16 scratch block,\
 * then OPNAME pixels16_l2_8 merges src with that block into dst (presumably a\
 * two-source average; helper not visible here). */\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
}\
1114 \
/* qpel16 mc20: OPNAME horizontal lowpass of 16 rows written directly to dst. */\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
1118 \
/* qpel16 mc30: same scratch block as mc10, merged with src+1 (one sample to\
 * the right) instead of src. */\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
}\
1124 \
/* qpel16 mc01: copy_block17 stages 17 rows of src in `full` (row stride 24),\
 * the put/RND vertical lowpass produces the packed 16x16 `half`, and\
 * OPNAME pixels16_l2_8 merges `full` (from its first row) with `half`. */\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
}\
1132 \
/* qpel16 mc02: stage 17 rows of src in `full` (row stride 24) and run the\
 * OPNAME vertical lowpass from that copy straight into dst. */\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
1138 \
/* qpel16 mc03: like mc01, except the merge partner starts at full+24, the\
 * second row of the staged copy (row stride is 24). */\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
}\
/* Non-static "old" 16x16 mc11 variant. Builds halfH (horizontal lowpass,\
 * 17 rows), halfV (vertical lowpass of full) and halfHV (vertical lowpass of\
 * halfH), then passes full, halfH, halfV and halfHV to OPNAME pixels16_l4_8\
 * (presumably a four-source average). */\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
/* qpel16 mc11: halfH (17 rows of horizontal lowpass) is merged in place with\
 * the staged source `full`, filtered vertically into halfHV, and dst is the\
 * OPNAME merge of the updated halfH (from row 0) with halfHV. */\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
/* Non-static "old" 16x16 mc31 variant: as mc11_old, but halfV is filtered\
 * from full+1 and the four-source merge also reads full+1 (staged copy shifted\
 * one sample right). */\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
/* qpel16 mc31: as mc11, but the in-place merge of halfH uses full+1 (staged\
 * source shifted one sample right) before the vertical lowpass. */\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
/* Non-static "old" 16x16 mc13 variant: planes are built as in mc11_old, but\
 * the four-source merge reads full+24 and halfH+16, i.e. both start one row\
 * lower (row strides 24 and 16 respectively). */\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
/* qpel16 mc13: as mc11, but the final merge takes halfH+16, the second row of\
 * the in-place-merged horizontal plane, as its partner for halfHV. */\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
/* Non-static "old" 16x16 mc33 variant: halfV comes from full+1, and the\
 * four-source merge reads full+25 (one row down and one sample right in the\
 * stride-24 copy) together with halfH+16 (one row down in halfH). */\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
/* qpel16 mc33: halfH is merged in place with full+1, filtered vertically into\
 * halfHV, and dst is the OPNAME merge of halfH+16 (second row onward) with\
 * halfHV. */\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
/* qpel16 mc21: filter 17 rows of src horizontally into halfH (no staged copy),\
 * filter halfH vertically into halfHV, then OPNAME-merge halfH (from row 0)\
 * with halfHV into dst. */\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
/* qpel16 mc23: identical pipeline to mc21, except the merge partner is\
 * halfH+16, i.e. the horizontal plane starting at its second row. */\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
/* Non-static "old" 16x16 mc12 variant: builds halfV (vertical lowpass of the\
 * staged copy) and halfHV (vertical lowpass of the horizontal plane halfH),\
 * then OPNAME-merges just those two into dst; halfH only feeds halfHV. */\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
/* qpel16 mc12: halfH (17 rows of horizontal lowpass) is merged in place with\
 * the staged source, and the OPNAME vertical lowpass of that plane is written\
 * directly to dst. */\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
/* Non-static "old" 16x16 mc32 variant: as mc12_old, but halfV is the vertical\
 * lowpass of full+1 (staged copy shifted one sample right). */\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
/* qpel16 mc32: as mc12, but the in-place merge of halfH uses full+1 before the\
 * OPNAME vertical lowpass writes dst. */\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
/* qpel16 mc22: horizontal lowpass of 17 source rows into halfH, followed by\
 * the OPNAME vertical lowpass of halfH straight into dst. This is the last\
 * function of the macro body, so its closing brace carries no continuation. */\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
1287
/* Store operators passed to QPEL_MC as its OP argument. `b` is the raw filter
 * sum and `a` the destination lvalue. Every variant adds a rounding constant
 * (16, or 15 for the no_rnd forms), shifts right by 5 and indexes the `cm`
 * table that each generated function declares locally. The avg forms then
 * average that value with the current contents of `a` (op_avg adds 1 before
 * halving, op_avg_no_rnd does not). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the quarter-pel function families: put_, put_no_rnd_ and avg_.
 * The avg_no_rnd instantiation below is disabled in the source. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only needed while expanding QPEL_MC. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1301
/* The mc00 entries are not generated by QPEL_MC; they are aliased to existing
 * whole-block put/avg routines. Both put_no_rnd aliases point at the same kind
 * of "put" routine as the rounding versions.
 * NOTE(review): put_no_rnd_qpel16_mc00_c maps to ff_put_pixels16x16_8_c while
 * the other five aliases use names without the _8 infix - confirm this is the
 * intended symbol. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1308
1309 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1310     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1311     int i;
1312
1313     for(i=0; i<h; i++){
1314         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1315         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1316         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1317         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1318         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1319         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1320         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1321         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1322         dst+=dstStride;
1323         src+=srcStride;
1324     }
1325 }
1326
1327 #if CONFIG_RV40_DECODER
/* RV40 16x16 mc33, put flavour: forwards to put_pixels16_xy2_8_c for 16 rows. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/* RV40 16x16 mc33, avg flavour: forwards to avg_pixels16_xy2_8_c for 16 rows. */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/* RV40 8x8 mc33, put flavour: forwards to put_pixels8_xy2_8_c for 8 rows. */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/* RV40 8x8 mc33, avg flavour: forwards to avg_pixels8_xy2_8_c for 8 rows. */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
1340 #endif /* CONFIG_RV40_DECODER */
1341
1342 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1343     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1344     int i;
1345
1346     for(i=0; i<w; i++){
1347         const int src_1= src[ -srcStride];
1348         const int src0 = src[0          ];
1349         const int src1 = src[  srcStride];
1350         const int src2 = src[2*srcStride];
1351         const int src3 = src[3*srcStride];
1352         const int src4 = src[4*srcStride];
1353         const int src5 = src[5*srcStride];
1354         const int src6 = src[6*srcStride];
1355         const int src7 = src[7*srcStride];
1356         const int src8 = src[8*srcStride];
1357         const int src9 = src[9*srcStride];
1358         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1359         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1360         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1361         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1362         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1363         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1364         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1365         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1366         src++;
1367         dst++;
1368     }
1369 }
1370
/* mspel8 mc10: horizontally lowpass 8 rows of src into a packed 8x8 scratch
 * block, then combine src with that block into dst via put_pixels8_l2_8. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1376
/* mspel8 mc20: horizontal lowpass of 8 rows written directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1380
/* mspel8 mc30: same scratch block as mc10, but combined with src+1 (the
 * samples one position to the right) instead of src. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1386
/* mspel8 mc02: vertical lowpass of 8 columns written directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1390
/* mspel8 mc12: combines a vertically filtered plane with a plane filtered in
 * both directions.
 * - rows_h: 11 rows of horizontal lowpass starting one row above src
 * - col_v:  vertical lowpass of src itself
 * - both:   vertical lowpass of rows_h, entered at its second row so that the
 *           row above is available to the filter */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11 * 8];
    uint8_t col_v[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(col_v, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, col_v, both, stride, 8, 8, 8);
}
/* mspel8 mc32: like mc12, except the purely vertical plane is filtered from
 * src+1 (one sample to the right). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11 * 8];
    uint8_t col_v[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(col_v, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, col_v, both, stride, 8, 8, 8);
}
/* mspel8 mc22: 11 rows of horizontal lowpass (starting one row above src) are
 * filtered vertically, entering at the second row, straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h + 8, stride, 8, 8);
}
1414
1415 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1416     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1417     int x;
1418     const int strength= ff_h263_loop_filter_strength[qscale];
1419
1420     for(x=0; x<8; x++){
1421         int d1, d2, ad1;
1422         int p0= src[x-2*stride];
1423         int p1= src[x-1*stride];
1424         int p2= src[x+0*stride];
1425         int p3= src[x+1*stride];
1426         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1427
1428         if     (d<-2*strength) d1= 0;
1429         else if(d<-  strength) d1=-2*strength - d;
1430         else if(d<   strength) d1= d;
1431         else if(d< 2*strength) d1= 2*strength - d;
1432         else                   d1= 0;
1433
1434         p1 += d1;
1435         p2 -= d1;
1436         if(p1&256) p1= ~(p1>>31);
1437         if(p2&256) p2= ~(p2>>31);
1438
1439         src[x-1*stride] = p1;
1440         src[x+0*stride] = p2;
1441
1442         ad1= FFABS(d1)>>1;
1443
1444         d2= av_clip((p0-p3)/4, -ad1, ad1);
1445
1446         src[x-2*stride] = p0 - d2;
1447         src[x+  stride] = p3 + d2;
1448     }
1449     }
1450 }
1451
1452 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1453     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1454     int y;
1455     const int strength= ff_h263_loop_filter_strength[qscale];
1456
1457     for(y=0; y<8; y++){
1458         int d1, d2, ad1;
1459         int p0= src[y*stride-2];
1460         int p1= src[y*stride-1];
1461         int p2= src[y*stride+0];
1462         int p3= src[y*stride+1];
1463         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1464
1465         if     (d<-2*strength) d1= 0;
1466         else if(d<-  strength) d1=-2*strength - d;
1467         else if(d<   strength) d1= d;
1468         else if(d< 2*strength) d1= 2*strength - d;
1469         else                   d1= 0;
1470
1471         p1 += d1;
1472         p2 -= d1;
1473         if(p1&256) p1= ~(p1>>31);
1474         if(p2&256) p2= ~(p2>>31);
1475
1476         src[y*stride-1] = p1;
1477         src[y*stride+0] = p2;
1478
1479         ad1= FFABS(d1)>>1;
1480
1481         d2= av_clip((p0-p3)/4, -ad1, ad1);
1482
1483         src[y*stride-2] = p0 - d2;
1484         src[y*stride+1] = p3 + d2;
1485     }
1486     }
1487 }
1488
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 *
 * A vertical pass fills an intermediate 8x8 buffer: rows 0 and 7 are taken
 * as 4*sample, rows 1..6 as above + 2*centre + below. A horizontal pass then
 * writes the block back: columns 0 and 7 are the intermediate value rounded
 * and divided by 4, columns 1..6 are left + 2*centre + right rounded and
 * divided by 16.
 *
 * @param src    top-left sample of the 8x8 block, modified in place
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vsum[64];
    int row, col;

    /* vertical pass: border rows are only scaled */
    for (col = 0; col < 8; col++) {
        vsum[col]       = 4 * src[col];
        vsum[7*8 + col] = 4 * src[7*stride + col];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *in  = src  + row*stride;
        int           *out = vsum + row*8;

        for (col = 0; col < 8; col++)
            out[col] = in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* horizontal pass: border columns are only normalized */
    for (row = 0; row < 8; row++) {
        const int *in  = vsum + row*8;
        uint8_t   *out = src  + row*stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
1515
/**
 * Sum of absolute differences between two 16-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return sum over all rows and the 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1543
/**
 * Sum of absolute differences between a 16-wide block and a reference that is
 * first combined horizontally: column k of pix1 is compared against
 * avg2(pix2[k], pix2[k+1]), so 17 reference samples are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1571
/**
 * Sum of absolute differences between a 16-wide block and a reference that is
 * first combined vertically: column k of pix1 is compared against
 * avg2 of pix2[k] and the sample one row below it, so h+1 reference rows
 * are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1601
/**
 * Sum of absolute differences of a 16-pixel-wide block against a reference
 * sampled half a pixel to the right and half a pixel downwards.
 *
 * Every reference sample is avg4() of a 2x2 neighbourhood, so 17 columns
 * and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1631
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1651
/**
 * Sum of absolute differences of an 8-pixel-wide block against a reference
 * sampled half a pixel to the right.
 *
 * Every reference sample is avg2() of a pixel and its right-hand neighbour,
 * so 9 columns of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1671
/**
 * Sum of absolute differences of an 8-pixel-wide block against a reference
 * sampled half a pixel downwards.
 *
 * Every reference sample is avg2() of a pixel and the pixel one row below
 * it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1693
/**
 * Sum of absolute differences of an 8-pixel-wide block against a reference
 * sampled half a pixel to the right and half a pixel downwards.
 *
 * Every reference sample is avg4() of a 2x2 neighbourhood, so 9 columns
 * and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1715
1716 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1717     MpegEncContext *c = v;
1718     int score1=0;
1719     int score2=0;
1720     int x,y;
1721
1722     for(y=0; y<h; y++){
1723         for(x=0; x<16; x++){
1724             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1725         }
1726         if(y+1<h){
1727             for(x=0; x<15; x++){
1728                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1729                              - s1[x+1] + s1[x+1+stride])
1730                         -FFABS(  s2[x  ] - s2[x  +stride]
1731                              - s2[x+1] + s2[x+1+stride]);
1732             }
1733         }
1734         s1+= stride;
1735         s2+= stride;
1736     }
1737
1738     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1739     else  return score1 + FFABS(score2)*8;
1740 }
1741
1742 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1743     MpegEncContext *c = v;
1744     int score1=0;
1745     int score2=0;
1746     int x,y;
1747
1748     for(y=0; y<h; y++){
1749         for(x=0; x<8; x++){
1750             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1751         }
1752         if(y+1<h){
1753             for(x=0; x<7; x++){
1754                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1755                              - s1[x+1] + s1[x+1+stride])
1756                         -FFABS(  s2[x  ] - s2[x  +stride]
1757                              - s2[x+1] + s2[x+1+stride]);
1758             }
1759         }
1760         s1+= stride;
1761         s2+= stride;
1762     }
1763
1764     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765     else  return score1 + FFABS(score2)*8;
1766 }
1767
1768 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1769     int i;
1770     unsigned int sum=0;
1771
1772     for(i=0; i<8*8; i++){
1773         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1774         int w= weight[i];
1775         b>>= RECON_SHIFT;
1776         assert(-512<b && b<512);
1777
1778         sum += (w*b)*(w*b)>>4;
1779     }
1780     return sum>>2;
1781 }
1782
1783 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1784     int i;
1785
1786     for(i=0; i<8*8; i++){
1787         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1788     }
1789 }
1790
1791 /**
1792  * permutes an 8x8 block.
1793  * @param block the block which will be permuted according to the given permutation vector
1794  * @param permutation the permutation vector
1795  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1796  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1797  *                  (inverse) permutated to scantable order!
1798  */
1799 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1800 {
1801     int i;
1802     DCTELEM temp[64];
1803
1804     if(last<=0) return;
1805     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1806
1807     for(i=0; i<=last; i++){
1808         const int j= scantable[i];
1809         temp[j]= block[j];
1810         block[j]=0;
1811     }
1812
1813     for(i=0; i<=last; i++){
1814         const int j= scantable[i];
1815         const int perm_j= permutation[j];
1816         block[perm_j]= temp[j];
1817     }
1818 }
1819
/**
 * Comparison function that ignores all of its arguments.
 *
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
1823
1824 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1825     int i;
1826
1827     memset(cmp, 0, sizeof(void*)*6);
1828
1829     for(i=0; i<6; i++){
1830         switch(type&0xFF){
1831         case FF_CMP_SAD:
1832             cmp[i]= c->sad[i];
1833             break;
1834         case FF_CMP_SATD:
1835             cmp[i]= c->hadamard8_diff[i];
1836             break;
1837         case FF_CMP_SSE:
1838             cmp[i]= c->sse[i];
1839             break;
1840         case FF_CMP_DCT:
1841             cmp[i]= c->dct_sad[i];
1842             break;
1843         case FF_CMP_DCT264:
1844             cmp[i]= c->dct264_sad[i];
1845             break;
1846         case FF_CMP_DCTMAX:
1847             cmp[i]= c->dct_max[i];
1848             break;
1849         case FF_CMP_PSNR:
1850             cmp[i]= c->quant_psnr[i];
1851             break;
1852         case FF_CMP_BIT:
1853             cmp[i]= c->bit[i];
1854             break;
1855         case FF_CMP_RD:
1856             cmp[i]= c->rd[i];
1857             break;
1858         case FF_CMP_VSAD:
1859             cmp[i]= c->vsad[i];
1860             break;
1861         case FF_CMP_VSSE:
1862             cmp[i]= c->vsse[i];
1863             break;
1864         case FF_CMP_ZERO:
1865             cmp[i]= zero_cmp;
1866             break;
1867         case FF_CMP_NSSE:
1868             cmp[i]= c->nsse[i];
1869             break;
1870 #if CONFIG_DWT
1871         case FF_CMP_W53:
1872             cmp[i]= c->w53[i];
1873             break;
1874         case FF_CMP_W97:
1875             cmp[i]= c->w97[i];
1876             break;
1877 #endif
1878         default:
1879             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1880         }
1881     }
1882 }
1883
1884 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1885     long i;
1886     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1887         long a = *(long*)(src+i);
1888         long b = *(long*)(dst+i);
1889         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1890     }
1891     for(; i<w; i++)
1892         dst[i+0] += src[i+0];
1893 }
1894
1895 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1896     long i;
1897     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1898         long a = *(long*)(src1+i);
1899         long b = *(long*)(src2+i);
1900         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1901     }
1902     for(; i<w; i++)
1903         dst[i] = src1[i]+src2[i];
1904 }
1905
1906 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1907     long i;
1908 #if !HAVE_FAST_UNALIGNED
1909     if((long)src2 & (sizeof(long)-1)){
1910         for(i=0; i+7<w; i+=8){
1911             dst[i+0] = src1[i+0]-src2[i+0];
1912             dst[i+1] = src1[i+1]-src2[i+1];
1913             dst[i+2] = src1[i+2]-src2[i+2];
1914             dst[i+3] = src1[i+3]-src2[i+3];
1915             dst[i+4] = src1[i+4]-src2[i+4];
1916             dst[i+5] = src1[i+5]-src2[i+5];
1917             dst[i+6] = src1[i+6]-src2[i+6];
1918             dst[i+7] = src1[i+7]-src2[i+7];
1919         }
1920     }else
1921 #endif
1922     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1923         long a = *(long*)(src1+i);
1924         long b = *(long*)(src2+i);
1925         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1926     }
1927     for(; i<w; i++)
1928         dst[i+0] = src1[i+0]-src2[i+0];
1929 }
1930
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For each position the prediction is mid_pred() of the running left value,
 * src1[i], and (left + src1[i] - left_top) masked to 8 bits. diff[i] is
 * added to it and the result, truncated to 8 bits, is stored in dst[i].
 *
 * @param dst      reconstructed output row
 * @param src1     second predictor input; each value also becomes the next
 *                 left_top
 * @param diff     residuals to add to the prediction
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial top-left value; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t up = src1[i];

        cur     = mid_pred(cur, up, (cur + up - up_left) & 0xFF) + diff[i];
        up_left = up;
        dst[i]  = cur;
    }

    *left     = cur;
    *left_top = up_left;
}
1947
/**
 * Compute median-prediction residuals for a row.
 *
 * For each position the prediction is mid_pred() of the running left value,
 * src1[i], and (left + src1[i] - left_top) masked to 8 bits. dst[i]
 * receives src2[i] minus that prediction, truncated to 8 bits.
 *
 * @param dst      residual output row
 * @param src1     second predictor input; each value also becomes the next
 *                 left_top
 * @param src2     row being predicted; each value becomes the next left
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 value read
 * @param left_top in: initial top-left value; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t up   = src1[i];
        const int     pred = mid_pred(cur, up, (cur + up - up_left) & 0xFF);

        up_left = up;
        cur     = src2[i];
        dst[i]  = cur - pred;
    }

    *left     = cur;
    *left_top = up_left;
}
1965
/**
 * Running-sum reconstruction of a row.
 *
 * Each source byte is added to the accumulator and the accumulator,
 * truncated to 8 bits, is stored in the output.
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos = 0;

    while (pos < w) {
        acc     += src[pos];
        dst[pos] = acc;
        pos++;
    }

    return acc;
}
1984
/* Byte offsets of the four channels inside one 32-bit pixel; the order
 * depends on HAVE_BIGENDIAN. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of a row of 4-byte pixels, one independent
 * accumulator per channel.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value (not truncated)
 * @param green same, for green
 * @param blue  same, for blue
 * @param alpha same, for alpha
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int r = *red;
    int g = *green;
    int b = *blue;
    int a = *alpha;
    int i;

    for (i = 0; i < w; i++, src += 4, dst += 4) {
        /* read the whole pixel before writing, as dst may equal src */
        b += src[B];
        g += src[G];
        r += src[R];
        a += src[A];

        dst[B] = b;
        dst[G] = g;
        dst[R] = r;
        dst[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
2025
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2 stores the sum of two inputs in o1 and their difference in o2.
 * BUTTERFLY1 does the same in place: x becomes x+y and y becomes x-y.
 * BUTTERFLYA yields |x+y| + |x-y| without storing anything.
 *
 * The statement macros are wrapped in do { } while (0) and parenthesise
 * their arguments, so they expand to exactly one statement and stay correct
 * inside an unbraced if/else and with compound argument expressions.
 * Invoke them followed by a semicolon.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

#define BUTTERFLY1(x,y) \
do {\
    const int bf_a= (x);\
    const int bf_b= (y);\
    (x)= bf_a+bf_b;\
    (y)= bf_a-bf_b;\
} while (0)

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2040
/**
 * Sum of the absolute values of the 8x8 butterfly transform of the
 * difference src - dst.
 *
 * The first loop forms the differences and runs three butterfly stages
 * along every row. The second loop runs two stages down every column and
 * folds the third stage into the absolute-value summation.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the differences are taken from
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8 * i;
        const uint8_t *sp = src + stride * i;
        const uint8_t *dp = dst + stride * i;

        BUTTERFLY2(row[0], row[1], sp[0] - dp[0], sp[1] - dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2] - dp[2], sp[3] - dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4] - dp[4], sp[5] - dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6] - dp[6], sp[7] - dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the last stage is evaluated by BUTTERFLYA while summing */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }
    return sum;
}
2085
/**
 * Sum of the absolute values of the 8x8 butterfly transform of a block,
 * with the mean term removed.
 *
 * Same structure as the difference variant, but the transform is applied
 * to the source pixels directly. After summation,
 * |temp[8*0] + temp[8*4]| is subtracted again.
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8 * i;
        const uint8_t *sp = src + stride * i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the last stage is evaluated by BUTTERFLYA while summing */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }

    /* remove the mean term */
    sum -= FFABS(temp[8 * 0] + temp[8 * 4]);

    return sum;
}
2133
2134 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2135     MpegEncContext * const s= (MpegEncContext *)c;
2136     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2137
2138     assert(h==8);
2139
2140     s->dsp.diff_pixels(temp, src1, src2, stride);
2141     s->dsp.fdct(temp);
2142     return s->dsp.sum_abs_dctelem(temp);
2143 }
2144
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform.
 *
 * The user defines SRC(n) to read input n and DST(n, v) to consume output n.
 * All eight inputs are read into locals before the first DST, so SRC and
 * DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D-based 2-D transform of the
 * difference of two 8x8 blocks.
 *
 * @param c      MpegEncContext providing the diff_pixels callback
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 * @return sum of absolute transform coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = c;
    DCTELEM coef[8][8];
    int total = 0;
    int k;

    enc->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) coef[k][x]
#define DST(x,v) coef[k][x]= v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform every column, accumulating |output| only */
#define SRC(x) coef[x][k]
#define DST(x,v) total += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
2197
2198 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2199     MpegEncContext * const s= (MpegEncContext *)c;
2200     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2201     int sum=0, i;
2202
2203     assert(h==8);
2204
2205     s->dsp.diff_pixels(temp, src1, src2, stride);
2206     s->dsp.fdct(temp);
2207
2208     for(i=0; i<64; i++)
2209         sum= FFMAX(sum, FFABS(temp[i]));
2210
2211     return sum;
2212 }
2213
2214 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2215     MpegEncContext * const s= (MpegEncContext *)c;
2216     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2217     DCTELEM * const bak = temp+64;
2218     int sum=0, i;
2219
2220     assert(h==8);
2221     s->mb_intra=0;
2222
2223     s->dsp.diff_pixels(temp, src1, src2, stride);
2224
2225     memcpy(bak, temp, 64*sizeof(DCTELEM));
2226
2227     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2228     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2229     ff_simple_idct(temp); //FIXME
2230
2231     for(i=0; i<64; i++)
2232         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2233
2234     return sum;
2235 }
2236
2237 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2238     MpegEncContext * const s= (MpegEncContext *)c;
2239     const uint8_t *scantable= s->intra_scantable.permutated;
2240     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2241     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2242     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2243     int i, last, run, bits, level, distortion, start_i;
2244     const int esc_length= s->ac_esc_length;
2245     uint8_t * length;
2246     uint8_t * last_length;
2247
2248     assert(h==8);
2249
2250     copy_block8(lsrc1, src1, 8, stride, 8);
2251     copy_block8(lsrc2, src2, 8, stride, 8);
2252
2253     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2254
2255     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2256
2257     bits=0;
2258
2259     if (s->mb_intra) {
2260         start_i = 1;
2261         length     = s->intra_ac_vlc_length;
2262         last_length= s->intra_ac_vlc_last_length;
2263         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2264     } else {
2265         start_i = 0;
2266         length     = s->inter_ac_vlc_length;
2267         last_length= s->inter_ac_vlc_last_length;
2268     }
2269
2270     if(last>=start_i){
2271         run=0;
2272         for(i=start_i; i<last; i++){
2273             int j= scantable[i];
2274             level= temp[j];
2275
2276             if(level){
2277                 level+=64;
2278                 if((level&(~127)) == 0){
2279                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2280                 }else
2281                     bits+= esc_length;
2282                 run=0;
2283             }else
2284                 run++;
2285         }
2286         i= scantable[last];
2287
2288         level= temp[i] + 64;
2289
2290         assert(level - 64);
2291
2292         if((level&(~127)) == 0){
2293             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2294         }else
2295             bits+= esc_length;
2296
2297     }
2298
2299     if(last>=0){
2300         if(s->mb_intra)
2301             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2302         else
2303             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2304     }
2305
2306     s->dsp.idct_add(lsrc2, 8, temp);
2307
2308     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2309
2310     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2311 }
2312
2313 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2314     MpegEncContext * const s= (MpegEncContext *)c;
2315     const uint8_t *scantable= s->intra_scantable.permutated;
2316     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2317     int i, last, run, bits, level, start_i;
2318     const int esc_length= s->ac_esc_length;
2319     uint8_t * length;
2320     uint8_t * last_length;
2321
2322     assert(h==8);
2323
2324     s->dsp.diff_pixels(temp, src1, src2, stride);
2325
2326     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2327
2328     bits=0;
2329
2330     if (s->mb_intra) {
2331         start_i = 1;
2332         length     = s->intra_ac_vlc_length;
2333         last_length= s->intra_ac_vlc_last_length;
2334         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2335     } else {
2336         start_i = 0;
2337         length     = s->inter_ac_vlc_length;
2338         last_length= s->inter_ac_vlc_last_length;
2339     }
2340
2341     if(last>=start_i){
2342         run=0;
2343         for(i=start_i; i<last; i++){
2344             int j= scantable[i];
2345             level= temp[j];
2346
2347             if(level){
2348                 level+=64;
2349                 if((level&(~127)) == 0){
2350                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2351                 }else
2352                     bits+= esc_length;
2353                 run=0;
2354             }else
2355                 run++;
2356         }
2357         i= scantable[last];
2358
2359         level= temp[i] + 64;
2360
2361         assert(level - 64);
2362
2363         if((level&(~127)) == 0){
2364             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2365         }else
2366             bits+= esc_length;
2367     }
2368
2369     return bits;
2370 }
2371
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide. Rows 0..h-2 are each compared with the following row. The c and
 * dummy parameters are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col + stride]);                                                       \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2389
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks.
 *
 * For every pixel in rows 0..h-2 the difference s1 - s2 is compared with
 * the same difference one row below, and the absolute values are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 * @return accumulated absolute vertical gradient of the difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(d);
        }
    }

    return total;
}
2404
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide. Rows 0..h-2 are each compared with the following row. The c and
 * dummy parameters are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col + stride]);                                                          \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2423
/**
 * Vertical SSE of the difference of two 16-pixel-wide blocks.
 *
 * For every pixel in rows 0..h-2 the difference s1 - s2 is compared with
 * the same difference one row below, and the squares are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 * @return accumulated squared vertical gradient of the difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += d * d;
        }
    }

    return total;
}
2438
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }
    return total;
}
2447
/*
 * Instantiate the 16-wide counterpart of each 8x8 comparison function.
 * WRAPPER8_16_SQ is defined outside this file section; going by its
 * arguments it presumably defines the second name in terms of the first
 * (TODO confirm against the macro definition).
 * The dct264 pair is only built with CONFIG_GPL, matching the guard around
 * dct264_sad8x8_c.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2458
/**
 * Element-wise product of two float vectors: dst[k] = src0[k] * src1[k].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2464
/**
 * Element-wise product of a vector with a second vector read backwards:
 * dst[k] = src0[k] * src1[len - 1 - k].
 *
 * @param dst  output vector
 * @param src0 first factor, read front to back
 * @param src1 second factor, read back to front
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const float *rev = src1 + (len - 1);
    int k;

    for (k = 0; k < len; k++, rev--)
        dst[k] = src0[k] * *rev;
}
2471
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements processed
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2477
/**
 * Windowed overlap of two len-element inputs into a 2*len-element output.
 *
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 *
 * @param dst  output, 2*len elements are written
 * @param src0 first input, len elements read front to back
 * @param src1 second input, len elements read back to front
 * @param win  window, 2*len elements are read
 * @param len  half of the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        /* load everything first so the two stores cannot disturb the inputs */
        const float a    = src0[lo];
        const float b    = src1[hi - len];
        const float w_lo = win[lo];
        const float w_hi = win[hi];

        dst[lo] = a*w_hi - b*w_lo;
        dst[hi] = a*w_lo + b*w_hi;
    }
}
2494
/**
 * Scale a float vector by a constant: dst[i] = src[i] * mul.
 *
 * @param dst output vector
 * @param src input vector
 * @param mul scale factor
 * @param len number of elements processed
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int left;

    for (left = len; left > 0; left--)
        *dst++ = *src++ * mul;
}
2502
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 * Each entry of sv points at two floats; entry n covers dst[2n] and
 * dst[2n+1]: dst[i] = src[i] * sv[n][i-2n] * mul.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 2-element float vectors
 * @param mul scalar applied to every product
 * @param len number of output elements; handled two at a time
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *pair = sv[pos >> 1];

        dst[pos    ] = src[pos    ] * pair[0] * mul;
        dst[pos + 1] = src[pos + 1] * pair[1] * mul;
    }
}
2512
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 * Each entry of sv points at four floats; entry n covers dst[4n..4n+3]:
 * dst[i] = src[i] * sv[n][i-4n] * mul.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 4-element float vectors
 * @param mul scalar applied to every product
 * @param len number of output elements; handled four at a time
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *quad = sv[pos >> 2];

        dst[pos    ] = src[pos    ] * quad[0] * mul;
        dst[pos + 1] = src[pos + 1] * quad[1] * mul;
        dst[pos + 2] = src[pos + 2] * quad[2] * mul;
        dst[pos + 3] = src[pos + 3] * quad[3] * mul;
    }
}
2524
/**
 * Expand a sequence of 2-element vectors into dst, scaled by mul.
 * Entry n of sv supplies dst[2n] and dst[2n+1].
 *
 * @param dst output vector
 * @param sv  array of pointers to 2-element float vectors
 * @param mul scale factor
 * @param len number of output elements; handled two at a time
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *pair = sv[pos >> 1];

        dst[pos    ] = pair[0] * mul;
        dst[pos + 1] = pair[1] * mul;
    }
}
2534
/**
 * Expand a sequence of 4-element vectors into dst, scaled by mul.
 * Entry n of sv supplies dst[4n..4n+3].
 *
 * @param dst output vector
 * @param sv  array of pointers to 4-element float vectors
 * @param mul scale factor
 * @param len number of output elements; handled four at a time
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *quad = sv[pos >> 2];

        dst[pos    ] = quad[0] * mul;
        dst[pos + 1] = quad[1] * mul;
        dst[pos + 2] = quad[2] * mul;
        dst[pos + 3] = quad[3] * mul;
    }
}
2546
/**
 * In-place butterfly on two float vectors:
 * v1[i] becomes v1[i] + v2[i], v2[i] becomes the old v1[i] - v2[i].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements processed
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2557
/**
 * Dot product of two float vectors, accumulated in single precision
 * from index 0 upwards.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[i] * v2[i]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2568
/**
 * Clip one float that is given as its raw 32-bit pattern.
 *
 * Works on the unsigned bit patterns only; this assumes 32-bit floats whose
 * sign lives in bit 31 and whose remaining bits order by magnitude (as in
 * IEEE-754 single precision), with a negative lower and positive upper bound.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* sign bit set and magnitude beyond the lower bound */
    if(a > mini) return mini;
    /* flipping the sign bit lifts non-negative values above negative ones */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max, using
 * integer comparisons on the raw float bits.
 *
 * The bit patterns are moved with memcpy() instead of reading the float
 * objects through uint32_t pointers, which would break the C aliasing rules.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements; processed in groups of 8, so a len that is
 *            not a multiple of 8 touches elements up to the next multiple
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        for(j=0; j<8; j++) {
            uint32_t bits;

            memcpy(&bits, &src[i + j], sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(&dst[i + j], &bits, sizeof(bits));
        }
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds straddle zero the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; a len that is not a multiple of 8 touches
 *            elements up to the next multiple
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for(base = 0; base < len; base += 8) {
        for(k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2613
/**
 * Dot product of two int16_t vectors where every individual product is
 * right-shifted before being accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each product
 * @return sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int sum = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        sum += (v1[k] * v2[k]) >> shift;

    return sum;
}
2623
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update
 * v1[i] += mul * v3[i]. The product for element i uses the value of v1[i]
 * from before its update.
 *
 * @param v1    first vector, modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier for v3
 * @return sum of old v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int sum = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        sum   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return sum;
}
2633
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient i is used
 * for both sample i and sample len-1-i. Each product is rounded by adding
 * 1<<14 and shifted right by 15.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window first half of the window
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const unsigned int half = len >> 1;
    unsigned int head, tail;

    for (head = 0, tail = len - 1; head < half; head++, tail--) {
        const int16_t w = window[head];

        output[head] = (MUL16(input[head], w) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], w) + (1 << 14)) >> 15;
    }
}
2646
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are handled in groups of 8 and the first group is processed
 * before len is tested, so len is expected to be a non-zero multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2662
/* Fixed-point cosine table for the WMV2 IDCT: Wk = 2048*sqrt(2)*cos(k*pi/16),
 * except W0 which is plain 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional WMV2 inverse DCT over 8 consecutive coefficients,
 * computed in place. Results are rounded and scaled down by 8 bits.
 *
 * @param b the 8 coefficients of one row
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: all inputs are consumed here, before anything is written back */
    const int odd1  = W1*b[1]+W7*b[7];
    const int odd7  = W7*b[1]-W1*b[7];
    const int odd5  = W5*b[5]+W3*b[3];
    const int odd3  = W3*b[5]-W5*b[3];
    const int even2 = W2*b[2]+W6*b[6];
    const int even6 = W6*b[2]-W2*b[6];
    const int even0 = W0*b[0]+W0*b[4];
    const int even4 = W0*b[0]-W0*b[4];

    /* step 2: 181/256 approximates 1/sqrt(2) */
    const int rot1 = (181*(odd1-odd5+odd7-odd3)+128)>>8;
    const int rot2 = (181*(odd1-odd5-odd7+odd3)+128)>>8;

    /* step 3: combine even and odd parts, round, drop 8 fractional bits */
    const int outer_sum  = even0+even2;
    const int outer_diff = even0-even2;
    const int inner_sum  = even4+even6;
    const int inner_diff = even4-even6;

    b[0] = (outer_sum +odd1+odd5 + (1<<7))>>8;
    b[1] = (inner_sum +rot1      + (1<<7))>>8;
    b[2] = (inner_diff+rot2      + (1<<7))>>8;
    b[3] = (outer_diff+odd7+odd3 + (1<<7))>>8;
    b[4] = (outer_diff-odd7-odd3 + (1<<7))>>8;
    b[5] = (inner_diff-rot2      + (1<<7))>>8;
    b[6] = (inner_sum -rot1      + (1<<7))>>8;
    b[7] = (outer_sum -odd1-odd5 + (1<<7))>>8;
}
2698 static void wmv2_idct_col(short * b)
2699 {
2700     int s1,s2;
2701     int a0,a1,a2,a3,a4,a5,a6,a7;
2702     /*step 1, with extended precision*/
2703     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2704     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2705     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2706     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2707     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2708     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2709     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2710     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2711     /*step 2*/
2712     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2713     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2714     /*step 3*/
2715     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2716     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2717     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2718     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2719
2720     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2721     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2722     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2723     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2724 }
/**
 * In-place 8x8 WMV2 inverse DCT: the row transform is applied to each of
 * the 8 rows first, then the column transform to each of the 8 columns.
 *
 * @param block 64 coefficients, 8 per row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2735 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2736  converted */
2737 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2738 {
2739     ff_wmv2_idct_c(block);
2740     ff_put_pixels_clamped_c(block, dest, line_size);
2741 }
2742 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2743 {
2744     ff_wmv2_idct_c(block);
2745     ff_add_pixels_clamped_c(block, dest, line_size);
2746 }
2747 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2748 {
2749     j_rev_dct (block);
2750     ff_put_pixels_clamped_c(block, dest, line_size);
2751 }
2752 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2753 {
2754     j_rev_dct (block);
2755     ff_add_pixels_clamped_c(block, dest, line_size);
2756 }
2757
2758 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2759 {
2760     j_rev_dct4 (block);
2761     put_pixels_clamped4_c(block, dest, line_size);
2762 }
2763 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2764 {
2765     j_rev_dct4 (block);
2766     add_pixels_clamped4_c(block, dest, line_size);
2767 }
2768
2769 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2770 {
2771     j_rev_dct2 (block);
2772     put_pixels_clamped2_c(block, dest, line_size);
2773 }
2774 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2775 {
2776     j_rev_dct2 (block);
2777     add_pixels_clamped2_c(block, dest, line_size);
2778 }
2779
2780 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2781 {
2782     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2783
2784     dest[0] = cm[(block[0] + 4)>>3];
2785 }
2786 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2787 {
2788     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2789
2790     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2791 }
2792
2793 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2794
2795 /* init static data */
2796 av_cold void dsputil_static_init(void)
2797 {
2798     int i;
2799
2800     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2801     for(i=0;i<MAX_NEG_CROP;i++) {
2802         ff_cropTbl[i] = 0;
2803         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2804     }
2805
2806     for(i=0;i<512;i++) {
2807         ff_squareTbl[i] = (i - 256) * (i - 256);
2808     }
2809
2810     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2811 }
2812
2813 int ff_check_alignment(void){
2814     static int did_fail=0;
2815     LOCAL_ALIGNED_16(int, aligned);
2816
2817     if((intptr_t)&aligned & 15){
2818         if(!did_fail){
2819 #if HAVE_MMX || HAVE_ALTIVEC
2820             av_log(NULL, AV_LOG_ERROR,
2821                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2822                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2823                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2824                 "Do not report crashes to Libav developers.\n");
2825 #endif
2826             did_fail=1;
2827         }
2828         return -1;
2829     }
2830     return 0;
2831 }
2832
2833 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2834 {
2835     int i;
2836
2837     ff_check_alignment();
2838
2839 #if CONFIG_ENCODERS
2840     if(avctx->dct_algo==FF_DCT_FASTINT) {
2841         c->fdct = fdct_ifast;
2842         c->fdct248 = fdct_ifast248;
2843     }
2844     else if(avctx->dct_algo==FF_DCT_FAAN) {
2845         c->fdct = ff_faandct;
2846         c->fdct248 = ff_faandct248;
2847     }
2848     else {
2849         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2850         c->fdct248 = ff_fdct248_islow;
2851     }
2852 #endif //CONFIG_ENCODERS
2853
2854     if(avctx->lowres==1){
2855         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2856             c->idct_put= ff_jref_idct4_put;
2857             c->idct_add= ff_jref_idct4_add;
2858         }else{
2859             if (avctx->codec_id != CODEC_ID_H264) {
2860                 c->idct_put= ff_h264_lowres_idct_put_8_c;
2861                 c->idct_add= ff_h264_lowres_idct_add_8_c;
2862             } else {
2863                 switch (avctx->bits_per_raw_sample) {
2864                     case 9:
2865                         c->idct_put= ff_h264_lowres_idct_put_9_c;
2866                         c->idct_add= ff_h264_lowres_idct_add_9_c;
2867                         break;
2868                     case 10:
2869                         c->idct_put= ff_h264_lowres_idct_put_10_c;
2870                         c->idct_add= ff_h264_lowres_idct_add_10_c;
2871                         break;
2872                     default:
2873                         c->idct_put= ff_h264_lowres_idct_put_8_c;
2874                         c->idct_add= ff_h264_lowres_idct_add_8_c;
2875                 }
2876             }
2877         }
2878         c->idct    = j_rev_dct4;
2879         c->idct_permutation_type= FF_NO_IDCT_PERM;
2880     }else if(avctx->lowres==2){
2881         c->idct_put= ff_jref_idct2_put;
2882         c->idct_add= ff_jref_idct2_add;
2883         c->idct    = j_rev_dct2;
2884         c->idct_permutation_type= FF_NO_IDCT_PERM;
2885     }else if(avctx->lowres==3){
2886         c->idct_put= ff_jref_idct1_put;
2887         c->idct_add= ff_jref_idct1_add;
2888         c->idct    = j_rev_dct1;
2889         c->idct_permutation_type= FF_NO_IDCT_PERM;
2890     }else{
2891         if(avctx->idct_algo==FF_IDCT_INT){
2892             c->idct_put= ff_jref_idct_put;
2893             c->idct_add= ff_jref_idct_add;
2894             c->idct    = j_rev_dct;
2895             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2896         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2897                 avctx->idct_algo==FF_IDCT_VP3){
2898             c->idct_put= ff_vp3_idct_put_c;
2899             c->idct_add= ff_vp3_idct_add_c;
2900             c->idct    = ff_vp3_idct_c;
2901             c->idct_permutation_type= FF_NO_IDCT_PERM;
2902         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2903             c->idct_put= ff_wmv2_idct_put_c;
2904             c->idct_add= ff_wmv2_idct_add_c;
2905             c->idct    = ff_wmv2_idct_c;
2906             c->idct_permutation_type= FF_NO_IDCT_PERM;
2907         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2908             c->idct_put= ff_faanidct_put;
2909             c->idct_add= ff_faanidct_add;
2910             c->idct    = ff_faanidct;
2911             c->idct_permutation_type= FF_NO_IDCT_PERM;
2912         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2913             c->idct_put= ff_ea_idct_put_c;
2914             c->idct_permutation_type= FF_NO_IDCT_PERM;
2915         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2916             c->idct     = ff_bink_idct_c;
2917             c->idct_add = ff_bink_idct_add_c;
2918             c->idct_put = ff_bink_idct_put_c;
2919             c->idct_permutation_type = FF_NO_IDCT_PERM;
2920         }else{ //accurate/default
2921             c->idct_put= ff_simple_idct_put;
2922             c->idct_add= ff_simple_idct_add;
2923             c->idct    = ff_simple_idct;
2924             c->idct_permutation_type= FF_NO_IDCT_PERM;
2925         }
2926     }
2927
2928     c->get_pixels = get_pixels_c;
2929     c->diff_pixels = diff_pixels_c;
2930     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2931     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2932     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2933     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2934     c->sum_abs_dctelem = sum_abs_dctelem_c;
2935     c->gmc1 = gmc1_c;
2936     c->gmc = ff_gmc_c;
2937     c->pix_sum = pix_sum_c;
2938     c->pix_norm1 = pix_norm1_c;
2939
2940     c->fill_block_tab[0] = fill_block16_c;
2941     c->fill_block_tab[1] = fill_block8_c;
2942     c->scale_block = scale_block_c;
2943
2944     /* TODO [0] 16  [1] 8 */
2945     c->pix_abs[0][0] = pix_abs16_c;
2946     c->pix_abs[0][1] = pix_abs16_x2_c;
2947     c->pix_abs[0][2] = pix_abs16_y2_c;
2948     c->pix_abs[0][3] = pix_abs16_xy2_c;
2949     c->pix_abs[1][0] = pix_abs8_c;
2950     c->pix_abs[1][1] = pix_abs8_x2_c;
2951     c->pix_abs[1][2] = pix_abs8_y2_c;
2952     c->pix_abs[1][3] = pix_abs8_xy2_c;
2953
2954     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2963
2964     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2973
2974 #define dspfunc(PFX, IDX, NUM) \
2975     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2991
2992     dspfunc(put_qpel, 0, 16);
2993     dspfunc(put_no_rnd_qpel, 0, 16);
2994
2995     dspfunc(avg_qpel, 0, 16);
2996     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2997
2998     dspfunc(put_qpel, 1, 8);
2999     dspfunc(put_no_rnd_qpel, 1, 8);
3000
3001     dspfunc(avg_qpel, 1, 8);
3002     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3003
3004 #undef dspfunc
3005
3006 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3007     ff_mlp_init(c, avctx);
3008 #endif
3009 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3010     ff_intrax8dsp_init(c,avctx);
3011 #endif
3012 #if CONFIG_RV30_DECODER
3013     ff_rv30dsp_init(c,avctx);
3014 #endif
3015 #if CONFIG_RV40_DECODER
3016     ff_rv40dsp_init(c,avctx);
3017     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3018     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3019     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3020     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3021 #endif
3022
3023     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3024     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3025     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3026     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3027     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3028     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3029     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3030     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3031
3032 #define SET_CMP_FUNC(name) \
3033     c->name[0]= name ## 16_c;\
3034     c->name[1]= name ## 8x8_c;
3035
3036     SET_CMP_FUNC(hadamard8_diff)
3037     c->hadamard8_diff[4]= hadamard8_intra16_c;
3038     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3039     SET_CMP_FUNC(dct_sad)
3040     SET_CMP_FUNC(dct_max)
3041 #if CONFIG_GPL
3042     SET_CMP_FUNC(dct264_sad)
3043 #endif
3044     c->sad[0]= pix_abs16_c;
3045     c->sad[1]= pix_abs8_c;
3046     c->sse[0]= sse16_c;
3047     c->sse[1]= sse8_c;
3048     c->sse[2]= sse4_c;
3049     SET_CMP_FUNC(quant_psnr)
3050     SET_CMP_FUNC(rd)
3051     SET_CMP_FUNC(bit)
3052     c->vsad[0]= vsad16_c;
3053     c->vsad[4]= vsad_intra16_c;
3054     c->vsad[5]= vsad_intra8_c;
3055     c->vsse[0]= vsse16_c;
3056     c->vsse[4]= vsse_intra16_c;
3057     c->vsse[5]= vsse_intra8_c;
3058     c->nsse[0]= nsse16_c;
3059     c->nsse[1]= nsse8_c;
3060 #if CONFIG_DWT
3061     ff_dsputil_init_dwt(c);
3062 #endif
3063
3064     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3065
3066     c->add_bytes= add_bytes_c;
3067     c->add_bytes_l2= add_bytes_l2_c;
3068     c->diff_bytes= diff_bytes_c;
3069     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3070     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3071     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3072     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3073     c->bswap_buf= bswap_buf;
3074     c->bswap16_buf = bswap16_buf;
3075 #if CONFIG_PNG_DECODER
3076     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3077 #endif
3078
3079     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3080         c->h263_h_loop_filter= h263_h_loop_filter_c;
3081         c->h263_v_loop_filter= h263_v_loop_filter_c;
3082     }
3083
3084     if (CONFIG_VP3_DECODER) {
3085         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3086         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3087         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3088     }
3089
3090     c->h261_loop_filter= h261_loop_filter_c;
3091
3092     c->try_8x8basis= try_8x8basis_c;
3093     c->add_8x8basis= add_8x8basis_c;
3094
3095 #if CONFIG_VORBIS_DECODER
3096     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3097 #endif
3098 #if CONFIG_AC3_DECODER
3099     c->ac3_downmix = ff_ac3_downmix_c;
3100 #endif
3101     c->vector_fmul = vector_fmul_c;
3102     c->vector_fmul_reverse = vector_fmul_reverse_c;
3103     c->vector_fmul_add = vector_fmul_add_c;
3104     c->vector_fmul_window = vector_fmul_window_c;
3105     c->vector_clipf = vector_clipf_c;
3106     c->scalarproduct_int16 = scalarproduct_int16_c;
3107     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3108     c->apply_window_int16 = apply_window_int16_c;
3109     c->vector_clip_int32 = vector_clip_int32_c;
3110     c->scalarproduct_float = scalarproduct_float_c;
3111     c->butterflies_float = butterflies_float_c;
3112     c->vector_fmul_scalar = vector_fmul_scalar_c;
3113
3114     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3115     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3116
3117     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3118     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3119
3120     c->shrink[0]= av_image_copy_plane;
3121     c->shrink[1]= ff_shrink22;
3122     c->shrink[2]= ff_shrink44;
3123     c->shrink[3]= ff_shrink88;
3124
3125     c->prefetch= just_return;
3126
3127     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3128     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3129
3130 #undef FUNC
3131 #undef FUNCC
3132 #define FUNC(f, depth) f ## _ ## depth
3133 #define FUNCC(f, depth) f ## _ ## depth ## _c
3134
3135 #define dspfunc1(PFX, IDX, NUM, depth)\
3136     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3137     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3138     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3139     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3140
3141 #define dspfunc2(PFX, IDX, NUM, depth)\
3142     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3143     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3144     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3145     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3146     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3147     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3148     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3149     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3150     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3151     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3152     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3153     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3154     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3155     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3156     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3157     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3158
3159
3160 #define BIT_DEPTH_FUNCS(depth)\
3161     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3162     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3163     c->clear_block                   = FUNCC(clear_block           , depth);\
3164     c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
3165     c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
3166     c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
3167     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3168     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3169 \
3170     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3171     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3172     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3173     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3174     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3175     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3176 \
3177     dspfunc1(put       , 0, 16, depth);\
3178     dspfunc1(put       , 1,  8, depth);\
3179     dspfunc1(put       , 2,  4, depth);\
3180     dspfunc1(put       , 3,  2, depth);\
3181     dspfunc1(put_no_rnd, 0, 16, depth);\
3182     dspfunc1(put_no_rnd, 1,  8, depth);\
3183     dspfunc1(avg       , 0, 16, depth);\
3184     dspfunc1(avg       , 1,  8, depth);\
3185     dspfunc1(avg       , 2,  4, depth);\
3186     dspfunc1(avg       , 3,  2, depth);\
3187     dspfunc1(avg_no_rnd, 0, 16, depth);\
3188     dspfunc1(avg_no_rnd, 1,  8, depth);\
3189 \
3190     dspfunc2(put_h264_qpel, 0, 16, depth);\
3191     dspfunc2(put_h264_qpel, 1,  8, depth);\
3192     dspfunc2(put_h264_qpel, 2,  4, depth);\
3193     dspfunc2(put_h264_qpel, 3,  2, depth);\
3194     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3195     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3196     dspfunc2(avg_h264_qpel, 2,  4, depth);
3197
3198     if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3199         BIT_DEPTH_FUNCS(8)
3200     } else {
3201         switch (avctx->bits_per_raw_sample) {
3202             case 9:
3203                 BIT_DEPTH_FUNCS(9)
3204                 break;
3205             case 10:
3206                 BIT_DEPTH_FUNCS(10)
3207                 break;
3208             default:
3209                 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3210                 BIT_DEPTH_FUNCS(8)
3211                 break;
3212         }
3213     }
3214
3215
3216     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3217     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3218     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3219     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3220     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3221     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3222     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3223     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3224     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3225
3226     for(i=0; i<64; i++){
3227         if(!c->put_2tap_qpel_pixels_tab[0][i])
3228             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3229         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3230             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3231     }
3232
3233     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3234     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3235     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3236     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3237
3238     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3239     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3240     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3241     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3242
3243     switch(c->idct_permutation_type){
3244     case FF_NO_IDCT_PERM:
3245         for(i=0; i<64; i++)
3246             c->idct_permutation[i]= i;
3247         break;
3248     case FF_LIBMPEG2_IDCT_PERM:
3249         for(i=0; i<64; i++)
3250             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3251         break;
3252     case FF_SIMPLE_IDCT_PERM:
3253         for(i=0; i<64; i++)
3254             c->idct_permutation[i]= simple_mmx_permutation[i];
3255         break;
3256     case FF_TRANSPOSE_IDCT_PERM:
3257         for(i=0; i<64; i++)
3258             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3259         break;
3260     case FF_PARTTRANS_IDCT_PERM:
3261         for(i=0; i<64; i++)
3262             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3263         break;
3264     case FF_SSE2_IDCT_PERM:
3265         for(i=0; i<64; i++)
3266             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3267         break;
3268     default:
3269         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3270     }
3271 }
3272