3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Global lookup tables shared by the pixel helpers below.
 * ff_cropTbl is indexed as cm[MAX_NEG_CROP + v] to clamp v into 0..255;
 * ff_squareTbl is indexed as sq[256 + d] to square a signed byte difference.
 * Both are only zero-initialized here — presumably populated at init time
 * elsewhere in the library; confirm before relying on their contents. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 == 0x0101...01 at the native word width, so multiplying by a
 * byte constant replicates that byte into every byte lane of the word. */
58 #define pb_7f (~0UL/255 * 0x7f)
59 #define pb_80 (~0UL/255 * 0x80)
/* Standard zigzag scan order: maps scan position -> raster coefficient
 * index, low-frequency coefficients first. */
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Storage only — contents presumably computed at init elsewhere; not
 * visible in this view. */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate scan pattern, horizontal-priority variant. */
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate scan pattern, vertical-priority variant. */
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
110 /* Input permutation for the simple_idct_mmx */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation for the SSE2 IDCT: interleaves rows 0-3 with rows 4-7. */
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: st->permutated[] is the scan order remapped
 * through `permutation` (for IDCTs that expect permuted coefficient
 * order). NOTE(review): the loop headers and several statements are
 * elided in this view — verify details against the full source. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
/* first pass (header elided): remap each scan position through the
 * IDCT coefficient permutation */
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
/* second pass (elided): appears to record a running "end" position per
 * scan index into raster_end[] — confirm semantics in the full source */
142 j = st->permutated[i];
144 st->raster_end[i]= end;
/* Sum of all pixel values in a 16x16 block: 16 rows, inner loop steps 8
 * pixels at a time. Accumulation statements are elided in this view. */
148 static int pix_sum_c(uint8_t * pix, int line_size)
153 for (i = 0; i < 16; i++) {
154 for (j = 0; j < 16; j += 8) {
/* advance to the next row (16 pixels were consumed per row) */
165 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block via the sq[] LUT.
 * Two paths: when long is wider than 32 bits (LONG_MAX test) each group
 * of 8 pixels is read as one 64-bit load, otherwise as two 32-bit loads.
 * NOTE(review): the *(uint64_t*)pix / *(uint32_t*)pix casts assume the
 * platform tolerates such loads — alignment guarantees are established
 * elsewhere, confirm before reusing. */
170 static int pix_norm1_c(uint8_t * pix, int line_size)
173 uint32_t *sq = ff_squareTbl + 256;
176 for (i = 0; i < 16; i++) {
177 for (j = 0; j < 16; j += 8) {
188 #if LONG_MAX > 2147483647
189 register uint64_t x=*(uint64_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 s += sq[(x>>32)&0xff];
195 s += sq[(x>>40)&0xff];
196 s += sq[(x>>48)&0xff];
197 s += sq[(x>>56)&0xff];
/* 32-bit-long fallback (the #else and low-byte adds are elided here) */
199 register uint32_t x=*(uint32_t*)pix;
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204 x=*(uint32_t*)(pix+4);
206 s += sq[(x>>8)&0xff];
207 s += sq[(x>>16)&0xff];
208 s += sq[(x>>24)&0xff];
/* step to the next row */
213 pix += line_size - 16;
/* Byte-swap an array of w 32-bit words; the main loop is unrolled 8-wide,
 * and a tail loop (header elided in this view) handles the remainder. */
218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
221 for(i=0; i+8<=w; i+=8){
222 dst[i+0]= av_bswap32(src[i+0]);
223 dst[i+1]= av_bswap32(src[i+1]);
224 dst[i+2]= av_bswap32(src[i+2]);
225 dst[i+3]= av_bswap32(src[i+3]);
226 dst[i+4]= av_bswap32(src[i+4]);
227 dst[i+5]= av_bswap32(src[i+5]);
228 dst[i+6]= av_bswap32(src[i+6]);
229 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining 0..7 words, one per iteration */
232 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values (loop header elided in this view). */
236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
239 *dst++ = av_bswap16(*src++);
/* sse{4,8,16}_c: sum of squared differences between two pixel blocks of
 * width 4/8/16 over h rows, using sq[256 + d] to square a signed
 * difference. The unused `void *v` matches the comparison-function
 * pointer signature used by the library — presumably the codec context;
 * confirm at the call sites. Row-advance statements and the accumulator
 * declarations are elided in this view. */
242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
/* 8-wide variant of the same SSD loop */
259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
262 uint32_t *sq = ff_squareTbl + 256;
265 for (i = 0; i < h; i++) {
266 s += sq[pix1[0] - pix2[0]];
267 s += sq[pix1[1] - pix2[1]];
268 s += sq[pix1[2] - pix2[2]];
269 s += sq[pix1[3] - pix2[3]];
270 s += sq[pix1[4] - pix2[4]];
271 s += sq[pix1[5] - pix2[5]];
272 s += sq[pix1[6] - pix2[6]];
273 s += sq[pix1[7] - pix2[7]];
/* 16-wide variant of the same SSD loop */
280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
283 uint32_t *sq = ff_squareTbl + 256;
286 for (i = 0; i < h; i++) {
287 s += sq[pix1[ 0] - pix2[ 0]];
288 s += sq[pix1[ 1] - pix2[ 1]];
289 s += sq[pix1[ 2] - pix2[ 2]];
290 s += sq[pix1[ 3] - pix2[ 3]];
291 s += sq[pix1[ 4] - pix2[ 4]];
292 s += sq[pix1[ 5] - pix2[ 5]];
293 s += sq[pix1[ 6] - pix2[ 6]];
294 s += sq[pix1[ 7] - pix2[ 7]];
295 s += sq[pix1[ 8] - pix2[ 8]];
296 s += sq[pix1[ 9] - pix2[ 9]];
297 s += sq[pix1[10] - pix2[10]];
298 s += sq[pix1[11] - pix2[11]];
299 s += sq[pix1[12] - pix2[12]];
300 s += sq[pix1[13] - pix2[13]];
301 s += sq[pix1[14] - pix2[14]];
302 s += sq[pix1[15] - pix2[15]];
/* Copy one 8-pixel row of an 8x8 block from unsigned bytes into DCTELEMs;
 * the surrounding 8-row loop and pointer advances are elided in this view. */
310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
314 /* read the pixels */
316 block[0] = pixels[0];
317 block[1] = pixels[1];
318 block[2] = pixels[2];
319 block[3] = pixels[3];
320 block[4] = pixels[4];
321 block[5] = pixels[5];
322 block[6] = pixels[6];
323 block[7] = pixels[7];
/* Per-pixel difference s1 - s2 of an 8x8 block into DCTELEMs (row loop
 * and pointer stepping elided in this view). */
329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
330 const uint8_t *s2, int stride){
333 /* read the pixels */
335 block[0] = s1[0] - s2[0];
336 block[1] = s1[1] - s2[1];
337 block[2] = s1[2] - s2[2];
338 block[3] = s1[3] - s2[3];
339 block[4] = s1[4] - s2[4];
340 block[5] = s1[5] - s2[5];
341 block[6] = s1[6] - s2[6];
342 block[7] = s1[7] - s2[7];
/* Store an 8-wide row of DCTELEMs as pixels, clamped to 0..255 via the
 * crop table (cm[MAX_NEG_CROP + v]). The 8-row loop and pointer advances
 * are elided in this view. */
350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
356 /* read the pixels */
358 pixels[0] = cm[block[0]];
359 pixels[1] = cm[block[1]];
360 pixels[2] = cm[block[2]];
361 pixels[3] = cm[block[3]];
362 pixels[4] = cm[block[4]];
363 pixels[5] = cm[block[5]];
364 pixels[6] = cm[block[6]];
365 pixels[7] = cm[block[7]];
/* 4-wide variant of the clamped store */
372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
376 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
378 /* read the pixels */
380 pixels[0] = cm[block[0]];
381 pixels[1] = cm[block[1]];
382 pixels[2] = cm[block[2]];
383 pixels[3] = cm[block[3]];
/* 2-wide variant of the clamped store */
390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
394 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
396 /* read the pixels */
398 pixels[0] = cm[block[0]];
399 pixels[1] = cm[block[1]];
/* Store signed DCTELEMs as unsigned pixels: clamp to -128..127 (the
 * clamp-low branch is elided in this view), then bias by +128. */
406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
407 uint8_t *restrict pixels,
412 for (i = 0; i < 8; i++) {
413 for (j = 0; j < 8; j++) {
416 else if (*block > 127)
419 *pixels = (uint8_t)(*block + 128);
/* advance to the next row */
423 pixels += (line_size - 8);
/* Unclamped store: DCTELEM values are assumed already in range —
 * truncation semantics otherwise; row loop elided in this view. */
427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
432 /* read the pixels */
434 pixels[0] = block[0];
435 pixels[1] = block[1];
436 pixels[2] = block[2];
437 pixels[3] = block[3];
438 pixels[4] = block[4];
439 pixels[5] = block[5];
440 pixels[6] = block[6];
441 pixels[7] = block[7];
/* Add an 8-wide row of DCTELEM residuals onto existing pixels, clamping
 * the result to 0..255 via the crop table. The 8-row loop and pointer
 * advances are elided in this view. */
448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
452 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
454 /* read the pixels */
456 pixels[0] = cm[pixels[0] + block[0]];
457 pixels[1] = cm[pixels[1] + block[1]];
458 pixels[2] = cm[pixels[2] + block[2]];
459 pixels[3] = cm[pixels[3] + block[3]];
460 pixels[4] = cm[pixels[4] + block[4]];
461 pixels[5] = cm[pixels[5] + block[5]];
462 pixels[6] = cm[pixels[6] + block[6]];
463 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of the clamped add */
469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
473 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
475 /* read the pixels */
477 pixels[0] = cm[pixels[0] + block[0]];
478 pixels[1] = cm[pixels[1] + block[1]];
479 pixels[2] = cm[pixels[2] + block[2]];
480 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of the clamped add */
486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
490 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
492 /* read the pixels */
494 pixels[0] = cm[pixels[0] + block[0]];
495 pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the DCT coefficients (loop header and return
 * elided in this view). */
501 static int sum_abs_dctelem_c(DCTELEM *block)
505 sum+= FFABS(block[i]);
/* Fill h rows of a block with a constant byte, 16 bytes per row;
 * the per-row pointer advance is elided in this view. */
509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
513 for (i = 0; i < h; i++) {
514 memset(block, value, 16);
/* 8-bytes-per-row variant of the constant fill */
519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
523 for (i = 0; i < h; i++) {
524 memset(block, value, 8);
/* 2x nearest-neighbour upscale of an 8x8 block: each source byte v is
 * written as the 16-bit value v*0x0101 (v in both bytes) into two
 * adjacent destination rows at once. Pointer stepping elided. */
529 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
532 uint16_t *dst1 = (uint16_t *) dst;
533 uint16_t *dst2 = (uint16_t *)(dst + linesize);
535 for (j = 0; j < 8; j++) {
536 for (i = 0; i < 8; i++) {
537 dst1[i] = dst2[i] = src[i] * 0x0101;
/*
 * Rounded integer averages used by the motion-compensation helpers below.
 * avg2: (a+b+1)/2, avg4: (a+b+c+d+2)/4 — both round half up.
 * The arguments are fully parenthesized so that expansions whose
 * arguments contain lower-precedence operators (e.g. avg2(x|y, z))
 * still group correctly; with the unparenthesized form such calls
 * silently mis-associate.
 */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* One-vector GMC ("gmc1") bilinear interpolation at 1/16-pel accuracy.
 * A..D are the four corner weights derived from the fractional position
 * (x16, y16); they sum to 16*16 = 256, hence the >>8 after adding the
 * caller-supplied rounder. The h-row loop and pointer stepping are
 * elided in this view. */
548 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
550 const int A=(16-x16)*(16-y16);
551 const int B=( x16)*(16-y16);
552 const int C=(16-x16)*( y16);
553 const int D=( x16)*( y16);
558 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
559 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
560 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
561 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
562 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
563 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
564 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
565 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: per-pixel affine source coordinates (the
 * dxx/dxy/dyx/dyy stepping code is elided in this view), with bilinear
 * interpolation at 1<<shift sub-pel accuracy. The (unsigned)x < limit
 * tests fold the x < 0 and x >= limit checks into one comparison; when a
 * coordinate falls outside the picture it is clamped with av_clip and
 * the interpolation degenerates to 1-D or a plain sample fetch. */
571 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
572 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
575 const int s= 1<<shift;
585 for(x=0; x<8; x++){ //XXX FIXME optimize
586 int src_x, src_y, frac_x, frac_y, index;
/* fully inside the picture: 2-D bilinear interpolation */
595 if((unsigned)src_x < width){
596 if((unsigned)src_y < height){
597 index= src_x + src_y*stride;
598 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
599 + src[index +1]* frac_x )*(s-frac_y)
600 + ( src[index+stride ]*(s-frac_x)
601 + src[index+stride+1]* frac_x )* frac_y
/* y out of range: clamp y, interpolate horizontally only */
604 index= src_x + av_clip(src_y, 0, height)*stride;
605 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
606 + src[index +1]* frac_x )*s
/* x out of range: clamp x, interpolate vertically only */
610 if((unsigned)src_y < height){
611 index= av_clip(src_x, 0, width) + src_y*stride;
612 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
613 + src[index+stride ]* frac_y )*s
/* both out of range: clamp both and fetch the nearest sample */
616 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
617 dst[y*stride + x]= src[index ];
/* put_tpel_pixels_mcXY_c: third-pel ("tpel") interpolation, X/Y giving
 * the horizontal/vertical offset in thirds of a pixel. mc00 is a plain
 * copy dispatched on block width; the others compute fixed-point
 * weighted averages — 683 ~= 2^11/3 and 2731 ~= 2^15/12 are the rounding
 * reciprocals for weight sums of 3 and 12 respectively. Per-row
 * dst/src advances are elided in this view. */
629 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
631 case 2: put_pixels2_8_c (dst, src, stride, height); break;
632 case 4: put_pixels4_8_c (dst, src, stride, height); break;
633 case 8: put_pixels8_8_c (dst, src, stride, height); break;
634 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* (1/3, 0): horizontal third-pel */
638 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
640 for (i=0; i < height; i++) {
641 for (j=0; j < width; j++) {
642 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* (2/3, 0): horizontal two-thirds-pel */
649 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
651 for (i=0; i < height; i++) {
652 for (j=0; j < width; j++) {
653 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* (0, 1/3): vertical third-pel */
660 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
662 for (i=0; i < height; i++) {
663 for (j=0; j < width; j++) {
664 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* (1/3, 1/3): diagonal, weights 4/3/3/2 (sum 12) */
671 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
673 for (i=0; i < height; i++) {
674 for (j=0; j < width; j++) {
675 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* (1/3, 2/3) */
682 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
684 for (i=0; i < height; i++) {
685 for (j=0; j < width; j++) {
686 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* (0, 2/3) */
693 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
695 for (i=0; i < height; i++) {
696 for (j=0; j < width; j++) {
697 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* (2/3, 1/3) */
704 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
706 for (i=0; i < height; i++) {
707 for (j=0; j < width; j++) {
708 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* (2/3, 2/3) */
715 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
717 for (i=0; i < height; i++) {
718 for (j=0; j < width; j++) {
719 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* avg_tpel_pixels_mcXY_c: same third-pel filters as the put_* family
 * above, but the filtered value is averaged with the existing dst pixel
 * using +1 round-half-up. Per-row pointer advances elided in this view. */
726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
728 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
729 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
730 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
731 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* (1/3, 0) */
735 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
737 for (i=0; i < height; i++) {
738 for (j=0; j < width; j++) {
739 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* (2/3, 0) */
746 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
748 for (i=0; i < height; i++) {
749 for (j=0; j < width; j++) {
750 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* (0, 1/3) */
757 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
759 for (i=0; i < height; i++) {
760 for (j=0; j < width; j++) {
761 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* (1/3, 1/3) */
768 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
770 for (i=0; i < height; i++) {
771 for (j=0; j < width; j++) {
772 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* (1/3, 2/3) */
779 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
781 for (i=0; i < height; i++) {
782 for (j=0; j < width; j++) {
783 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* (0, 2/3) */
790 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
792 for (i=0; i < height; i++) {
793 for (j=0; j < width; j++) {
794 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* (2/3, 1/3) */
801 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
803 for (i=0; i < height; i++) {
804 for (j=0; j < width; j++) {
805 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* (2/3, 2/3) */
812 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
814 for (i=0; i < height; i++) {
815 for (j=0; j < width; j++) {
816 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
823 #define QPEL_MC(r, OPNAME, RND, OP) \
824 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
825 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
829 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
830 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
831 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
832 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
833 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
834 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
835 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
836 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
842 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
844 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
848 const int src0= src[0*srcStride];\
849 const int src1= src[1*srcStride];\
850 const int src2= src[2*srcStride];\
851 const int src3= src[3*srcStride];\
852 const int src4= src[4*srcStride];\
853 const int src5= src[5*srcStride];\
854 const int src6= src[6*srcStride];\
855 const int src7= src[7*srcStride];\
856 const int src8= src[8*srcStride];\
857 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
858 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
859 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
860 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
861 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
862 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
863 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
864 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
870 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
871 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
876 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
877 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
878 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
879 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
880 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
881 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
882 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
883 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
884 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
885 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
886 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
887 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
888 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
889 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
890 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
891 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
897 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
898 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
903 const int src0= src[0*srcStride];\
904 const int src1= src[1*srcStride];\
905 const int src2= src[2*srcStride];\
906 const int src3= src[3*srcStride];\
907 const int src4= src[4*srcStride];\
908 const int src5= src[5*srcStride];\
909 const int src6= src[6*srcStride];\
910 const int src7= src[7*srcStride];\
911 const int src8= src[8*srcStride];\
912 const int src9= src[9*srcStride];\
913 const int src10= src[10*srcStride];\
914 const int src11= src[11*srcStride];\
915 const int src12= src[12*srcStride];\
916 const int src13= src[13*srcStride];\
917 const int src14= src[14*srcStride];\
918 const int src15= src[15*srcStride];\
919 const int src16= src[16*srcStride];\
920 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
921 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
922 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
923 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
924 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
925 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
926 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
927 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
928 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
929 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
930 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
931 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
932 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
933 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
934 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
935 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
941 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
943 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
944 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
947 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
948 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
951 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
953 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
954 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
957 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
960 copy_block9(full, src, 16, stride, 9);\
961 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
962 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
965 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
967 copy_block9(full, src, 16, stride, 9);\
968 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
971 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
976 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
978 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
989 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1041 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1045 uint8_t halfHV[64];\
1046 copy_block9(full, src, 16, stride, 9);\
1047 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1052 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t full[16*9];\
1055 uint8_t halfHV[64];\
1056 copy_block9(full, src, 16, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1062 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1064 uint8_t halfHV[64];\
1065 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1066 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1067 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1069 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1071 uint8_t halfHV[64];\
1072 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1076 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1080 uint8_t halfHV[64];\
1081 copy_block9(full, src, 16, stride, 9);\
1082 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1084 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1085 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1087 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1088 uint8_t full[16*9];\
1090 copy_block9(full, src, 16, stride, 9);\
1091 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1092 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1093 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1095 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1096 uint8_t full[16*9];\
1099 uint8_t halfHV[64];\
1100 copy_block9(full, src, 16, stride, 9);\
1101 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1102 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1103 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1104 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1106 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1107 uint8_t full[16*9];\
1109 copy_block9(full, src, 16, stride, 9);\
1110 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1111 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1112 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1114 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1116 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1117 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1120 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1122 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1127 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1130 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1132 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1133 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1136 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1145 uint8_t full[24*17];\
1146 copy_block17(full, src, 24, stride, 17);\
1147 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1150 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1151 uint8_t full[24*17];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1155 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1157 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfV[256];\
1161 uint8_t halfHV[256];\
1162 copy_block17(full, src, 24, stride, 17);\
1163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1168 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1169 uint8_t full[24*17];\
1170 uint8_t halfH[272];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1178 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1220 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfV[256];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1231 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1232 uint8_t full[24*17];\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1241 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1242 uint8_t halfH[272];\
1243 uint8_t halfHV[256];\
1244 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1246 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1248 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1249 uint8_t halfH[272];\
1250 uint8_t halfHV[256];\
1251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1255 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 uint8_t halfV[256];\
1259 uint8_t halfHV[256];\
1260 copy_block17(full, src, 24, stride, 17);\
1261 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1263 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1264 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1266 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1267 uint8_t full[24*17];\
1268 uint8_t halfH[272];\
1269 copy_block17(full, src, 24, stride, 17);\
1270 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1271 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1272 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1274 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1275 uint8_t full[24*17];\
1276 uint8_t halfH[272];\
1277 uint8_t halfV[256];\
1278 uint8_t halfHV[256];\
1279 copy_block17(full, src, 24, stride, 17);\
1280 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1282 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1283 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1285 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1286 uint8_t full[24*17];\
1287 uint8_t halfH[272];\
1288 copy_block17(full, src, 24, stride, 17);\
1289 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1290 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1291 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1293 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1294 uint8_t halfH[272];\
1295 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1296 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1299 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1300 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1301 #define op_put(a, b) a = cm[((b) + 16)>>5]
1302 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1304 QPEL_MC(0, put_ , _ , op_put)
1305 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1306 QPEL_MC(0, avg_ , _ , op_avg)
1307 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1309 #undef op_avg_no_rnd
1311 #undef op_put_no_rnd
1313 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1314 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1315 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1316 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1317 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1318 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 horizontal half-pel lowpass: 4-tap (-1, 9, 9, -1)/16 filter with
 * rounding (+8 >> 4); cm[] clips each output to 0..255.  Reads one pixel
 * to the left (src[-1]) and two to the right (src[9]) of the 8 outputs. */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
    dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
    dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
    dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
    dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
    dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
    dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
    dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 (3,3) sub-pel position: the codec specifies a plain (x+1/2, y+1/2)
 * bilinear average here, so these delegate to the xy2 half-pel helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
#endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel lowpass: same (-1, 9, 9, -1)/16 kernel as the
 * horizontal filter, applied down a column; reads one row above
 * (src[-srcStride]) and two rows below (src[9*srcStride]) the 8 outputs. */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* Load the whole 10-sample column once so each tap is read only once. */
    const int src_1= src[ -srcStride];
    const int src0 = src[0 ];
    const int src1 = src[ srcStride];
    const int src2 = src[2*srcStride];
    const int src3 = src[3*srcStride];
    const int src4 = src[4*srcStride];
    const int src5 = src[5*srcStride];
    const int src6 = src[6*srcStride];
    const int src7 = src[7*srcStride];
    const int src8 = src[8*srcStride];
    const int src9 = src[9*srcStride];
    dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
    dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
    dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
    dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
    dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
    dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
    dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
    dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 sub-pel MC positions, named mcXY for horizontal phase X and
 * vertical phase Y in half-pel units (0 = integer, 2 = half-pel,
 * 1/3 = average of integer and half-pel). */

/* (1/4, 0): average of the source and the horizontal half-pel filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);

/* (1/2, 0): pure horizontal half-pel filter, written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);

/* (3/4, 0): like mc10 but averaged with the pixel one to the right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);

/* (0, 1/2): pure vertical half-pel filter. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);

/* (1/4, 1/2): average of vertical-filtered and H-then-V filtered planes.
 * The H filter runs over 11 rows (one above, two below) so the vertical
 * pass has the context it needs; halfH+8 skips the extra top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);

/* (3/4, 1/2): as mc12 but the vertical-only pass starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);

/* (1/2, 1/2): horizontal then vertical half-pel filtering only. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking across a horizontal block edge (filters the
 * two rows above and below the edge for each column x).  The if() lets
 * the compiler drop the body when no H.263 codec is compiled in. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    int p0= src[x-2*stride];
    int p1= src[x-1*stride];
    int p2= src[x+0*stride];
    int p3= src[x+1*stride];
    /* Edge gradient; d1 below is d clipped by the strength ramp. */
    int d = (p0 - p3 + 4*(p2 - p1)) / 8;
    if (d<-2*strength) d1= 0;
    else if(d<- strength) d1=-2*strength - d;
    else if(d< strength) d1= d;
    else if(d< 2*strength) d1= 2*strength - d;
    /* Branchless clamp of p1/p2 to 0..255 after applying d1:
     * if the value over/underflowed 8 bits, ~(p>>31) is 255 or 0. */
    if(p1&256) p1= ~(p1>>31);
    if(p2&256) p2= ~(p2>>31);
    src[x-1*stride] = p1;
    src[x+0*stride] = p2;
    /* Weaker correction for the outer pixels p0/p3. */
    d2= av_clip((p0-p3)/4, -ad1, ad1);
    src[x-2*stride] = p0 - d2;
    src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking across a vertical block edge; identical math
 * to h263_v_loop_filter_c but iterating over rows y and filtering the
 * two columns on each side of the edge. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    int p0= src[y*stride-2];
    int p1= src[y*stride-1];
    int p2= src[y*stride+0];
    int p3= src[y*stride+1];
    /* Edge gradient; d1 is d limited by the qscale-derived strength ramp. */
    int d = (p0 - p3 + 4*(p2 - p1)) / 8;
    if (d<-2*strength) d1= 0;
    else if(d<- strength) d1=-2*strength - d;
    else if(d< strength) d1= d;
    else if(d< 2*strength) d1= 2*strength - d;
    /* Branchless 0..255 clamp (see h263_v_loop_filter_c). */
    if(p1&256) p1= ~(p1>>31);
    if(p2&256) p2= ~(p2>>31);
    src[y*stride-1] = p1;
    src[y*stride+0] = p2;
    /* Weaker correction for the outer pixels. */
    d2= av_clip((p0-p3)/4, -ad1, ad1);
    src[y*stride-2] = p0 - d2;
    src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * A vertical (1,2,1) pass is accumulated into temp[] (first and last rows
 * only scaled by 4), then a horizontal (1,2,1) pass with rounding writes
 * the result back; edge columns get only the vertical pass (>>2). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    /* Top and bottom rows: no vertical neighbours, just scale by 4. */
    temp[x ] = 4*src[x ];
    temp[x + 7*8] = 4*src[x + 7*stride];
    xy = y * stride + x;
    /* Vertical (1,2,1) pass into the 8x8 temp buffer. */
    temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
    /* Edge columns: only the vertical pass applies, round and rescale. */
    src[ y*stride] = (temp[ y*8] + 2)>>2;
    src[7+y*stride] = (temp[7+y*8] + 2)>>2;
    xy = y * stride + x;
    /* Horizontal (1,2,1) pass with rounding (+8 >> 4 rescales the 4x). */
    src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sum of absolute differences per row, unrolled.
 * The void* first argument matches the me_cmp_func signature and is
 * unused here. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    s += abs(pix1[0] - pix2[0]);
    s += abs(pix1[1] - pix2[1]);
    s += abs(pix1[2] - pix2[2]);
    s += abs(pix1[3] - pix2[3]);
    s += abs(pix1[4] - pix2[4]);
    s += abs(pix1[5] - pix2[5]);
    s += abs(pix1[6] - pix2[6]);
    s += abs(pix1[7] - pix2[7]);
    s += abs(pix1[8] - pix2[8]);
    s += abs(pix1[9] - pix2[9]);
    s += abs(pix1[10] - pix2[10]);
    s += abs(pix1[11] - pix2[11]);
    s += abs(pix1[12] - pix2[12]);
    s += abs(pix1[13] - pix2[13]);
    s += abs(pix1[14] - pix2[14]);
    s += abs(pix1[15] - pix2[15]);

/* SAD against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
    s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
    s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
    s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
    s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
    s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
    s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
    s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
    s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
    s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
    s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
    s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
    s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
    s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
    s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
    s += abs(pix1[15] - avg2(pix2[15], pix2[16]));

/* SAD against the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one a line below, via pix3). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint8_t *pix3 = pix2 + line_size;
    s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
    s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
    s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
    s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
    s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
    s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
    s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
    s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
    s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
    s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
    s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
    s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
    s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
    s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
    s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
    s += abs(pix1[15] - avg2(pix2[15], pix3[15]));

/* SAD against the diagonal half-pel interpolation of pix2
 * (avg4 over the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint8_t *pix3 = pix2 + line_size;
    s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
    s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
    s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
    s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
    s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
    s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
    s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
    s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
    s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
    s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
    s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
    s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
    s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
    s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
    s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
    s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variants of the SAD functions above; same structure, half the
 * width.  The void* first argument is the unused me_cmp_func context. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    s += abs(pix1[0] - pix2[0]);
    s += abs(pix1[1] - pix2[1]);
    s += abs(pix1[2] - pix2[2]);
    s += abs(pix1[3] - pix2[3]);
    s += abs(pix1[4] - pix2[4]);
    s += abs(pix1[5] - pix2[5]);
    s += abs(pix1[6] - pix2[6]);
    s += abs(pix1[7] - pix2[7]);

/* SAD vs horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
    s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
    s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
    s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
    s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
    s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
    s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
    s += abs(pix1[7] - avg2(pix2[7], pix2[8]));

/* SAD vs vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint8_t *pix3 = pix2 + line_size;
    s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
    s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
    s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
    s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
    s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
    s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
    s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
    s += abs(pix1[7] - avg2(pix2[7], pix3[7]));

/* SAD vs diagonal half-pel interpolation of pix2 (2x2 avg4). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint8_t *pix3 = pix2 + line_size;
    s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
    s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
    s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
    s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
    s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
    s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
    s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
    s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: SSE (score1) plus a weighted difference
 * of the two images' local 2x2 gradients (score2), so matching noise
 * texture is rewarded.  Weight comes from avctx->nsse_weight when a
 * context is supplied, else 8. */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    for(x=0; x<16; x++){
        score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
    /* 15 horizontal pairs: compare the 2x2 gradient magnitude of s1 vs s2. */
    for(x=0; x<15; x++){
        score2+= FFABS( s1[x ] - s1[x +stride]
                       - s1[x+1] + s1[x+1+stride])
                -FFABS( s2[x ] - s2[x +stride]
                       - s2[x+1] + s2[x+1+stride]);
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else return score1 + FFABS(score2)*8;

/* 8-wide variant of nsse16_c; identical formula over an 8-pixel row. */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
        score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        score2+= FFABS( s1[x ] - s1[x +stride]
                       - s1[x+1] + s1[x+1+stride])
                -FFABS( s2[x ] - s2[x +stride]
                       - s2[x+1] + s2[x+1+stride]);
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else return score1 + FFABS(score2)*8;
/* Trellis helper: returns the weighted squared error that would remain
 * if `basis` scaled by `scale` were added to the residual `rem`
 * (scale is in BASIS_SHIFT fixed point, result rescaled by >>4). */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        assert(-512<b && b<512);
        sum += (w*b)*(w*b)>>4;

/* Commit the change evaluated by try_8x8basis_c: add the scaled basis
 * function into the residual, with round-to-nearest. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
 * Permutes an 8x8 block in place.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 * (inverse) permuted to scantable order!
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
    /* First pass: copy the nonzero coefficients (in scan order) aside,
     * so the second pass can scatter them without overwriting sources. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
    /* Second pass: write each saved coefficient to its permuted slot. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
/* Dummy compare function that scores everything 0 (used to disable a
 * comparison stage). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){

/* Fill the 6-entry cmp[] function array (16x16, 8x8, ... variants) with
 * the comparison family selected by `type`, taken from the DSPContext. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    memset(cmp, 0, sizeof(void*)*6);
    /* Each case copies one family of function pointers from the context. */
    cmp[i]= c->hadamard8_diff[i];
    cmp[i]= c->dct_sad[i];
    cmp[i]= c->dct264_sad[i];
    cmp[i]= c->dct_max[i];
    cmp[i]= c->quant_psnr[i];
    av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* dst[i] += src[i] for w bytes, done one machine word at a time:
 * (a&7f)+(b&7f) adds the low 7 bits of each byte, and xor with
 * (a^b)&80 restores the top bit without carry into the next byte. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    /* Scalar tail for the remaining (w % sizeof(long)) bytes. */
    dst[i+0] += src[i+0];

/* dst[i] = src1[i] + src2[i] using the same SWAR byte-wise addition. */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    /* Scalar tail. */
    dst[i] = src1[i]+src2[i];
/* dst[i] = src1[i] - src2[i] for w bytes.  Uses SWAR word-wise
 * subtraction; on targets without fast unaligned loads an unaligned
 * src2 falls back to an unrolled byte loop instead. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
    /* SWAR path: per-byte subtraction without inter-byte borrow; setting
     * the top bit of each byte in `a` before subtracting guarantees no
     * borrow, and the final xor repairs the top bit of each byte. */
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    /* Scalar tail. */
    dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction decode: reconstruct each byte as the median
 * of left, above, and left+above-aboveleft, plus the coded difference.
 * *left/*left_top carry the running predictors across calls. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];

/* Encoder-side counterpart: emit dst[i] = src2[i] - median predictor. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);

/* Left (previous-pixel) prediction decode; acc is the running left value
 * and the updated accumulator is returned for the next call. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    for(i=0; i<w-1; i++){

/* Left prediction decode for packed BGR32; per-channel running values
 * are kept in *red/*green/*blue/*alpha. */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2037 #define BUTTERFLY2(o1,o2,i1,i2) \
2041 #define BUTTERFLY1(x,y) \
2050 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst residual, summing absolute
 * transform coefficients.  Rows are transformed first (BUTTERFLY stages
 * over temp[]), then columns, with the last column stage folded into the
 * absolute-value accumulation via BUTTERFLYA. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    //FIXME try pointer walks
    /* Stage 1 (per row): butterflies over adjacent residual pairs. */
    BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
    BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
    BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
    BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
    /* Stage 2 (per row): distance-2 butterflies. */
    BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
    BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
    BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
    BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
    /* Stage 3 (per row): distance-4 butterflies complete the row pass. */
    BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
    BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
    BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
    BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    /* Column pass, stage 1. */
    BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
    BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
    BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
    BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
    /* Column pass, stage 2. */
    BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
    BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
    BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
    BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    /* Final column stage fused with |.| accumulation. */
    BUTTERFLYA(temp[8*0+i], temp[8*4+i])
    +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
    +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
    +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: like hadamard8_diff8x8_c but transforms the source block
 * itself (no reference), and subtracts the DC term at the end so the
 * score reflects AC energy only. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    //FIXME try pointer walks
    /* Row pass, stage 1: butterflies over adjacent source pairs. */
    BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
    BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
    BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
    BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
    /* Row pass, stage 2. */
    BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
    BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
    BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
    BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
    /* Row pass, stage 3. */
    BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
    BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
    BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
    BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    /* Column pass, stage 1. */
    BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
    BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
    BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
    BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
    /* Column pass, stage 2. */
    BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
    BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
    BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
    BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    /* Final column stage fused with |.| accumulation. */
    BUTTERFLYA(temp[8*0+i], temp[8*4+i])
    +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
    +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
    +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    /* Remove the DC coefficient's contribution from the sum. */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the src1-src2 residual and sum the
 * absolute values of the coefficients (via dsp.sum_abs_dctelem). */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    s->dsp.diff_pixels(temp, src1, src2, stride);
    return s->dsp.sum_abs_dctelem(temp);
2158 const int s07 = SRC(0) + SRC(7);\
2159 const int s16 = SRC(1) + SRC(6);\
2160 const int s25 = SRC(2) + SRC(5);\
2161 const int s34 = SRC(3) + SRC(4);\
2162 const int a0 = s07 + s34;\
2163 const int a1 = s16 + s25;\
2164 const int a2 = s07 - s34;\
2165 const int a3 = s16 - s25;\
2166 const int d07 = SRC(0) - SRC(7);\
2167 const int d16 = SRC(1) - SRC(6);\
2168 const int d25 = SRC(2) - SRC(5);\
2169 const int d34 = SRC(3) - SRC(4);\
2170 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2171 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2172 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2173 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2175 DST(1, a4 + (a7>>2)) ;\
2176 DST(2, a2 + (a3>>1)) ;\
2177 DST(3, a5 + (a6>>2)) ;\
2179 DST(5, a6 - (a5>>2)) ;\
2180 DST(6, (a2>>1) - a3 ) ;\
2181 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style transform SAD: apply the integer DCT8_1D to the residual
 * rows in place, then run it over columns with DST redefined to
 * accumulate absolute coefficient values directly into sum. */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* Row pass: read and write dct[i][*]. */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
/* Column pass: read dct[*][i], accumulate |coeff| instead of storing. */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
/* DCT max metric: forward-DCT the residual and return the largest
 * absolute coefficient (rather than the sum). */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    s->dsp.diff_pixels(temp, src1, src2, stride);
    sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the residual, keep a copy, run it
 * through quantize + dequantize + IDCT, and return the squared error
 * between the round-tripped block and the saved original. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    s->dsp.diff_pixels(temp, src1, src2, stride);
    /* Save the pre-quantization coefficients for comparison. */
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME
    sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantize the residual's DCT,
 * count the VLC bits the coefficients would cost, reconstruct via
 * dequantize + IDCT, and return distortion + lambda-weighted rate
 * (the 109/2^7 factor approximates the qscale-dependent lambda). */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * last_length;
    /* Work on local copies so the reconstruction can be added in place. */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);
    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* Intra blocks use the intra AC tables and pay for the luma DC VLC;
     * inter blocks use the inter AC tables. */
    length = s->intra_ac_vlc_length;
    last_length= s->intra_ac_vlc_last_length;
    bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    length = s->inter_ac_vlc_length;
    last_length= s->inter_ac_vlc_last_length;
    /* Count bits for all coefficients before the last one... */
    for(i=start_i; i<last; i++){
        int j= scantable[i];
    /* Levels within -64..63 are coded with the uni VLC table; others escape. */
    if((level&(~127)) == 0){
        bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* ...then the last coefficient, which uses the "last" table. */
    level= temp[i] + 64;
    if((level&(~127)) == 0){
        bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
    /* Reconstruct and measure distortion against the original source. */
    s->dct_unquantize_intra(s, temp, 0, s->qscale);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    s->dsp.idct_add(lsrc2, 8, temp);
    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: same VLC bit-counting as rd8x8_c but without the
 * reconstruction/distortion step; returns the estimated bit cost of
 * coding the quantized residual. */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * last_length;
    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* Select intra/inter AC tables; intra also pays the luma DC VLC. */
    length = s->intra_ac_vlc_length;
    last_length= s->intra_ac_vlc_last_length;
    bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    length = s->inter_ac_vlc_length;
    last_length= s->inter_ac_vlc_last_length;
    /* Bits for all coefficients before the last... */
    for(i=start_i; i<last; i++){
        int j= scantable[i];
    if((level&(~127)) == 0){
        bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* ...and the final coefficient via the "last" table. */
    level= temp[i] + 64;
    if((level&(~127)) == 0){
        bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): generates vsad_intra<size>_c, the vertical SAD of a
 * single plane — sum of |s[x] - s[x+stride]| over rows 1..h-1, 4 pixels
 * per inner step. Comments cannot be placed inside the \-continued macro
 * body; note the listing is sampled (score init/return lines missing). */
2383 #define VSAD_INTRA(size) \
2384 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2388 for(y=1; y<h; y++){ \
2389 for(x=0; x<size; x+=4){ \
2390 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2391 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD of the difference of two 16-pixel-wide blocks:
 * sums |(s1-s2)[x] - (s1-s2)[x+stride]| (sampled listing; loop over y,
 * score init and return are in lines not shown). */
2401 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2406 for(x=0; x<16; x++){
2407 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square helper; argument is parenthesized, but beware it is
 * evaluated twice (do not pass expressions with side effects). */
2416 #define SQ(a) ((a)*(a))
/* VSSE_INTRA(size): generates vsse_intra<size>_c, the squared-difference
 * analogue of VSAD_INTRA (vertical SSE within a single plane). Macro body
 * is \-continued, so comments go here; listing is sampled. */
2417 #define VSSE_INTRA(size) \
2418 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2422 for(y=1; y<h; y++){ \
2423 for(x=0; x<size; x+=4){ \
2424 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2425 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the difference of two 16-wide blocks: squared version
 * of vsad16_c (sampled listing; y loop / return not shown). */
2435 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2440 for(x=0; x<16; x++){
2441 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16 vector
 * of 'size' elements (sampled listing; accumulator declaration and return
 * are in lines not shown). */
2450 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2454 for(i=0; i<size; i++)
2455 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build the 16x16 comparison functions from their 8x8 counterparts via the
 * WRAPPER8_16_SQ macro (defined earlier in the file): each 16x16 call is
 * evaluated as four 8x8 sub-block calls. */
2459 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2460 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2461 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2463 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2465 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2466 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2467 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2468 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise float multiply: dst[i] = src0[i] * src1[i] for i in [0,len). */
2470 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2472 for(i=0; i<len; i++)
2473 dst[i] = src0[i] * src1[i];
/* Element-wise multiply with src1 read backwards: dst[i] = src0[i]*src1[-i].
 * src1 is indexed 0, -1, -2, ... — presumably the caller passes a pointer to
 * the last element of the second vector (convention not visible here). */
2476 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2479 for(i=0; i<len; i++)
2480 dst[i] = src0[i] * src1[-i];
/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
2483 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2485 for(i=0; i<len; i++)
2486 dst[i] = src0[i] * src1[i] + src2[i];
/* Overlap-add windowing: processes src0/src1 from both ends toward the
 * middle, writing dst[-len..len-1] as a rotation by the window pair
 * (s0,s1) x (wi,wj). NOTE(review): sampled listing — the loads of
 * s0/s1/wi/wj and any pointer-recentering lines are not shown here. */
2489 static void vector_fmul_window_c(float *dst, const float *src0,
2490 const float *src1, const float *win, int len)
2496 for(i=-len, j=len-1; i<0; i++, j--) {
2501 dst[i] = s0*wj - s1*wi;
2502 dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
2506 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2510 for (i = 0; i < len; i++)
2511 dst[i] = src[i] * mul;
/* Multiply src by a scalar and by per-pair sub-vectors: for each pair of
 * outputs, sv advances to the next 2-element vector (sv[0][0], sv[0][1]).
 * len is stepped by 2, so it is expected to be even. */
2514 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
2515 const float **sv, float mul, int len)
2518 for (i = 0; i < len; i += 2, sv++) {
2519 dst[i ] = src[i ] * sv[0][0] * mul;
2520 dst[i+1] = src[i+1] * sv[0][1] * mul;
/* Same as vector_fmul_sv_scalar_2_c but with 4-element sub-vectors per
 * step; len is stepped by 4, so it is expected to be a multiple of 4. */
2524 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
2525 const float **sv, float mul, int len)
2528 for (i = 0; i < len; i += 4, sv++) {
2529 dst[i ] = src[i ] * sv[0][0] * mul;
2530 dst[i+1] = src[i+1] * sv[0][1] * mul;
2531 dst[i+2] = src[i+2] * sv[0][2] * mul;
2532 dst[i+3] = src[i+3] * sv[0][3] * mul;
/* Like vector_fmul_sv_scalar_2_c but without a src vector: dst is just the
 * per-pair sub-vectors scaled by mul (len stepped by 2). */
2536 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
2540 for (i = 0; i < len; i += 2, sv++) {
2541 dst[i ] = sv[0][0] * mul;
2542 dst[i+1] = sv[0][1] * mul;
/* 4-element variant of sv_fmul_scalar_2_c (len stepped by 4). */
2546 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
2550 for (i = 0; i < len; i += 4, sv++) {
2551 dst[i ] = sv[0][0] * mul;
2552 dst[i+1] = sv[0][1] * mul;
2553 dst[i+2] = sv[0][2] * mul;
2554 dst[i+3] = sv[0][3] * mul;
/* In-place butterfly over two non-aliasing vectors: computes the
 * difference t = v1[i] - v2[i] per element. NOTE(review): sampled
 * listing — the lines storing the sum/difference back into v1/v2
 * are not visible here. */
2558 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2562 for (i = 0; i < len; i++) {
2563 float t = v1[i] - v2[i];
/* Dot product of two float vectors of length len. NOTE(review): sampled
 * listing — accumulator declaration, the multiply-accumulate statement
 * and the return are in lines not shown. */
2569 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2574 for (i = 0; i < len; i++)
/* Clip one float, operating on its raw bit pattern as uint32_t (helper for
 * vector_clipf_c_opposite_sign). The unsigned comparisons exploit IEEE-754
 * bit ordering; correctness presumably relies on min < 0 < max as arranged
 * by the caller — confirm against vector_clipf_c. NOTE(review): the final
 * "else return a" path is in a line not shown in this sampled listing. */
2580 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2581 uint32_t maxi, uint32_t maxisign)
2584 if(a > mini) return mini;
2585 else if((a^(1U<<31)) > maxisign) return maxi;
/* Clip a float vector to [*min, *max] for the case where min and max have
 * opposite signs, reinterpreting floats as uint32_t bit patterns (type
 * punning through pointer casts; pre-dates stricter aliasing cleanups).
 * The loop is unrolled by 8, so len is expected to be a multiple of 8. */
2589 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2591 uint32_t mini = *(uint32_t*)min;
2592 uint32_t maxi = *(uint32_t*)max;
/* max with the sign bit flipped, used for the signed-side comparison */
2593 uint32_t maxisign = maxi ^ (1U<<31);
2594 uint32_t *dsti = (uint32_t*)dst;
2595 const uint32_t *srci = (const uint32_t*)src;
2596 for(i=0; i<len; i+=8) {
2597 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2598 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2599 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2600 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2601 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2602 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2603 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2604 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip a float vector to [min, max]. When the range straddles zero the
 * bit-pattern fast path is used; otherwise a plain av_clipf loop unrolled
 * by 8 (len is expected to be a multiple of 8 — no tail handling).
 * NOTE(review): the "else" joining the two paths is in a line not shown. */
2607 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2609 if(min < 0 && max > 0) {
2610 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2612 for(i=0; i < len; i+=8) {
2613 dst[i ] = av_clipf(src[i ], min, max);
2614 dst[i + 1] = av_clipf(src[i + 1], min, max);
2615 dst[i + 2] = av_clipf(src[i + 2], min, max);
2616 dst[i + 3] = av_clipf(src[i + 3], min, max);
2617 dst[i + 4] = av_clipf(src[i + 4], min, max);
2618 dst[i + 5] = av_clipf(src[i + 5], min, max);
2619 dst[i + 6] = av_clipf(src[i + 6], min, max);
2620 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* int16 dot product with each partial product right-shifted by 'shift'
 * before accumulation. NOTE(review): sampled listing — the loop header,
 * accumulator declaration and return are in lines not shown. */
2625 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2630 res += (*v1++ * *v2++) >> shift;
/* Combined op: accumulates v1.v2 while updating v1 += mul*v3 in the same
 * pass. NOTE(review): sampled listing — only the madd statement is
 * visible; the dot-product accumulation and return are in missing lines. */
2635 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2640 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to an int16 signal: each window value w
 * multiplies the i-th sample from the front and the i-th sample from the
 * back, with Q15 rounding ((x*w + 2^14) >> 15). Only len/2 window values
 * are read, so 'window' holds the first half of a symmetric window. */
2645 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2646 const int16_t *window, unsigned int len)
2649 int len2 = len >> 1;
2651 for (i = 0; i < len2; i++) {
2652 int16_t w = window[i];
2653 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2654 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip an int32 vector to [min, max], unrolled by 8 elements per pass
 * (len is expected to be a multiple of 8; the enclosing loop header is
 * in a line not shown in this sampled listing). */
2658 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2659 int32_t max, unsigned int len)
2662 *dst++ = av_clip(*src++, min, max);
2663 *dst++ = av_clip(*src++, min, max);
2664 *dst++ = av_clip(*src++, min, max);
2665 *dst++ = av_clip(*src++, min, max);
2666 *dst++ = av_clip(*src++, min, max);
2667 *dst++ = av_clip(*src++, min, max);
2668 *dst++ = av_clip(*src++, min, max);
2669 *dst++ = av_clip(*src++, min, max);
/* Fixed-point cosine constants for the WMV2 IDCT below, scaled by
 * 2048*sqrt(2). W0 is used by the IDCT but its #define is in a line
 * not shown in this sampled listing. */
2675 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2676 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2677 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2678 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2679 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2680 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2681 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point inverse DCT over b[0..7], in place.
 * Standard even/odd butterfly decomposition with the W* constants,
 * then output with rounding (+128) and >>8 normalization.
 * NOTE(review): sampled listing — the s1/s2 declarations and one of
 * the rotation lines are in missing lines. */
2683 static void wmv2_idct_row(short * b)
2686 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd-part rotations (inputs 1,3,5,7) */
2688 a1 = W1*b[1]+W7*b[7];
2689 a7 = W7*b[1]-W1*b[7];
2690 a5 = W5*b[5]+W3*b[3];
2691 a3 = W3*b[5]-W5*b[3];
/* even-part rotations (inputs 2,6) and DC butterfly (0,4) */
2692 a2 = W2*b[2]+W6*b[6];
2693 a6 = W6*b[2]-W2*b[6];
2694 a0 = W0*b[0]+W0*b[4];
2695 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= 1/sqrt(2): combine the odd terms */
2697 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2698 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies with rounding */
2700 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2701 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2702 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2703 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2704 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2705 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2706 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2707 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 inverse DCT, in place. Same butterfly
 * structure as the row pass, but inputs are pre-scaled (>>3 with +4
 * rounding) for extended intermediate precision, and the final outputs
 * normalize by >>14 with rounding. NOTE(review): sampled listing —
 * s1/s2 declarations are in missing lines. */
2709 static void wmv2_idct_col(short * b)
2712 int a0,a1,a2,a3,a4,a5,a6,a7;
2713 /*step 1, with extended precision*/
2714 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2715 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2716 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2717 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2718 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2719 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2720 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2721 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
/* 181/256 ~= 1/sqrt(2) */
2723 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2724 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies with >>14 normalization */
2726 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2727 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2728 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2729 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2731 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2732 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2733 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2734 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: row pass over each of the 8 rows, then column pass
 * over each of the 8 columns. NOTE(review): sampled listing — the two
 * loop headers driving 'i' are in missing lines. */
2736 void ff_wmv2_idct_c(short * block){
2740 wmv2_idct_row(block+i);
2743 wmv2_idct_col(block+i);
2746 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT + store: run the WMV2 IDCT, then write the clamped result into
 * dest (overwriting previous pixel contents). */
2748 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2750 ff_wmv2_idct_c(block);
2751 ff_put_pixels_clamped_c(block, dest, line_size);
/* IDCT + add: run the WMV2 IDCT, then add the clamped residual onto the
 * existing pixels in dest. */
2753 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2755 ff_wmv2_idct_c(block);
2756 ff_add_pixels_clamped_c(block, dest, line_size);
/* Thin put/add wrappers pairing the reference JPEG IDCT (j_rev_dct and its
 * 4/2/1-point lowres variants) with clamped pixel output. NOTE(review):
 * sampled listing — the j_rev_dct*(block) calls themselves are in lines
 * not shown for most of these wrappers. */
2758 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2761 ff_put_pixels_clamped_c(block, dest, line_size);
2763 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2766 ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 lowres variants */
2769 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2772 put_pixels_clamped4_c(block, dest, line_size);
2774 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2777 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 lowres variants */
2780 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2783 put_pixels_clamped2_c(block, dest, line_size);
2785 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2788 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": only the DC term survives; (DC+4)>>3 clamped via cropTbl */
2791 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2793 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2795 dest[0] = cm[(block[0] + 4)>>3];
2797 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2799 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2801 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2804 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2806 /* init static data */
/* Fill the process-wide lookup tables: ff_cropTbl (clamp-to-[0,255] via
 * table indexing with MAX_NEG_CROP headroom on both sides), ff_squareTbl
 * ((i-256)^2 for signed differences), and inv_zigzag_direct16.
 * NOTE(review): sampled listing — the low-side cropTbl fill and loop
 * braces are in missing lines. */
2807 av_cold void dsputil_static_init(void)
/* identity section of the crop table */
2811 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2812 for(i=0;i<MAX_NEG_CROP;i++) {
/* values above 255 saturate to 255 */
2814 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2817 for(i=0;i<512;i++) {
2818 ff_squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag, stored 1-based */
2821 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack alignment
 * (required by SSE/AltiVec paths): takes the address of a LOCAL_ALIGNED_16
 * variable and warns once if it is misaligned. NOTE(review): sampled
 * listing — the did_fail guard/update and the return are in missing
 * lines. */
2824 int ff_check_alignment(void){
2825 static int did_fail=0;
2826 LOCAL_ALIGNED_16(int, aligned);
2828 if((intptr_t)&aligned & 15){
2830 #if HAVE_MMX || HAVE_ALTIVEC
2831 av_log(NULL, AV_LOG_ERROR,
2832 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2833 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2834 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2835 "Do not report crashes to Libav developers.\n");
2844 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2848 ff_check_alignment();
2851 if(avctx->dct_algo==FF_DCT_FASTINT) {
2852 c->fdct = fdct_ifast;
2853 c->fdct248 = fdct_ifast248;
2855 else if(avctx->dct_algo==FF_DCT_FAAN) {
2856 c->fdct = ff_faandct;
2857 c->fdct248 = ff_faandct248;
2860 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2861 c->fdct248 = ff_fdct248_islow;
2863 #endif //CONFIG_ENCODERS
2865 if(avctx->lowres==1){
2866 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2867 c->idct_put= ff_jref_idct4_put;
2868 c->idct_add= ff_jref_idct4_add;
2870 if (avctx->codec_id != CODEC_ID_H264) {
2871 c->idct_put= ff_h264_lowres_idct_put_8_c;
2872 c->idct_add= ff_h264_lowres_idct_add_8_c;
2874 switch (avctx->bits_per_raw_sample) {
2876 c->idct_put= ff_h264_lowres_idct_put_9_c;
2877 c->idct_add= ff_h264_lowres_idct_add_9_c;
2880 c->idct_put= ff_h264_lowres_idct_put_10_c;
2881 c->idct_add= ff_h264_lowres_idct_add_10_c;
2884 c->idct_put= ff_h264_lowres_idct_put_8_c;
2885 c->idct_add= ff_h264_lowres_idct_add_8_c;
2889 c->idct = j_rev_dct4;
2890 c->idct_permutation_type= FF_NO_IDCT_PERM;
2891 }else if(avctx->lowres==2){
2892 c->idct_put= ff_jref_idct2_put;
2893 c->idct_add= ff_jref_idct2_add;
2894 c->idct = j_rev_dct2;
2895 c->idct_permutation_type= FF_NO_IDCT_PERM;
2896 }else if(avctx->lowres==3){
2897 c->idct_put= ff_jref_idct1_put;
2898 c->idct_add= ff_jref_idct1_add;
2899 c->idct = j_rev_dct1;
2900 c->idct_permutation_type= FF_NO_IDCT_PERM;
2902 if(avctx->idct_algo==FF_IDCT_INT){
2903 c->idct_put= ff_jref_idct_put;
2904 c->idct_add= ff_jref_idct_add;
2905 c->idct = j_rev_dct;
2906 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2907 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2908 avctx->idct_algo==FF_IDCT_VP3){
2909 c->idct_put= ff_vp3_idct_put_c;
2910 c->idct_add= ff_vp3_idct_add_c;
2911 c->idct = ff_vp3_idct_c;
2912 c->idct_permutation_type= FF_NO_IDCT_PERM;
2913 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2914 c->idct_put= ff_wmv2_idct_put_c;
2915 c->idct_add= ff_wmv2_idct_add_c;
2916 c->idct = ff_wmv2_idct_c;
2917 c->idct_permutation_type= FF_NO_IDCT_PERM;
2918 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2919 c->idct_put= ff_faanidct_put;
2920 c->idct_add= ff_faanidct_add;
2921 c->idct = ff_faanidct;
2922 c->idct_permutation_type= FF_NO_IDCT_PERM;
2923 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2924 c->idct_put= ff_ea_idct_put_c;
2925 c->idct_permutation_type= FF_NO_IDCT_PERM;
2926 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2927 c->idct = ff_bink_idct_c;
2928 c->idct_add = ff_bink_idct_add_c;
2929 c->idct_put = ff_bink_idct_put_c;
2930 c->idct_permutation_type = FF_NO_IDCT_PERM;
2931 }else{ //accurate/default
2932 c->idct_put= ff_simple_idct_put;
2933 c->idct_add= ff_simple_idct_add;
2934 c->idct = ff_simple_idct;
2935 c->idct_permutation_type= FF_NO_IDCT_PERM;
2939 c->get_pixels = get_pixels_c;
2940 c->diff_pixels = diff_pixels_c;
2941 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2942 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2943 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2944 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2945 c->sum_abs_dctelem = sum_abs_dctelem_c;
2948 c->pix_sum = pix_sum_c;
2949 c->pix_norm1 = pix_norm1_c;
2951 c->fill_block_tab[0] = fill_block16_c;
2952 c->fill_block_tab[1] = fill_block8_c;
2953 c->scale_block = scale_block_c;
2955 /* TODO [0] 16 [1] 8 */
2956 c->pix_abs[0][0] = pix_abs16_c;
2957 c->pix_abs[0][1] = pix_abs16_x2_c;
2958 c->pix_abs[0][2] = pix_abs16_y2_c;
2959 c->pix_abs[0][3] = pix_abs16_xy2_c;
2960 c->pix_abs[1][0] = pix_abs8_c;
2961 c->pix_abs[1][1] = pix_abs8_x2_c;
2962 c->pix_abs[1][2] = pix_abs8_y2_c;
2963 c->pix_abs[1][3] = pix_abs8_xy2_c;
2965 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2966 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2967 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2968 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2969 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2970 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2971 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2972 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2973 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2975 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2976 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2977 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2978 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2979 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2980 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2981 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2982 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2983 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2985 #define dspfunc(PFX, IDX, NUM) \
2986 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2987 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2988 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2989 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2990 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2991 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2992 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2993 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2994 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2995 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2996 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2997 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2998 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2999 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3000 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3001 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3003 dspfunc(put_qpel, 0, 16);
3004 dspfunc(put_no_rnd_qpel, 0, 16);
3006 dspfunc(avg_qpel, 0, 16);
3007 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3009 dspfunc(put_qpel, 1, 8);
3010 dspfunc(put_no_rnd_qpel, 1, 8);
3012 dspfunc(avg_qpel, 1, 8);
3013 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3017 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3018 ff_mlp_init(c, avctx);
3020 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3021 ff_intrax8dsp_init(c,avctx);
3023 #if CONFIG_RV30_DECODER
3024 ff_rv30dsp_init(c,avctx);
3026 #if CONFIG_RV40_DECODER
3027 ff_rv40dsp_init(c,avctx);
3028 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3029 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3030 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3031 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3034 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3035 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3036 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3037 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3038 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3039 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3040 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3041 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3043 #define SET_CMP_FUNC(name) \
3044 c->name[0]= name ## 16_c;\
3045 c->name[1]= name ## 8x8_c;
3047 SET_CMP_FUNC(hadamard8_diff)
3048 c->hadamard8_diff[4]= hadamard8_intra16_c;
3049 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3050 SET_CMP_FUNC(dct_sad)
3051 SET_CMP_FUNC(dct_max)
3053 SET_CMP_FUNC(dct264_sad)
3055 c->sad[0]= pix_abs16_c;
3056 c->sad[1]= pix_abs8_c;
3060 SET_CMP_FUNC(quant_psnr)
3063 c->vsad[0]= vsad16_c;
3064 c->vsad[4]= vsad_intra16_c;
3065 c->vsad[5]= vsad_intra8_c;
3066 c->vsse[0]= vsse16_c;
3067 c->vsse[4]= vsse_intra16_c;
3068 c->vsse[5]= vsse_intra8_c;
3069 c->nsse[0]= nsse16_c;
3070 c->nsse[1]= nsse8_c;
3072 ff_dsputil_init_dwt(c);
3075 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3077 c->add_bytes= add_bytes_c;
3078 c->add_bytes_l2= add_bytes_l2_c;
3079 c->diff_bytes= diff_bytes_c;
3080 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3081 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3082 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3083 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3084 c->bswap_buf= bswap_buf;
3085 c->bswap16_buf = bswap16_buf;
3086 #if CONFIG_PNG_DECODER
3087 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3090 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3091 c->h263_h_loop_filter= h263_h_loop_filter_c;
3092 c->h263_v_loop_filter= h263_v_loop_filter_c;
3095 if (CONFIG_VP3_DECODER) {
3096 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3097 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3098 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3101 c->h261_loop_filter= h261_loop_filter_c;
3103 c->try_8x8basis= try_8x8basis_c;
3104 c->add_8x8basis= add_8x8basis_c;
3106 #if CONFIG_VORBIS_DECODER
3107 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3109 #if CONFIG_AC3_DECODER
3110 c->ac3_downmix = ff_ac3_downmix_c;
3112 c->vector_fmul = vector_fmul_c;
3113 c->vector_fmul_reverse = vector_fmul_reverse_c;
3114 c->vector_fmul_add = vector_fmul_add_c;
3115 c->vector_fmul_window = vector_fmul_window_c;
3116 c->vector_clipf = vector_clipf_c;
3117 c->scalarproduct_int16 = scalarproduct_int16_c;
3118 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3119 c->apply_window_int16 = apply_window_int16_c;
3120 c->vector_clip_int32 = vector_clip_int32_c;
3121 c->scalarproduct_float = scalarproduct_float_c;
3122 c->butterflies_float = butterflies_float_c;
3123 c->vector_fmul_scalar = vector_fmul_scalar_c;
3125 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3126 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3128 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3129 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3131 c->shrink[0]= av_image_copy_plane;
3132 c->shrink[1]= ff_shrink22;
3133 c->shrink[2]= ff_shrink44;
3134 c->shrink[3]= ff_shrink88;
3136 c->prefetch= just_return;
3138 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3139 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3143 #define FUNC(f, depth) f ## _ ## depth
3144 #define FUNCC(f, depth) f ## _ ## depth ## _c
3146 #define dspfunc1(PFX, IDX, NUM, depth)\
3147 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3148 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3149 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3150 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3152 #define dspfunc2(PFX, IDX, NUM, depth)\
3153 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3154 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3155 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3156 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3157 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3158 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3159 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3160 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3161 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3162 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3163 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3164 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3165 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3166 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3167 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3168 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3171 #define BIT_DEPTH_FUNCS(depth)\
3172 c->draw_edges = FUNCC(draw_edges , depth);\
3173 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3174 c->clear_block = FUNCC(clear_block , depth);\
3175 c->clear_blocks = FUNCC(clear_blocks , depth);\
3176 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3177 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3178 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3179 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3181 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3182 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3183 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3184 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3185 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3186 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3188 dspfunc1(put , 0, 16, depth);\
3189 dspfunc1(put , 1, 8, depth);\
3190 dspfunc1(put , 2, 4, depth);\
3191 dspfunc1(put , 3, 2, depth);\
3192 dspfunc1(put_no_rnd, 0, 16, depth);\
3193 dspfunc1(put_no_rnd, 1, 8, depth);\
3194 dspfunc1(avg , 0, 16, depth);\
3195 dspfunc1(avg , 1, 8, depth);\
3196 dspfunc1(avg , 2, 4, depth);\
3197 dspfunc1(avg , 3, 2, depth);\
3198 dspfunc1(avg_no_rnd, 0, 16, depth);\
3199 dspfunc1(avg_no_rnd, 1, 8, depth);\
3201 dspfunc2(put_h264_qpel, 0, 16, depth);\
3202 dspfunc2(put_h264_qpel, 1, 8, depth);\
3203 dspfunc2(put_h264_qpel, 2, 4, depth);\
3204 dspfunc2(put_h264_qpel, 3, 2, depth);\
3205 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3206 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3207 dspfunc2(avg_h264_qpel, 2, 4, depth);
3209 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3212 switch (avctx->bits_per_raw_sample) {
3220 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3227 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3228 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3229 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3230 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3231 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3232 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3233 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3234 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3235 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3237 for(i=0; i<64; i++){
3238 if(!c->put_2tap_qpel_pixels_tab[0][i])
3239 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3240 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3241 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3244 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3245 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3246 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3247 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3249 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3250 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3251 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3252 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3254 switch(c->idct_permutation_type){
3255 case FF_NO_IDCT_PERM:
3257 c->idct_permutation[i]= i;
3259 case FF_LIBMPEG2_IDCT_PERM:
3261 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3263 case FF_SIMPLE_IDCT_PERM:
3265 c->idct_permutation[i]= simple_mmx_permutation[i];
3267 case FF_TRANSPOSE_IDCT_PERM:
3269 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3271 case FF_PARTTRANS_IDCT_PERM:
3273 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3275 case FF_SSE2_IDCT_PERM:
3277 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3280 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");