3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
/* Byte-broadcast constants sized to the native long: every byte of the word
 * is 0x7f / 0x80 (0x7f7f7f7f on 32-bit longs, 0x7f7f7f7f7f7f7f7f on 64-bit).
 * ~0UL/255 yields a word whose every byte is 0x01. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Classic zigzag scan: maps scan position -> raster (row-major) coefficient
 * index for an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): 16-byte-aligned lookup table; filled in at init time
 * elsewhere (this extract only shows the declaration). */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (scan position -> raster index). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (scan position -> raster index). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Row permutation for the SSE2 IDCT: even coefficients first, then odd. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Initialize a ScanTable: store the raw scan order and build the
 * CPU-permutated scan plus raster_end[].
 * NOTE(review): truncated extract — the loop headers, the inverse-table
 * setup and the closing braces are missing here; kept byte-identical. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128     st->scantable= src_scantable;
/* permutated[i] remaps each scan entry through the IDCT permutation */
132         j = src_scantable[i];
133         st->permutated[i] = permutation[j];
142             j = st->permutated[i];
144         st->raster_end[i]= end;
/* Sum of all pixels of a 16x16 block.
 * NOTE(review): truncated extract — accumulator declarations, the
 * per-pixel adds and the return statement are missing; kept byte-identical. */
148 static int pix_sum_c(uint8_t * pix, int line_size)
153     for (i = 0; i < 16; i++) {
154         for (j = 0; j < 16; j += 8) {
/* advance to the next row: stride minus the 16 pixels just consumed */
165         pix += line_size - 16;
/* Sum of squares of all pixels of a 16x16 block, via the ff_squareTbl
 * lookup (sq is the +256 biased pointer into it).
 * NOTE(review): truncated extract — several lines (s declaration, the
 * x&0xff terms, #else/#endif, return) are missing; kept byte-identical. */
170 static int pix_norm1_c(uint8_t * pix, int line_size)
173     uint32_t *sq = ff_squareTbl + 256;
176     for (i = 0; i < 16; i++) {
177         for (j = 0; j < 16; j += 8) {
/* 64-bit longs: read 8 pixels at once and square each byte lane */
178 #if LONG_MAX > 2147483647
179             register uint64_t x=*(uint64_t*)pix;
181             s += sq[(x>>8)&0xff];
182             s += sq[(x>>16)&0xff];
183             s += sq[(x>>24)&0xff];
184             s += sq[(x>>32)&0xff];
185             s += sq[(x>>40)&0xff];
186             s += sq[(x>>48)&0xff];
187             s += sq[(x>>56)&0xff];
/* 32-bit longs: two 4-byte reads per 8 pixels */
189             register uint32_t x=*(uint32_t*)pix;
191             s += sq[(x>>8)&0xff];
192             s += sq[(x>>16)&0xff];
193             s += sq[(x>>24)&0xff];
194             x=*(uint32_t*)(pix+4);
196             s += sq[(x>>8)&0xff];
197             s += sq[(x>>16)&0xff];
198             s += sq[(x>>24)&0xff];
202         pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; main loop unrolled by 8.
 * NOTE(review): truncated extract — the tail-loop header handling the
 * remaining w%8 words and closing braces are missing; kept byte-identical. */
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210     for(i=0; i+8<=w; i+=8){
211         dst[i+0]= av_bswap32(src[i+0]);
212         dst[i+1]= av_bswap32(src[i+1]);
213         dst[i+2]= av_bswap32(src[i+2]);
214         dst[i+3]= av_bswap32(src[i+3]);
215         dst[i+4]= av_bswap32(src[i+4]);
216         dst[i+5]= av_bswap32(src[i+5]);
217         dst[i+6]= av_bswap32(src[i+6]);
218         dst[i+7]= av_bswap32(src[i+7]);
/* tail: one word at a time */
221         dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src into dst.
 * NOTE(review): truncated extract — the surrounding loop header and
 * braces are missing; kept byte-identical. */
225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
228         *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-wide block of height h, using the
 * biased ff_squareTbl pointer (handles negative differences).
 * NOTE(review): truncated extract — accumulator init, the pix1/pix2
 * stride advances and the return are missing; kept byte-identical. */
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234     uint32_t *sq = ff_squareTbl + 256;
237     for (i = 0; i < h; i++) {
238         s += sq[pix1[0] - pix2[0]];
239         s += sq[pix1[1] - pix2[1]];
240         s += sq[pix1[2] - pix2[2]];
241         s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-wide block of height h.
 * NOTE(review): truncated extract — accumulator init, stride advances
 * and return are missing; kept byte-identical. */
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251     uint32_t *sq = ff_squareTbl + 256;
254     for (i = 0; i < h; i++) {
255         s += sq[pix1[0] - pix2[0]];
256         s += sq[pix1[1] - pix2[1]];
257         s += sq[pix1[2] - pix2[2]];
258         s += sq[pix1[3] - pix2[3]];
259         s += sq[pix1[4] - pix2[4]];
260         s += sq[pix1[5] - pix2[5]];
261         s += sq[pix1[6] - pix2[6]];
262         s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-wide block of height h.
 * NOTE(review): truncated extract — accumulator init, stride advances
 * and return are missing; kept byte-identical. */
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272     uint32_t *sq = ff_squareTbl + 256;
275     for (i = 0; i < h; i++) {
276         s += sq[pix1[ 0] - pix2[ 0]];
277         s += sq[pix1[ 1] - pix2[ 1]];
278         s += sq[pix1[ 2] - pix2[ 2]];
279         s += sq[pix1[ 3] - pix2[ 3]];
280         s += sq[pix1[ 4] - pix2[ 4]];
281         s += sq[pix1[ 5] - pix2[ 5]];
282         s += sq[pix1[ 6] - pix2[ 6]];
283         s += sq[pix1[ 7] - pix2[ 7]];
284         s += sq[pix1[ 8] - pix2[ 8]];
285         s += sq[pix1[ 9] - pix2[ 9]];
286         s += sq[pix1[10] - pix2[10]];
287         s += sq[pix1[11] - pix2[11]];
288         s += sq[pix1[12] - pix2[12]];
289         s += sq[pix1[13] - pix2[13]];
290         s += sq[pix1[14] - pix2[14]];
291         s += sq[pix1[15] - pix2[15]];
/* Copy an 8x8 block of pixels into a DCTELEM block (widening to DCT
 * coefficients).
 * NOTE(review): truncated extract — the row loop and pointer advances
 * are missing; kept byte-identical. */
299 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
303     /* read the pixels */
305         block[0] = pixels[0];
306         block[1] = pixels[1];
307         block[2] = pixels[2];
308         block[3] = pixels[3];
309         block[4] = pixels[4];
310         block[5] = pixels[5];
311         block[6] = pixels[6];
312         block[7] = pixels[7];
/* Per-pixel difference of two 8x8 blocks (s1 - s2) into a DCTELEM block.
 * NOTE(review): truncated extract — the row loop and pointer advances
 * are missing; kept byte-identical. */
318 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
319                           const uint8_t *s2, int stride){
322     /* read the pixels */
324         block[0] = s1[0] - s2[0];
325         block[1] = s1[1] - s2[1];
326         block[2] = s1[2] - s2[2];
327         block[3] = s1[3] - s2[3];
328         block[4] = s1[4] - s2[4];
329         block[5] = s1[5] - s2[5];
330         block[6] = s1[6] - s2[6];
331         block[7] = s1[7] - s2[7];
/* Store an 8x8 DCTELEM block as pixels, clamped to 0..255 through the
 * biased ff_cropTbl pointer.
 * NOTE(review): truncated extract — the row loop and pointer advances
 * are missing; kept byte-identical. */
339 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
343     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
345     /* read the pixels */
347         pixels[0] = cm[block[0]];
348         pixels[1] = cm[block[1]];
349         pixels[2] = cm[block[2]];
350         pixels[3] = cm[block[3]];
351         pixels[4] = cm[block[4]];
352         pixels[5] = cm[block[5]];
353         pixels[6] = cm[block[6]];
354         pixels[7] = cm[block[7]];
/* 4-wide and 2-wide variants of the clamped block store.
 * NOTE(review): truncated extracts — row loops and pointer advances are
 * missing from both functions; kept byte-identical. */
361 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
365     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
367     /* read the pixels */
369         pixels[0] = cm[block[0]];
370         pixels[1] = cm[block[1]];
371         pixels[2] = cm[block[2]];
372         pixels[3] = cm[block[3]];
379 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
383     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
385     /* read the pixels */
387         pixels[0] = cm[block[0]];
388         pixels[1] = cm[block[1]];
/* Store an 8x8 signed DCTELEM block: values are offset by +128 and
 * clamped (branches for <-128 and >127).
 * NOTE(review): truncated extract — the clamp-low branch body, pixel
 * increments and closing braces are missing; kept byte-identical. */
395 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
396                                     uint8_t *restrict pixels,
401     for (i = 0; i < 8; i++) {
402         for (j = 0; j < 8; j++) {
405             else if (*block > 127)
/* common case: bias the signed coefficient into 0..255 */
408                 *pixels = (uint8_t)(*block + 128);
412         pixels += (line_size - 8);
/* Store an 8x8 DCTELEM block as pixels without clamping (values assumed
 * in range by the caller).
 * NOTE(review): truncated extract — row loop and pointer advances are
 * missing; kept byte-identical. */
416 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
421     /* read the pixels */
423         pixels[0] = block[0];
424         pixels[1] = block[1];
425         pixels[2] = block[2];
426         pixels[3] = block[3];
427         pixels[4] = block[4];
428         pixels[5] = block[5];
429         pixels[6] = block[6];
430         pixels[7] = block[7];
/* Add an 8x8 DCTELEM block onto existing pixels with clamping to 0..255
 * (IDCT add path).
 * NOTE(review): truncated extract — row loop and pointer advances are
 * missing; kept byte-identical. */
437 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
441     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
443     /* read the pixels */
445         pixels[0] = cm[pixels[0] + block[0]];
446         pixels[1] = cm[pixels[1] + block[1]];
447         pixels[2] = cm[pixels[2] + block[2]];
448         pixels[3] = cm[pixels[3] + block[3]];
449         pixels[4] = cm[pixels[4] + block[4]];
450         pixels[5] = cm[pixels[5] + block[5]];
451         pixels[6] = cm[pixels[6] + block[6]];
452         pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide and 2-wide variants of the clamped block add.
 * NOTE(review): truncated extracts — row loops and pointer advances are
 * missing from both functions; kept byte-identical. */
458 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
462     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
464     /* read the pixels */
466         pixels[0] = cm[pixels[0] + block[0]];
467         pixels[1] = cm[pixels[1] + block[1]];
468         pixels[2] = cm[pixels[2] + block[2]];
469         pixels[3] = cm[pixels[3] + block[3]];
475 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
481     /* read the pixels */
483         pixels[0] = cm[pixels[0] + block[0]];
484         pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the coefficients of a DCTELEM block.
 * NOTE(review): truncated extract — loop header, accumulator init and
 * return are missing; kept byte-identical. */
490 static int sum_abs_dctelem_c(DCTELEM *block)
494         sum+= FFABS(block[i]);
/* Fill a 16-wide / 8-wide block of height h with a constant byte value,
 * one memset per row.
 * NOTE(review): truncated extracts — the per-row "block += line_size"
 * advance and closing braces are missing; kept byte-identical. */
498 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
502     for (i = 0; i < h; i++) {
503         memset(block, value, 16);
508 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
512     for (i = 0; i < h; i++) {
513         memset(block, value, 8);
/* 2x upscale an 8x8 block: each source pixel is duplicated horizontally
 * (the 0x0101 multiply broadcasts it into both bytes of a uint16_t) and
 * vertically (dst1/dst2 write two output rows).
 * NOTE(review): truncated extract — the per-row src/dst advances and
 * closing braces are missing; kept byte-identical. */
518 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
521     uint16_t *dst1 = (uint16_t *) dst;
522     uint16_t *dst2 = (uint16_t *)(dst + linesize);
524     for (j = 0; j < 8; j++) {
525         for (i = 0; i < 8; i++) {
526             dst1[i] = dst2[i] = src[i] * 0x0101;
/* Rounded 2- and 4-way averages used by the MC helpers below.
 * Arguments are fully parenthesized so expressions such as
 * avg2(a | b, c) or avg2(x, y ? u : v) expand with the intended
 * precedence (the unparenthesized form is a classic macro hazard). */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* 8-wide bilinear interpolation with 1/16-pel weights: A..D are the four
 * corner weights derived from the fractional offsets x16/y16 (they sum
 * to 256, hence the >>8 after adding the rounder).
 * NOTE(review): truncated extract — the height loop and the dst/src
 * stride advances are missing; kept byte-identical. */
537 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
539     const int A=(16-x16)*(16-y16);
540     const int B=(    x16)*(16-y16);
541     const int C=(16-x16)*(    y16);
542     const int D=(    x16)*(    y16);
547         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
548         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
549         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
550         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
551         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
552         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
553         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
554         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: per-pixel affine source coordinates with
 * fractional precision (shift), bilinear interpolation, and edge clamping
 * via av_clip when the sample falls outside width/height.
 * NOTE(review): truncated extract — the y loop, the src_x/src_y/frac
 * computations, rounding adds and closing braces are missing; kept
 * byte-identical. */
560 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
561               int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
564     const int s= 1<<shift;
574         for(x=0; x<8; x++){ //XXX FIXME optimize
575             int src_x, src_y, frac_x, frac_y, index;
/* fully inside the picture: plain bilinear interpolation */
584             if((unsigned)src_x < width){
585                 if((unsigned)src_y < height){
586                     index= src_x + src_y*stride;
587                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
588                                           + src[index       +1]*   frac_x )*(s-frac_y)
589                                        + (  src[index+stride  ]*(s-frac_x)
590                                           + src[index+stride+1]*   frac_x )*   frac_y
/* vertically outside: clamp y, interpolate horizontally only */
593                     index= src_x + av_clip(src_y, 0, height)*stride;
594                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
595                                           + src[index       +1]*   frac_x )*s
/* horizontally outside: clamp x, interpolate vertically only */
599                 if((unsigned)src_y < height){
600                     index= av_clip(src_x, 0, width) + src_y*stride;
601                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
602                                           + src[index+stride  ]*   frac_y )*s
/* both outside: clamp both and copy the nearest sample */
605                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
606                     dst[y*stride + x]=    src[index         ];
/* Thirdpel (1/3-pel) motion compensation "put" helpers. mcXY encodes the
 * horizontal (X) and vertical (Y) thirdpel phase; 683 ~= 2^11/3 and
 * 2731 ~= 2^15/12 are fixed-point reciprocals for the weighted averages.
 * NOTE(review): truncated extracts — switch headers, loop variable
 * declarations, the dst/src stride advances and closing braces are
 * missing from every function here; kept byte-identical. */
618 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
620     case 2: put_pixels2_8_c (dst, src, stride, height); break;
621     case 4: put_pixels4_8_c (dst, src, stride, height); break;
622     case 8: put_pixels8_8_c (dst, src, stride, height); break;
623     case 16:put_pixels16_8_c(dst, src, stride, height); break;
627 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
629     for (i=0; i < height; i++) {
630       for (j=0; j < width; j++) {
631         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
638 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
640     for (i=0; i < height; i++) {
641       for (j=0; j < width; j++) {
642         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
649 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
651     for (i=0; i < height; i++) {
652       for (j=0; j < width; j++) {
653         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
660 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
662     for (i=0; i < height; i++) {
663       for (j=0; j < width; j++) {
664         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
671 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
673     for (i=0; i < height; i++) {
674       for (j=0; j < width; j++) {
675         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
682 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
684     for (i=0; i < height; i++) {
685       for (j=0; j < width; j++) {
686         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
693 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
695     for (i=0; i < height; i++) {
696       for (j=0; j < width; j++) {
697         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
704 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
706     for (i=0; i < height; i++) {
707       for (j=0; j < width; j++) {
708         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel motion compensation "avg" helpers: same interpolation as the
 * put_ variants above, then rounded-averaged with the existing dst pixel
 * ((dst + interp + 1) >> 1).
 * NOTE(review): truncated extracts — switch headers, loop variable
 * declarations, the dst/src stride advances and closing braces are
 * missing from every function here; kept byte-identical. */
715 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
717     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
718     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
719     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
720     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
724 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
726     for (i=0; i < height; i++) {
727       for (j=0; j < width; j++) {
728         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
735 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
737     for (i=0; i < height; i++) {
738       for (j=0; j < width; j++) {
739         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
746 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
748     for (i=0; i < height; i++) {
749       for (j=0; j < width; j++) {
750         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
757 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
759     for (i=0; i < height; i++) {
760       for (j=0; j < width; j++) {
761         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
768 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
770     for (i=0; i < height; i++) {
771       for (j=0; j < width; j++) {
772         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
779 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
781     for (i=0; i < height; i++) {
782       for (j=0; j < width; j++) {
783         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
790 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
792     for (i=0; i < height; i++) {
793       for (j=0; j < width; j++) {
794         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
801 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
803     for (i=0; i < height; i++) {
804       for (j=0; j < width; j++) {
805         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
812 #define QPEL_MC(r, OPNAME, RND, OP) \
813 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
814 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
818 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
819 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
820 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
821 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
822 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
823 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
824 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
825 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
831 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
833 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
837 const int src0= src[0*srcStride];\
838 const int src1= src[1*srcStride];\
839 const int src2= src[2*srcStride];\
840 const int src3= src[3*srcStride];\
841 const int src4= src[4*srcStride];\
842 const int src5= src[5*srcStride];\
843 const int src6= src[6*srcStride];\
844 const int src7= src[7*srcStride];\
845 const int src8= src[8*srcStride];\
846 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
847 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
848 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
849 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
850 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
851 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
852 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
853 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
859 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
860 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
865 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
866 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
867 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
868 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
869 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
870 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
871 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
872 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
873 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
874 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
875 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
876 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
877 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
878 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
879 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
880 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
886 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
887 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
892 const int src0= src[0*srcStride];\
893 const int src1= src[1*srcStride];\
894 const int src2= src[2*srcStride];\
895 const int src3= src[3*srcStride];\
896 const int src4= src[4*srcStride];\
897 const int src5= src[5*srcStride];\
898 const int src6= src[6*srcStride];\
899 const int src7= src[7*srcStride];\
900 const int src8= src[8*srcStride];\
901 const int src9= src[9*srcStride];\
902 const int src10= src[10*srcStride];\
903 const int src11= src[11*srcStride];\
904 const int src12= src[12*srcStride];\
905 const int src13= src[13*srcStride];\
906 const int src14= src[14*srcStride];\
907 const int src15= src[15*srcStride];\
908 const int src16= src[16*srcStride];\
909 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
910 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
911 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
912 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
913 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
914 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
915 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
916 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
917 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
918 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
919 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
920 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
921 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
922 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
923 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
924 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
930 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
932 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
933 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
936 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
937 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
940 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
942 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
943 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
946 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
949 copy_block9(full, src, 16, stride, 9);\
950 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
951 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
954 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
956 copy_block9(full, src, 16, stride, 9);\
957 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
960 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
963 copy_block9(full, src, 16, stride, 9);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
965 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
967 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
972 copy_block9(full, src, 16, stride, 9);\
973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
974 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
976 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
978 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
982 copy_block9(full, src, 16, stride, 9);\
983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
984 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
986 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
988 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
999 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1002 uint8_t halfHV[64];\
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1009 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1010 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1020 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1030 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1041 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t halfHV[64];\
1054 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1055 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1056 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1058 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1060 uint8_t halfHV[64];\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1063 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1065 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1066 uint8_t full[16*9];\
1069 uint8_t halfHV[64];\
1070 copy_block9(full, src, 16, stride, 9);\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1076 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1079 copy_block9(full, src, 16, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1082 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1084 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1085 uint8_t full[16*9];\
1088 uint8_t halfHV[64];\
1089 copy_block9(full, src, 16, stride, 9);\
1090 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1091 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1092 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1093 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
/* --- QPEL_MC macro body (continued): quarter-pel MC cases.          \
 * Naming: mcXY = X quarter-pels horizontally, Y vertically.          \
 * (3,2): h-lowpass the padded block, blend with the x+1 column,      \
 * then v-lowpass straight into dst. */\
1095 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1096 uint8_t full[16*9];\
1098 copy_block9(full, src, 16, stride, 9);\
1099 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1100 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1101 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* (2,2): centre position — h-lowpass then v-lowpass, no blending. */\
1103 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1105 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1106 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* (1,0): average of the source and the h-lowpassed half. */\
1109 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1111 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1112 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
/* (2,0): pure horizontal half-pel, filtered directly into dst. */\
1115 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1116 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
/* (3,0): like (1,0) but blends with the x+1 source column. */\
1119 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1121 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1122 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
/* (0,1): copy into a 24x17 padded buffer so the vertical filter can  \
 * read past the block edge, then blend the v-half with the source. */\
1125 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1126 uint8_t full[24*17];\
1128 copy_block17(full, src, 24, stride, 17);\
1129 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1130 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
/* (0,2): pure vertical half-pel. */\
1133 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1134 uint8_t full[24*17];\
1135 copy_block17(full, src, 24, stride, 17);\
1136 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
/* (0,3): like (0,1) but blends with the y+1 row (full+24). */\
1139 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1140 uint8_t full[24*17];\
1142 copy_block17(full, src, 24, stride, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1144 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
/* Diagonal quarter-pel cases.  The *_old_c variants are the exact    \
 * reference form (4-way average of source, h-half, v-half and        \
 * hv-half); the current variants approximate it with two 2-way       \
 * averages, which is cheaper.                                        \
 * (1,1) reference: average src, halfH, halfV and halfHV. */\
1146 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1147 uint8_t full[24*17];\
1148 uint8_t halfH[272];\
1149 uint8_t halfV[256];\
1150 uint8_t halfHV[256];\
1151 copy_block17(full, src, 24, stride, 17);\
1152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1153 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1155 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* (1,1) fast: fold the source into halfH first, then average the     \
 * result with the hv-filtered half. */\
1157 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfHV[256];\
1161 copy_block17(full, src, 24, stride, 17);\
1162 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* (3,1) reference: as (1,1) but anchored on the x+1 column. */\
1167 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1168 uint8_t full[24*17];\
1169 uint8_t halfH[272];\
1170 uint8_t halfV[256];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* (3,1) fast. */\
1178 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfHV[256];\
1182 copy_block17(full, src, 24, stride, 17);\
1183 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* (1,3) reference: anchored on the y+1 row (full+24, halfH+16). */\
1188 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1189 uint8_t full[24*17];\
1190 uint8_t halfH[272];\
1191 uint8_t halfV[256];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* (1,3) fast. */\
1199 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* (3,3) reference: anchored on x+1 and y+1 (full+25). */\
1209 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfV[256];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* (3,3) fast: blend halfH with the x+1 column, then average the y+1  \
 * row of it with the hv-half. */\
1220 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfHV[256];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* (2,1): average of the h-half and the hv-half. */\
1230 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1231 uint8_t halfH[272];\
1232 uint8_t halfHV[256];\
1233 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1235 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* (2,3): as (2,1), using the y+1 row of the h-half. */\
1237 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1238 uint8_t halfH[272];\
1239 uint8_t halfHV[256];\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1241 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1242 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* (1,2) reference: average of v-half and hv-half. */\
1244 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1245 uint8_t full[24*17];\
1246 uint8_t halfH[272];\
1247 uint8_t halfV[256];\
1248 uint8_t halfHV[256];\
1249 copy_block17(full, src, 24, stride, 17);\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
/* (1,2) fast: fold the source into halfH, then v-lowpass into dst. */\
1255 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1261 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* (3,2) reference. */\
1263 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1264 uint8_t full[24*17];\
1265 uint8_t halfH[272];\
1266 uint8_t halfV[256];\
1267 uint8_t halfHV[256];\
1268 copy_block17(full, src, 24, stride, 17);\
1269 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1270 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1271 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1272 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
/* (3,2) fast: like (1,2) but blended with the x+1 column. */\
1274 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1275 uint8_t full[24*17];\
1276 uint8_t halfH[272];\
1277 copy_block17(full, src, 24, stride, 17);\
1278 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1279 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1280 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* (2,2): centre — h-lowpass then v-lowpass, no blending. */\
1282 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1283 uint8_t halfH[272];\
1284 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1285 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel store operators plugged into QPEL_MC: "b" is a filter sum that is
 * rounded (+16 or +15 for no-rounding), shifted down by 5 and clipped via
 * the cm[] crop table; op_avg additionally averages with the old pixel. */
1288 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1289 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1290 #define op_put(a, b) a = cm[((b) + 16)>>5]
1291 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel function families: put, no-rounding put and avg. */
1293 QPEL_MC(0, put_ , _ , op_put)
1294 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1295 QPEL_MC(0, avg_ , _ , op_avg)
1296 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1298 #undef op_avg_no_rnd
1300 #undef op_put_no_rnd
/* The (0,0) qpel cases are plain block copies/averages — alias the
 * generic pixel helpers instead of generating dedicated functions. */
1302 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1303 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1304 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1305 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1306 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* Same copy helper as put_qpel16_mc00_c above: a (0,0) no-rounding qpel
 * is a plain 16x16 copy.  The previous alias named ff_put_pixels16x16_8_c,
 * inconsistent with the unsuffixed names used by every sibling alias. */
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/*
 * WMV2 "mspel" horizontal lowpass filter:
 *   out = clip((9*(s[x] + s[x+1]) - (s[x-1] + s[x+2]) + 8) >> 4)
 * One fully unrolled 8-tap row; the h-row loop and pointer advance from
 * the original listing are not visible in this chunk.
 */
1309 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1310 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-[0,255] lookup */
1314 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1315 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1316 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1317 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1318 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1319 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1320 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1321 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1327 #if CONFIG_RV40_DECODER
/* RV40 (3,3) sub-pel positions are defined as the plain half-pel
 * xy2 average, so these just forward to the generic xy2 helpers. */
1328 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1329 put_pixels16_xy2_8_c(dst, src, stride, 16);
1331 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1332 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1334 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1335 put_pixels8_xy2_8_c(dst, src, stride, 8);
1337 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1338 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1340 #endif /* CONFIG_RV40_DECODER */
/*
 * WMV2 "mspel" vertical lowpass filter — same 4-tap kernel as the
 * horizontal variant, applied down one column.  The ten source rows are
 * loaded once so each can feed up to four output taps; the w-column loop
 * and pointer advance from the original listing are not visible here.
 */
1342 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1343 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clip-to-[0,255] lookup */
1347 const int src_1= src[ -srcStride];
1348 const int src0 = src[0 ];
1349 const int src1 = src[ srcStride];
1350 const int src2 = src[2*srcStride];
1351 const int src3 = src[3*srcStride];
1352 const int src4 = src[4*srcStride];
1353 const int src5 = src[5*srcStride];
1354 const int src6 = src[6*srcStride];
1355 const int src7 = src[7*srcStride];
1356 const int src8 = src[8*srcStride];
1357 const int src9 = src[9*srcStride];
1358 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1359 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1360 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1361 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1362 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1363 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1364 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1365 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 sub-pel MC positions built from the mspel lowpass filters above.
 * mc10: average of source and h-filtered half. */
1371 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1373 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1374 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mc20: pure horizontal half-pel, filtered directly into dst. */
1377 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1378 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mc30: like mc10 but averaged with the x+1 source column. */
1381 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1383 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1384 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* mc02: pure vertical half-pel. */
1387 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1388 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12: average of v-half and hv-half.  The h-pass starts one row above
 * (src-stride, height 11) so the following v-pass has context rows. */
1391 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1395 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1396 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1397 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1398 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc32: as mc12, with the v-half taken from the x+1 column. */
1400 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1404 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1405 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1406 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1407 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc22: centre — h-lowpass then v-lowpass straight into dst. */
1409 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1411 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1412 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 Annex J deblocking filter across a horizontal block edge
 * (filters vertically; p0..p3 are the two pixels on either side).
 * The compile-time if() lets dead-code elimination drop the body when
 * neither the H.263 decoder nor encoder is built.
 */
1415 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1416 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1418 const int strength= ff_h263_loop_filter_strength[qscale];
1422 int p0= src[x-2*stride];
1423 int p1= src[x-1*stride];
1424 int p2= src[x+0*stride];
1425 int p3= src[x+1*stride];
1426 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear clipping of the correction d1 against strength */
1428 if (d<-2*strength) d1= 0;
1429 else if(d<- strength) d1=-2*strength - d;
1430 else if(d< strength) d1= d;
1431 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of p1/p2 back to [0,255]: if bit 8 got set, replace
 * with 0 (negative) or 255 (overflow) via ~(p>>31) */
1436 if(p1&256) p1= ~(p1>>31);
1437 if(p2&256) p2= ~(p2>>31);
1439 src[x-1*stride] = p1;
1440 src[x+0*stride] = p2;
/* secondary, weaker correction applied to the outer pixels */
1444 d2= av_clip((p0-p3)/4, -ad1, ad1);
1446 src[x-2*stride] = p0 - d2;
1447 src[x+ stride] = p3 + d2;
/*
 * H.263 Annex J deblocking filter across a vertical block edge
 * (filters horizontally) — mirror of h263_v_loop_filter_c with the
 * row/column roles swapped.
 */
1452 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1453 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1455 const int strength= ff_h263_loop_filter_strength[qscale];
1459 int p0= src[y*stride-2];
1460 int p1= src[y*stride-1];
1461 int p2= src[y*stride+0];
1462 int p3= src[y*stride+1];
1463 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear clipping of the correction d1 against strength */
1465 if (d<-2*strength) d1= 0;
1466 else if(d<- strength) d1=-2*strength - d;
1467 else if(d< strength) d1= d;
1468 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of p1/p2 back to [0,255] */
1473 if(p1&256) p1= ~(p1>>31);
1474 if(p2&256) p2= ~(p2>>31);
1476 src[y*stride-1] = p1;
1477 src[y*stride+0] = p2;
/* secondary, weaker correction applied to the outer pixels */
1481 d2= av_clip((p0-p3)/4, -ad1, ad1);
1483 src[y*stride-2] = p0 - d2;
1484 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-place 8x8 loop filter: separable [1 2 1]/4 smoothing in both
 * directions via a temp[] buffer in 4x precision; the first/last rows and
 * columns are passed through unfiltered (scaled by 4, restored with >>2).
 * Loop headers and the temp[] declaration are not visible in this chunk.
 */
1489 static void h261_loop_filter_c(uint8_t *src, int stride){
1494 temp[x ] = 4*src[x ]; /* border rows: store unfiltered at 4x scale */
1495 temp[x + 7*8] = 4*src[x + 7*stride];
1499 xy = y * stride + x;
1501 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride]; /* vertical [1 2 1] */
1506 src[ y*stride] = (temp[ y*8] + 2)>>2; /* border columns: round back down */
1507 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1509 xy = y * stride + x;
1511 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4; /* horizontal [1 2 1] + final round */
/*
 * SAD of a 16-wide block: sum of |pix1 - pix2| over one unrolled row.
 * The h-row loop, pointer advance and return are elided in this listing.
 */
1516 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1522 s += abs(pix1[0] - pix2[0]);
1523 s += abs(pix1[1] - pix2[1]);
1524 s += abs(pix1[2] - pix2[2]);
1525 s += abs(pix1[3] - pix2[3]);
1526 s += abs(pix1[4] - pix2[4]);
1527 s += abs(pix1[5] - pix2[5]);
1528 s += abs(pix1[6] - pix2[6]);
1529 s += abs(pix1[7] - pix2[7]);
1530 s += abs(pix1[8] - pix2[8]);
1531 s += abs(pix1[9] - pix2[9]);
1532 s += abs(pix1[10] - pix2[10]);
1533 s += abs(pix1[11] - pix2[11]);
1534 s += abs(pix1[12] - pix2[12]);
1535 s += abs(pix1[13] - pix2[13]);
1536 s += abs(pix1[14] - pix2[14]);
1537 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel interpolation of pix2. */
1544 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1550 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1551 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1552 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1553 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1554 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1555 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1556 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1557 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1558 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1559 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1560 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1561 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1562 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1563 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1564 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1565 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel interpolation (pix3 = next row). */
1572 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1575 uint8_t *pix3 = pix2 + line_size;
1579 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1580 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1581 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1582 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1583 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1584 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1585 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1586 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1587 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1588 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1589 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1590 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1591 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1592 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1593 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1594 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal (x+y) half-pel 4-tap average of pix2. */
1602 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1605 uint8_t *pix3 = pix2 + line_size;
1609 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1610 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1611 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1612 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1613 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1614 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1615 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1616 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1617 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1618 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1619 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1620 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1621 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1622 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1623 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1624 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants — same structure as the 16-wide family above,
 * with one unrolled row of 8; loops/returns elided in this listing. */
1632 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1638 s += abs(pix1[0] - pix2[0]);
1639 s += abs(pix1[1] - pix2[1]);
1640 s += abs(pix1[2] - pix2[2]);
1641 s += abs(pix1[3] - pix2[3]);
1642 s += abs(pix1[4] - pix2[4]);
1643 s += abs(pix1[5] - pix2[5]);
1644 s += abs(pix1[6] - pix2[6]);
1645 s += abs(pix1[7] - pix2[7]);
/* SAD against the horizontal half-pel interpolation. */
1652 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1658 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1659 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1660 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1661 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1662 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1663 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1664 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1665 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD against the vertical half-pel interpolation (pix3 = next row). */
1672 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1675 uint8_t *pix3 = pix2 + line_size;
1679 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1680 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1681 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1682 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1683 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1684 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1685 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1686 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD against the diagonal half-pel 4-tap average. */
1694 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1697 uint8_t *pix3 = pix2 + line_size;
1701 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1702 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1703 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1704 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1705 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1706 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1707 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1708 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-preserving SSE ("NSSE") metric, 16-wide: plain SSE (score1)
 * plus the difference of the 2x2 gradient energies of the two blocks
 * (score2), weighted by avctx->nsse_weight (default weight 8 when no
 * context is supplied).  Loop headers are elided in this listing.
 */
1716 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1717 MpegEncContext *c = v;
1723 for(x=0; x<16; x++){
1724 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1727 for(x=0; x<15; x++){
1728 score2+= FFABS( s1[x ] - s1[x +stride]
1729 - s1[x+1] + s1[x+1+stride])
1730 -FFABS( s2[x ] - s2[x +stride]
1731 - s2[x+1] + s2[x+1+stride]);
1738 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1739 else return score1 + FFABS(score2)*8;
/* 8-wide variant of the NSSE metric above. */
1742 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1743 MpegEncContext *c = v;
1750 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1754 score2+= FFABS( s1[x ] - s1[x +stride]
1755 - s1[x+1] + s1[x+1+stride])
1756 -FFABS( s2[x ] - s2[x +stride]
1757 - s2[x+1] + s2[x+1+stride]);
1764 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765 else return score1 + FFABS(score2)*8;
/*
 * Trellis-quantization helper: returns the weighted squared error that
 * would result from adding `basis*scale` (rescaled from BASIS_SHIFT to
 * RECON_SHIFT precision, with rounding) to the residual `rem`.
 */
1768 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1772 for(i=0; i<8*8; i++){
1773 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1776 assert(-512<b && b<512);
1778 sum += (w*b)*(w*b)>>4;
/* Commit the same scaled basis vector into the residual in place. */
1783 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1786 for(i=0; i<8*8; i++){
1787 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1792 * Permutes an 8x8 block in place.
1793 * @param block the block which will be permuted according to the given permutation vector
1794 * @param permutation the permutation vector
1795 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1796 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1797 * (inverse) permuted to scantable order!
1799 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1805 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* Stash the coefficients up to `last` (in scan order) in a temp copy ... */
1807 for(i=0; i<=last; i++){
1808 const int j= scantable[i];
/* ... then write each one back at its permuted position. */
1813 for(i=0; i<=last; i++){
1814 const int j= scantable[i];
1815 const int perm_j= permutation[j];
1816 block[perm_j]= temp[j];
/* Dummy comparator: compares nothing (used for FF_CMP_ZERO). */
1820 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills cmp[0..5] with the block-comparison functions matching the
 * FF_CMP_* `type`.  The selection switch is elided in this listing;
 * only a few of its cases are visible below.
 */
1824 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1827 memset(cmp, 0, sizeof(void*)*6);
1835 cmp[i]= c->hadamard8_diff[i];
1841 cmp[i]= c->dct_sad[i];
1844 cmp[i]= c->dct264_sad[i];
1847 cmp[i]= c->dct_max[i];
1850 cmp[i]= c->quant_psnr[i];
1879 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/*
 * dst[i] += src[i] for w bytes, done one machine word at a time using
 * SWAR: mask to 7 bits, add, then patch the carry bit back in via XOR
 * so bytes cannot carry into each other.  A scalar tail loop (elided
 * here) handles the remainder.
 * NOTE(review): `w - sizeof(long)` is evaluated in unsigned arithmetic,
 * so w < sizeof(long) would wrap — callers presumably guarantee larger
 * widths; verify before reusing elsewhere.
 */
1884 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1886 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1887 long a = *(long*)(src+i);
1888 long b = *(long*)(dst+i);
1889 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1892 dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i], same SWAR byte-wise addition as above. */
1895 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1897 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1898 long a = *(long*)(src1+i);
1899 long b = *(long*)(src2+i);
1900 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1903 dst[i] = src1[i]+src2[i];
/*
 * dst[i] = src1[i] - src2[i] for w bytes: SWAR word-wise subtraction
 * with the borrow bit patched via XOR, plus an aligned-access fallback
 * for targets without fast unaligned loads and a scalar tail loop.
 */
1906 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1908 #if !HAVE_FAST_UNALIGNED
/* misaligned src2: fall back to an unrolled byte loop */
1909 if((long)src2 & (sizeof(long)-1)){
1910 for(i=0; i+7<w; i+=8){
1911 dst[i+0] = src1[i+0]-src2[i+0];
1912 dst[i+1] = src1[i+1]-src2[i+1];
1913 dst[i+2] = src1[i+2]-src2[i+2];
1914 dst[i+3] = src1[i+3]-src2[i+3];
1915 dst[i+4] = src1[i+4]-src2[i+4];
1916 dst[i+5] = src1[i+5]-src2[i+5];
1917 dst[i+6] = src1[i+6]-src2[i+6];
1918 dst[i+7] = src1[i+7]-src2[i+7];
/* word-at-a-time path: per-byte subtraction without inter-byte borrow */
1922 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1923 long a = *(long*)(src1+i);
1924 long b = *(long*)(src2+i);
1925 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
/* scalar tail */
1928 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV prediction helpers (bodies are heavily elided in this
 * listing).  Median prediction uses mid_pred(left, top, left+top-topleft)
 * per byte; add_* reconstructs pixels from residuals, sub_* produces
 * residuals.  *left / *left_top carry the running state across calls.
 */
1931 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1939 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1948 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1956 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left (previous-pixel) prediction; returns the updated accumulator. */
1966 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1969 for(i=0; i<w-1; i++){
/* BGR32 variant tracking each channel separately. */
1996 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly primitives (BUTTERFLY2/BUTTERFLY1 bodies continue
 * on lines not visible in this listing); BUTTERFLYA folds the final
 * stage directly into an absolute-sum. */
2026 #define BUTTERFLY2(o1,o2,i1,i2) \
2030 #define BUTTERFLY1(x,y) \
2039 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of (src - dst), then sum of absolute
 * transform coefficients.  Rows are transformed first (three butterfly
 * stages), then columns, with the last column stage folded into the
 * absolute sum via BUTTERFLYA.
 */
2042 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2049 //FIXME try pointer walks
/* horizontal pass, stage 1 on the pixel differences */
2050 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2051 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2052 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2053 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* horizontal pass, stage 2 */
2055 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2056 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2057 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2058 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* horizontal pass, stage 3 */
2060 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2061 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2062 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2063 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass, stages 1 and 2 */
2067 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2068 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2069 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2070 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2072 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2073 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2074 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2075 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* vertical stage 3 folded into the absolute-sum */
2078 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2079 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2080 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2081 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/*
 * Intra SATD: identical to hadamard8_diff8x8_c but transforms the
 * source pixels directly (no reference block) and subtracts the DC
 * term |temp[0]+temp[32]| at the end so flat brightness is free.
 */
2086 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2094 //FIXME try pointer walks
/* horizontal pass, stage 1 on raw source pixels */
2095 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2096 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2097 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2098 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
/* horizontal pass, stages 2 and 3 */
2100 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2101 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2102 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2103 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2105 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2106 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2107 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2108 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass, stages 1 and 2 */
2112 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2113 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2114 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2115 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2117 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2118 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2119 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2120 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* vertical stage 3 folded into the absolute-sum */
2123 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2124 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2125 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2126 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2129 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD metric: forward-DCT the pixel difference and return the sum
 * of absolute transform coefficients (fdct call elided in listing). */
2134 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2135 MpegEncContext * const s= (MpegEncContext *)c;
2136 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2140 s->dsp.diff_pixels(temp, src1, src2, stride);
2142 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point 1-D H.264-style integer DCT (even/odd butterfly          \
 * decomposition); SRC/DST are redefined at each use site so the same   \
 * macro serves both row and column passes. */\
2147 const int s07 = SRC(0) + SRC(7);\
2148 const int s16 = SRC(1) + SRC(6);\
2149 const int s25 = SRC(2) + SRC(5);\
2150 const int s34 = SRC(3) + SRC(4);\
2151 const int a0 = s07 + s34;\
2152 const int a1 = s16 + s25;\
2153 const int a2 = s07 - s34;\
2154 const int a3 = s16 - s25;\
2155 const int d07 = SRC(0) - SRC(7);\
2156 const int d16 = SRC(1) - SRC(6);\
2157 const int d25 = SRC(2) - SRC(5);\
2158 const int d34 = SRC(3) - SRC(4);\
2159 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2160 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2161 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2162 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2164 DST(1, a4 + (a7>>2)) ;\
2165 DST(2, a2 + (a3>>1)) ;\
2166 DST(3, a5 + (a6>>2)) ;\
2168 DST(5, a6 - (a5>>2)) ;\
2169 DST(6, (a2>>1) - a3 ) ;\
2170 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: applies DCT8_1D to the pixel-difference rows,
 * then to the columns with DST redefined to accumulate |coefficient|.
 */
2173 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2174 MpegEncContext * const s= (MpegEncContext *)c;
2179 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform in place */
2181 #define SRC(x) dct[i][x]
2182 #define DST(x,v) dct[i][x]= v
2183 for( i = 0; i < 8; i++ )
/* column pass: fold |v| straight into sum instead of storing */
2188 #define SRC(x) dct[x][i]
2189 #define DST(x,v) sum += FFABS(v)
2190 for( i = 0; i < 8; i++ )
/* DCT-MAX metric: largest absolute DCT coefficient of the pixel
 * difference (fdct call and coefficient loop elided in listing). */
2198 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2199 MpegEncContext * const s= (MpegEncContext *)c;
2200 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2205 s->dsp.diff_pixels(temp, src1, src2, stride);
2209 sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantization-PSNR metric: DCT the difference, keep a copy, run it
 * through quantize + dequantize + IDCT-domain compare, and return the
 * squared error the quantizer introduced in the transform domain.
 */
2214 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2215 MpegEncContext * const s= (MpegEncContext *)c;
2216 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2217 DCTELEM * const bak = temp+64; /* pristine copy for the comparison */
2223 s->dsp.diff_pixels(temp, src1, src2, stride);
2225 memcpy(bak, temp, 64*sizeof(DCTELEM));
2227 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2228 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2229 ff_simple_idct(temp); //FIXME
2232 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion metric: actually encodes the block (DCT + quantize),
 * estimates the bit cost from the AC VLC length tables, reconstructs it
 * (dequantize + IDCT) and returns distortion + lambda-weighted rate.
 */
2237 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2238 MpegEncContext * const s= (MpegEncContext *)c;
2239 const uint8_t *scantable= s->intra_scantable.permutated;
2240 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2241 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2242 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2243 int i, last, run, bits, level, distortion, start_i;
2244 const int esc_length= s->ac_esc_length;
2246 uint8_t * last_length;
/* work on local aligned copies so reconstruction can overwrite lsrc2 */
2250 copy_block8(lsrc1, src1, 8, stride, 8);
2251 copy_block8(lsrc2, src2, 8, stride, 8);
2253 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2255 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra also pays for the luma DC code */
2261 length = s->intra_ac_vlc_length;
2262 last_length= s->intra_ac_vlc_last_length;
2263 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2266 length = s->inter_ac_vlc_length;
2267 last_length= s->inter_ac_vlc_last_length;
/* bit cost of all (run,level) pairs before the last coefficient */
2272 for(i=start_i; i<last; i++){
2273 int j= scantable[i];
2278 if((level&(~127)) == 0){
2279 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
2288 level= temp[i] + 64;
2292 if((level&(~127)) == 0){
2293 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure the real distortion */
2301 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2303 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2306 s->dsp.idct_add(lsrc2, 8, temp);
2308 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2310 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-count metric: like rd8x8_c but returns only the estimated VLC
 * bit cost of the quantized block — no reconstruction, no distortion.
 */
2313 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2314 MpegEncContext * const s= (MpegEncContext *)c;
2315 const uint8_t *scantable= s->intra_scantable.permutated;
2316 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2317 int i, last, run, bits, level, start_i;
2318 const int esc_length= s->ac_esc_length;
2320 uint8_t * last_length;
2324 s->dsp.diff_pixels(temp, src1, src2, stride);
2326 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra also pays for the luma DC code */
2332 length = s->intra_ac_vlc_length;
2333 last_length= s->intra_ac_vlc_last_length;
2334 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2337 length = s->inter_ac_vlc_length;
2338 last_length= s->inter_ac_vlc_last_length;
/* bit cost of all (run,level) pairs before the last coefficient */
2343 for(i=start_i; i<last; i++){
2344 int j= scantable[i];
2349 if((level&(~127)) == 0){
2350 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
2359 level= temp[i] + 64;
2363 if((level&(~127)) == 0){
2364 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2372 #define VSAD_INTRA(size) \
2373 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2377 for(y=1; y<h; y++){ \
2378 for(x=0; x<size; x+=4){ \
2379 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2380 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Sum of absolute vertical second differences between two 16-wide
 * blocks: |(s1 - s2) row-to-row change|, summed over h-1 row pairs. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))

/* Sum of squared differences between vertically adjacent rows of one
 * block (squared variant of VSAD_INTRA; dummy and c are unused). */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/* Sum of squared vertical second differences between two 16-wide
 * blocks (squared variant of vsad16_c). */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 * @param size number of elements compared
 * @return sum over i of (pix1[i]-pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
2448 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2449 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2450 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2452 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2454 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2455 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2456 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2457 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise product: dst[i] = src0[i] * src1[i] for i in [0,len). */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/* Multiply src0 by src1 read backwards: dst[i] = src0[i] * src1[len-1-i].
 * src1 is advanced to its last element first so that src1[-i] indexes it
 * in reverse; without that advance the visible loop would read before
 * the buffer. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/* Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-add windowing used by MDCT-based audio codecs.
 * dst points to the middle of a 2*len output region; src0 to the middle
 * of a 2*len input region (read backwards); win to the middle of a
 * 2*len window.  Writes 2*len samples: dst[-len..len-1].
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/* Multiply src by a scalar and by 2-element sub-vectors: one entry of
 * sv (a length-2 float vector) is consumed per pair of outputs.
 * len is assumed to be a multiple of 2. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/* Same as vector_fmul_sv_scalar_2_c but with 4-element sub-vectors;
 * len is assumed to be a multiple of 4. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/* Scale 2-element sub-vectors by a scalar into a contiguous output;
 * one sv entry per pair of outputs; len a multiple of 2. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/* Scale 4-element sub-vectors by a scalar into a contiguous output;
 * one sv entry per 4 outputs; len a multiple of 4. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/* In-place butterfly: v1[i] becomes the sum v1[i]+v2[i], v2[i] becomes
 * the difference v1[i]-v2[i].  restrict promises no aliasing. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/* Dot product of two float vectors of length len. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/* Clip one float, handled as its raw IEEE-754 bit pattern, against
 * precomputed min/max bit patterns (mini is the bits of a negative
 * bound, so as an unsigned compare "a > mini" means "more negative
 * than min"; the sign-bit XOR makes positive values comparable). */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Fast clip path for the case min < 0 < max: compares the floats as
 * their raw bit patterns via clipf_c_one.  len must be a multiple
 * of 8.  NOTE(review): the pointer casts type-pun float<->uint32_t as
 * upstream does — technically a strict-aliasing violation; kept as-is. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clamp each float in src[0..len-1] to [min, max]; len must be a
 * multiple of 8.  When the range straddles zero, dispatch to the
 * bit-pattern fast path; otherwise clip with av_clipf, 8 per
 * iteration (manually unrolled, as upstream). */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/* Dot product of two int16 vectors; each partial product is shifted
 * right by 'shift' before accumulation. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/* Dot product of v1 and v2, while simultaneously adding mul*v3 into v1
 * (the product uses the OLD v1 values). Returns the dot product. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/* Apply a symmetric int16 window to len samples: window[i] multiplies
 * both sample i and sample len-1-i, with 0.15 fixed-point rounding.
 * Only len/2 window coefficients are read (the window is mirrored). */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
/* Clamp each int32 in src[0..len-1] to [min, max]; len must be a
 * non-zero multiple of 8 (do/while processes 8 per iteration). */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
/* Fixed-point cosine constants for the WMV2 IDCT below; W0 (2048 =
 * 2048*sqrt(2)*cos(0)/sqrt(2) scale term) is referenced by both
 * wmv2_idct_row and wmv2_idct_col and was dropped by the extraction. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2672 static void wmv2_idct_row(short * b)
2675 int a0,a1,a2,a3,a4,a5,a6,a7;
2677 a1 = W1*b[1]+W7*b[7];
2678 a7 = W7*b[1]-W1*b[7];
2679 a5 = W5*b[5]+W3*b[3];
2680 a3 = W3*b[5]-W5*b[3];
2681 a2 = W2*b[2]+W6*b[6];
2682 a6 = W6*b[2]-W2*b[6];
2683 a0 = W0*b[0]+W0*b[4];
2684 a4 = W0*b[0]-W0*b[4];
2686 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2687 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2689 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2690 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2691 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2692 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2693 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2694 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2695 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2696 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2698 static void wmv2_idct_col(short * b)
2701 int a0,a1,a2,a3,a4,a5,a6,a7;
2702 /*step 1, with extended precision*/
2703 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2704 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2705 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2706 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2707 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2708 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2709 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2710 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2712 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2713 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2715 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2716 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2717 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2718 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2720 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2721 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2722 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2723 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: rows first, then columns, in-place on block. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted to add/put functions */
2737 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2739 ff_wmv2_idct_c(block);
2740 ff_put_pixels_clamped_c(block, dest, line_size);
2742 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2744 ff_wmv2_idct_c(block);
2745 ff_add_pixels_clamped_c(block, dest, line_size);
2747 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2750 ff_put_pixels_clamped_c(block, dest, line_size);
2752 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2755 ff_add_pixels_clamped_c(block, dest, line_size);
2758 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2761 put_pixels_clamped4_c(block, dest, line_size);
2763 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2766 add_pixels_clamped4_c(block, dest, line_size);
2769 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2772 put_pixels_clamped2_c(block, dest, line_size);
2774 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2777 add_pixels_clamped2_c(block, dest, line_size);
2780 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2782 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2784 dest[0] = cm[(block[0] + 4)>>3];
2786 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2788 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2790 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2793 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2795 /* init static data */
2796 av_cold void dsputil_static_init(void)
2800 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2801 for(i=0;i<MAX_NEG_CROP;i++) {
2803 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2806 for(i=0;i<512;i++) {
2807 ff_squareTbl[i] = (i - 256) * (i - 256);
2810 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2813 int ff_check_alignment(void){
2814 static int did_fail=0;
2815 LOCAL_ALIGNED_16(int, aligned);
2817 if((intptr_t)&aligned & 15){
2819 #if HAVE_MMX || HAVE_ALTIVEC
2820 av_log(NULL, AV_LOG_ERROR,
2821 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2822 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2823 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2824 "Do not report crashes to Libav developers.\n");
2833 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2837 ff_check_alignment();
2840 if(avctx->dct_algo==FF_DCT_FASTINT) {
2841 c->fdct = fdct_ifast;
2842 c->fdct248 = fdct_ifast248;
2844 else if(avctx->dct_algo==FF_DCT_FAAN) {
2845 c->fdct = ff_faandct;
2846 c->fdct248 = ff_faandct248;
2849 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2850 c->fdct248 = ff_fdct248_islow;
2852 #endif //CONFIG_ENCODERS
2854 if(avctx->lowres==1){
2855 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2856 c->idct_put= ff_jref_idct4_put;
2857 c->idct_add= ff_jref_idct4_add;
2859 if (avctx->codec_id != CODEC_ID_H264) {
2860 c->idct_put= ff_h264_lowres_idct_put_8_c;
2861 c->idct_add= ff_h264_lowres_idct_add_8_c;
2863 switch (avctx->bits_per_raw_sample) {
2865 c->idct_put= ff_h264_lowres_idct_put_9_c;
2866 c->idct_add= ff_h264_lowres_idct_add_9_c;
2869 c->idct_put= ff_h264_lowres_idct_put_10_c;
2870 c->idct_add= ff_h264_lowres_idct_add_10_c;
2873 c->idct_put= ff_h264_lowres_idct_put_8_c;
2874 c->idct_add= ff_h264_lowres_idct_add_8_c;
2878 c->idct = j_rev_dct4;
2879 c->idct_permutation_type= FF_NO_IDCT_PERM;
2880 }else if(avctx->lowres==2){
2881 c->idct_put= ff_jref_idct2_put;
2882 c->idct_add= ff_jref_idct2_add;
2883 c->idct = j_rev_dct2;
2884 c->idct_permutation_type= FF_NO_IDCT_PERM;
2885 }else if(avctx->lowres==3){
2886 c->idct_put= ff_jref_idct1_put;
2887 c->idct_add= ff_jref_idct1_add;
2888 c->idct = j_rev_dct1;
2889 c->idct_permutation_type= FF_NO_IDCT_PERM;
2891 if(avctx->idct_algo==FF_IDCT_INT){
2892 c->idct_put= ff_jref_idct_put;
2893 c->idct_add= ff_jref_idct_add;
2894 c->idct = j_rev_dct;
2895 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2896 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2897 avctx->idct_algo==FF_IDCT_VP3){
2898 c->idct_put= ff_vp3_idct_put_c;
2899 c->idct_add= ff_vp3_idct_add_c;
2900 c->idct = ff_vp3_idct_c;
2901 c->idct_permutation_type= FF_NO_IDCT_PERM;
2902 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2903 c->idct_put= ff_wmv2_idct_put_c;
2904 c->idct_add= ff_wmv2_idct_add_c;
2905 c->idct = ff_wmv2_idct_c;
2906 c->idct_permutation_type= FF_NO_IDCT_PERM;
2907 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2908 c->idct_put= ff_faanidct_put;
2909 c->idct_add= ff_faanidct_add;
2910 c->idct = ff_faanidct;
2911 c->idct_permutation_type= FF_NO_IDCT_PERM;
2912 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2913 c->idct_put= ff_ea_idct_put_c;
2914 c->idct_permutation_type= FF_NO_IDCT_PERM;
2915 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2916 c->idct = ff_bink_idct_c;
2917 c->idct_add = ff_bink_idct_add_c;
2918 c->idct_put = ff_bink_idct_put_c;
2919 c->idct_permutation_type = FF_NO_IDCT_PERM;
2920 }else{ //accurate/default
2921 c->idct_put= ff_simple_idct_put;
2922 c->idct_add= ff_simple_idct_add;
2923 c->idct = ff_simple_idct;
2924 c->idct_permutation_type= FF_NO_IDCT_PERM;
2928 c->get_pixels = get_pixels_c;
2929 c->diff_pixels = diff_pixels_c;
2930 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2931 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2932 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2933 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2934 c->sum_abs_dctelem = sum_abs_dctelem_c;
2937 c->pix_sum = pix_sum_c;
2938 c->pix_norm1 = pix_norm1_c;
2940 c->fill_block_tab[0] = fill_block16_c;
2941 c->fill_block_tab[1] = fill_block8_c;
2942 c->scale_block = scale_block_c;
2944 /* TODO [0] 16 [1] 8 */
2945 c->pix_abs[0][0] = pix_abs16_c;
2946 c->pix_abs[0][1] = pix_abs16_x2_c;
2947 c->pix_abs[0][2] = pix_abs16_y2_c;
2948 c->pix_abs[0][3] = pix_abs16_xy2_c;
2949 c->pix_abs[1][0] = pix_abs8_c;
2950 c->pix_abs[1][1] = pix_abs8_x2_c;
2951 c->pix_abs[1][2] = pix_abs8_y2_c;
2952 c->pix_abs[1][3] = pix_abs8_xy2_c;
2954 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2964 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2974 #define dspfunc(PFX, IDX, NUM) \
2975 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2992 dspfunc(put_qpel, 0, 16);
2993 dspfunc(put_no_rnd_qpel, 0, 16);
2995 dspfunc(avg_qpel, 0, 16);
2996 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2998 dspfunc(put_qpel, 1, 8);
2999 dspfunc(put_no_rnd_qpel, 1, 8);
3001 dspfunc(avg_qpel, 1, 8);
3002 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3006 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3007 ff_mlp_init(c, avctx);
3009 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3010 ff_intrax8dsp_init(c,avctx);
3012 #if CONFIG_RV30_DECODER
3013 ff_rv30dsp_init(c,avctx);
3015 #if CONFIG_RV40_DECODER
3016 ff_rv40dsp_init(c,avctx);
3017 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3018 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3019 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3020 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3023 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3024 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3025 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3026 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3027 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3028 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3029 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3030 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3032 #define SET_CMP_FUNC(name) \
3033 c->name[0]= name ## 16_c;\
3034 c->name[1]= name ## 8x8_c;
3036 SET_CMP_FUNC(hadamard8_diff)
3037 c->hadamard8_diff[4]= hadamard8_intra16_c;
3038 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3039 SET_CMP_FUNC(dct_sad)
3040 SET_CMP_FUNC(dct_max)
3042 SET_CMP_FUNC(dct264_sad)
3044 c->sad[0]= pix_abs16_c;
3045 c->sad[1]= pix_abs8_c;
3049 SET_CMP_FUNC(quant_psnr)
3052 c->vsad[0]= vsad16_c;
3053 c->vsad[4]= vsad_intra16_c;
3054 c->vsad[5]= vsad_intra8_c;
3055 c->vsse[0]= vsse16_c;
3056 c->vsse[4]= vsse_intra16_c;
3057 c->vsse[5]= vsse_intra8_c;
3058 c->nsse[0]= nsse16_c;
3059 c->nsse[1]= nsse8_c;
3061 ff_dsputil_init_dwt(c);
3064 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3066 c->add_bytes= add_bytes_c;
3067 c->add_bytes_l2= add_bytes_l2_c;
3068 c->diff_bytes= diff_bytes_c;
3069 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3070 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3071 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3072 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3073 c->bswap_buf= bswap_buf;
3074 c->bswap16_buf = bswap16_buf;
3075 #if CONFIG_PNG_DECODER
3076 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3079 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3080 c->h263_h_loop_filter= h263_h_loop_filter_c;
3081 c->h263_v_loop_filter= h263_v_loop_filter_c;
3084 if (CONFIG_VP3_DECODER) {
3085 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3086 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3087 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3090 c->h261_loop_filter= h261_loop_filter_c;
3092 c->try_8x8basis= try_8x8basis_c;
3093 c->add_8x8basis= add_8x8basis_c;
3095 #if CONFIG_VORBIS_DECODER
3096 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3098 #if CONFIG_AC3_DECODER
3099 c->ac3_downmix = ff_ac3_downmix_c;
3101 c->vector_fmul = vector_fmul_c;
3102 c->vector_fmul_reverse = vector_fmul_reverse_c;
3103 c->vector_fmul_add = vector_fmul_add_c;
3104 c->vector_fmul_window = vector_fmul_window_c;
3105 c->vector_clipf = vector_clipf_c;
3106 c->scalarproduct_int16 = scalarproduct_int16_c;
3107 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3108 c->apply_window_int16 = apply_window_int16_c;
3109 c->vector_clip_int32 = vector_clip_int32_c;
3110 c->scalarproduct_float = scalarproduct_float_c;
3111 c->butterflies_float = butterflies_float_c;
3112 c->vector_fmul_scalar = vector_fmul_scalar_c;
3114 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3115 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3117 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3118 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3120 c->shrink[0]= av_image_copy_plane;
3121 c->shrink[1]= ff_shrink22;
3122 c->shrink[2]= ff_shrink44;
3123 c->shrink[3]= ff_shrink88;
3125 c->prefetch= just_return;
3127 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3128 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3132 #define FUNC(f, depth) f ## _ ## depth
3133 #define FUNCC(f, depth) f ## _ ## depth ## _c
3135 #define dspfunc1(PFX, IDX, NUM, depth)\
3136 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3137 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3138 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3139 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3141 #define dspfunc2(PFX, IDX, NUM, depth)\
3142 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3143 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3144 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3145 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3146 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3147 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3148 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3149 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3150 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3151 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3152 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3153 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3154 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3155 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3156 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3157 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3160 #define BIT_DEPTH_FUNCS(depth)\
3161 c->draw_edges = FUNCC(draw_edges , depth);\
3162 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3163 c->clear_block = FUNCC(clear_block , depth);\
3164 c->clear_blocks = FUNCC(clear_blocks , depth);\
3165 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3166 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3167 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3168 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3170 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3171 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3172 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3173 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3174 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3175 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3177 dspfunc1(put , 0, 16, depth);\
3178 dspfunc1(put , 1, 8, depth);\
3179 dspfunc1(put , 2, 4, depth);\
3180 dspfunc1(put , 3, 2, depth);\
3181 dspfunc1(put_no_rnd, 0, 16, depth);\
3182 dspfunc1(put_no_rnd, 1, 8, depth);\
3183 dspfunc1(avg , 0, 16, depth);\
3184 dspfunc1(avg , 1, 8, depth);\
3185 dspfunc1(avg , 2, 4, depth);\
3186 dspfunc1(avg , 3, 2, depth);\
3187 dspfunc1(avg_no_rnd, 0, 16, depth);\
3188 dspfunc1(avg_no_rnd, 1, 8, depth);\
3190 dspfunc2(put_h264_qpel, 0, 16, depth);\
3191 dspfunc2(put_h264_qpel, 1, 8, depth);\
3192 dspfunc2(put_h264_qpel, 2, 4, depth);\
3193 dspfunc2(put_h264_qpel, 3, 2, depth);\
3194 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3195 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3196 dspfunc2(avg_h264_qpel, 2, 4, depth);
3198 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3201 switch (avctx->bits_per_raw_sample) {
3209 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3216 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3217 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3218 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3219 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3220 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3221 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3222 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3223 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3224 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3226 for(i=0; i<64; i++){
3227 if(!c->put_2tap_qpel_pixels_tab[0][i])
3228 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3229 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3230 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3233 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3234 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3235 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3236 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3238 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3239 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3240 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3241 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3243 switch(c->idct_permutation_type){
3244 case FF_NO_IDCT_PERM:
3246 c->idct_permutation[i]= i;
3248 case FF_LIBMPEG2_IDCT_PERM:
3250 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3252 case FF_SIMPLE_IDCT_PERM:
3254 c->idct_permutation[i]= simple_mmx_permutation[i];
3256 case FF_TRANSPOSE_IDCT_PERM:
3258 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3260 case FF_PARTTRANS_IDCT_PERM:
3262 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3264 case FF_SSE2_IDCT_PERM:
3266 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3269 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");