libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
# more or less self-invented filters, so exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once);
        the per-block if/else handling is slowing things down
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include "libavutil/intreadwrite.h"
  80 #include <inttypes.h>
  81 #include <stdio.h>
  82 #include <stdlib.h>
  83 #include <string.h>
  84 //#undef HAVE_MMXEXT_INLINE
  85 //#define HAVE_AMD3DNOW_INLINE
  86 //#undef HAVE_MMX_INLINE
  87 //#undef ARCH_X86
  88 //#define DEBUG_BRIGHTNESS
  89 #include "postprocess.h"
  90 #include "postprocess_internal.h"
  91 #include "libavutil/avstring.h"
  92
  93 #include "libavutil/ffversion.h"
/* Version banner stored in the library: the text "FFmpeg version "
 * immediately followed by the FFMPEG_VERSION string. */
const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
  95
/**
 * Return the library version number.
 *
 * @return the LIBPOSTPROC_VERSION_INT constant
 */
unsigned postproc_version(void)
{
    /* sanity check on the version scheme: the micro component must be >= 100 */
    av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
    return LIBPOSTPROC_VERSION_INT;
}
 101
/**
 * Return the build configuration string.
 *
 * @return the FFMPEG_CONFIGURATION string constant
 */
const char *postproc_configuration(void)
{
    return FFMPEG_CONFIGURATION;
}
 106
/**
 * Return the license string of the library.
 *
 * @return pointer to the FFMPEG_LICENSE text, i.e. to the character that
 *         follows LICENSE_PREFIX inside the combined string literal
 */
const char *postproc_license(void)
{
#define LICENSE_PREFIX "libpostproc license: "
    /* The two adjacent literals are concatenated into a single string first;
     * the pointer is then advanced past the prefix (sizeof includes the
     * terminating NUL, hence the -1). The prefix therefore stays in the
     * stored string although callers never see it. */
    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
}
 112
 113 #if HAVE_ALTIVEC_H
 114 #include <altivec.h>
 115 #endif
 116
/* size of the scratch buffer the mode string is copied into while parsing */
#define GET_MODE_BUFFER_SIZE 500
/* capacity of the per-filter array collecting unrecognized option strings */
#define OPTIONS_ARRAY_SIZE 10
/* edge length of the square pixel blocks the filters operate on */
#define BLOCK_SIZE 8
/* not referenced in this part of the file; presumably a row stride for
 * temporary block buffers in the template code -- TODO confirm */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 122
/* 64-bit constants built from one repeated value: wNN repeats the 16-bit
 * word 0x00NN four times, bNN repeats the byte 0xNN eight times.
 * None of them is referenced in this part of the file; presumably they are
 * operands for the x86 inline-asm template code -- TODO confirm. */
#if ARCH_X86 && HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* declared for every architecture; not referenced in this part of the file,
 * presumably the threshold used by the deringing filter -- TODO confirm */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 135
 136
/* Table of all selectable sub-filters, terminated by an entry whose names
 * are NULL. The mode parser matches a requested name against both name
 * strings of each entry.
 * NOTE(review): the column order is presumably
 *   short name, long name, chromDefault, minLumQuality, minChromQuality, mask
 * -- confirm against the declaration of struct PPFilter. */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {"vi", "visualize",             1, 0, 0, VISUALIZE},
    {NULL, NULL,0,0,0,0} //End Marker
};
 161
/* Alias table laid out as consecutive pairs: the entry at an even index is
 * an alias name, the entry following it is the filter string the alias is
 * replaced with. A NULL at an even index ends the table. */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
 171
 172 /* The horizontal functions exist only in C because the MMX
 173  * code is faster with vertical filters and transposing. */
 174
 175 /**
 176  * Check if the given 8x8 Block is mostly "flat"
 177  */
 178 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 179 {
 180     int numEq= 0;
 181     int y;
 182     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 183     const int dcThreshold= dcOffset*2 + 1;
 184
 185     for(y=0; y<BLOCK_SIZE; y++){
 186         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
 187         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
 188         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
 189         numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
 190         numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
 191         numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
 192         numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
 193         src+= stride;
 194     }
 195     return numEq > c->ppMode.flatnessThreshold;
 196 }
 197
 198 /**
 199  * Check if the middle 8x8 Block in the given 8x16 block is flat
 200  */
 201 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 202 {
 203     int numEq= 0;
 204     int y;
 205     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 206     const int dcThreshold= dcOffset*2 + 1;
 207
 208     src+= stride*4; // src points to begin of the 8x8 Block
 209     for(y=0; y<BLOCK_SIZE-1; y++){
 210         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
 211         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
 212         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
 213         numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
 214         numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
 215         numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
 216         numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
 217         numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
 218         src+= stride;
 219     }
 220     return numEq > c->ppMode.flatnessThreshold;
 221 }
 222
 223 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 224 {
 225     int i;
 226     for(i=0; i<2; i++){
 227         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 228         src += stride;
 229         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 230         src += stride;
 231         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 232         src += stride;
 233         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 234         src += stride;
 235     }
 236     return 1;
 237 }
 238
 239 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 240 {
 241     int x;
 242     src+= stride*4;
 243     for(x=0; x<BLOCK_SIZE; x+=4){
 244         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 245         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 246         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 247         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 248     }
 249     return 1;
 250 }
 251
 252 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 253 {
 254     if( isHorizDC_C(src, stride, c) ){
 255         return isHorizMinMaxOk_C(src, stride, c->QP);
 256     }else{
 257         return 2;
 258     }
 259 }
 260
 261 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 262 {
 263     if( isVertDC_C(src, stride, c) ){
 264         return isVertMinMaxOk_C(src, stride, c->QP);
 265     }else{
 266         return 2;
 267     }
 268 }
 269
 270 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 271 {
 272     int y;
 273     for(y=0; y<BLOCK_SIZE; y++){
 274         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 275
 276         if(FFABS(middleEnergy) < 8*c->QP){
 277             const int q=(dst[3] - dst[4])/2;
 278             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 279             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 280
 281             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 282             d= FFMAX(d, 0);
 283
 284             d= (5*d + 32) >> 6;
 285             d*= FFSIGN(-middleEnergy);
 286
 287             if(q>0)
 288             {
 289                 d = FFMAX(d, 0);
 290                 d = FFMIN(d, q);
 291             }
 292             else
 293             {
 294                 d = FFMIN(d, 0);
 295                 d = FFMAX(d, q);
 296             }
 297
 298             dst[3]-= d;
 299             dst[4]+= d;
 300         }
 301         dst+= stride;
 302     }
 303 }
 304
 305 /**
 306  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 307  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 308  */
 309 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 310 {
 311     int y;
 312     for(y=0; y<BLOCK_SIZE; y++){
 313         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 314         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 315
 316         int sums[10];
 317         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 318         sums[1] = sums[0] - first  + dst[3];
 319         sums[2] = sums[1] - first  + dst[4];
 320         sums[3] = sums[2] - first  + dst[5];
 321         sums[4] = sums[3] - first  + dst[6];
 322         sums[5] = sums[4] - dst[0] + dst[7];
 323         sums[6] = sums[5] - dst[1] + last;
 324         sums[7] = sums[6] - dst[2] + last;
 325         sums[8] = sums[7] - dst[3] + last;
 326         sums[9] = sums[8] - dst[4] + last;
 327
 328         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 329         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 330         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 331         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 332         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 333         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 334         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 335         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 336
 337         dst+= stride;
 338     }
 339 }
 340
 341 /**
 342  * Experimental Filter 1 (Horizontal)
 343  * will not damage linear gradients
 344  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 345  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 346  * MMX2 version does correct clipping C version does not
 347  * not identical with the vertical one
 348  */
 349 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 350 {
 351     int y;
 352     static uint64_t lut[256];
 353     if(!lut[255])
 354     {
 355         int i;
 356         for(i=0; i<256; i++)
 357         {
 358             int v= i < 128 ? 2*i : 2*(i-256);
 359 /*
 360 //Simulate 112242211 9-Tap filter
 361             uint64_t a= (v/16)  & 0xFF;
 362             uint64_t b= (v/8)   & 0xFF;
 363             uint64_t c= (v/4)   & 0xFF;
 364             uint64_t d= (3*v/8) & 0xFF;
 365 */
 366 //Simulate piecewise linear interpolation
 367             uint64_t a= (v/16)   & 0xFF;
 368             uint64_t b= (v*3/16) & 0xFF;
 369             uint64_t c= (v*5/16) & 0xFF;
 370             uint64_t d= (7*v/16) & 0xFF;
 371             uint64_t A= (0x100 - a)&0xFF;
 372             uint64_t B= (0x100 - b)&0xFF;
 373             uint64_t C= (0x100 - c)&0xFF;
 374             uint64_t D= (0x100 - c)&0xFF;
 375
 376             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 377                        (D<<24) | (C<<16) | (B<<8)  | (A);
 378             //lut[i] = (v<<32) | (v<<24);
 379         }
 380     }
 381
 382     for(y=0; y<BLOCK_SIZE; y++){
 383         int a= src[1] - src[2];
 384         int b= src[3] - src[4];
 385         int c= src[5] - src[6];
 386
 387         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 388
 389         if(d < QP){
 390             int v = d * FFSIGN(-b);
 391
 392             src[1] +=v/8;
 393             src[2] +=v/4;
 394             src[3] +=3*v/8;
 395             src[4] -=3*v/8;
 396             src[5] -=v/4;
 397             src[6] -=v/8;
 398         }
 399         src+=stride;
 400     }
 401 }
 402
/**
 * accurate deblock filter
 *
 * Processes 8 lines; consecutive lines are `stride` apart and the samples
 * within a line are `step` apart. After src has been advanced by 4*step,
 * the samples src[-1*step] .. src[8*step] of each line are examined:
 *  - the 9 pairs of neighbouring samples whose difference lies within
 *    +-dcOffset are counted;
 *  - if that count exceeds c->ppMode.flatnessThreshold, the line is treated
 *    as flat: when max-min of the 8 samples src[0] .. src[7*step] is below
 *    2*QP they are low-pass filtered with the same running-sum scheme as
 *    doHorizLowPass_C(), otherwise the line is left unchanged;
 *  - otherwise the default filter is applied, which can only modify the two
 *    middle samples src[3*step] and src[4*step].
 *
 * @param src    start of the area, 4*step before the first filtered sample;
 *               modified in place
 * @param step   distance between consecutive samples of one line
 * @param stride distance between consecutive lines
 * @param c      context providing QP, nonBQP and ppMode
 * @param mode   filter mode bits; only VISUALIZE is tested here
 */
static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
                                            int stride, const PPContext *c, int mode)
{
    int y;
    const int QP= c->QP;
    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    const int dcThreshold= dcOffset*2 + 1;
//START_TIMER
    src+= step*4; // src points to begin of the 8x8 Block
    for(y=0; y<8; y++){
        int numEq= 0;

        // count neighbouring pairs that differ by at most dcOffset; the
        // unsigned conversion folds the two-sided test into one comparison
        numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
        if(numEq > c->ppMode.flatnessThreshold){
            int min, max, x;

            // min/max of the 8 samples, found by first ordering each pair
            // (x, x+1) so that only one comparison per bound is needed
            if(src[0] > src[step]){
                max= src[0];
                min= src[step];
            }else{
                max= src[step];
                min= src[0];
            }
            for(x=2; x<8; x+=2){
                if(src[x*step] > src[(x+1)*step]){
                        if(src[x    *step] > max) max= src[ x   *step];
                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
                }else{
                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
                        if(src[ x   *step] < min) min= src[ x   *step];
                }
            }
            if(max-min < 2*QP){
                // an outer neighbour takes part only if it is within QP of
                // the adjacent edge sample; otherwise the edge sample is reused
                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];

                // running window sums, all computed before any sample is written
                int sums[10];
                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
                sums[1] = sums[0] - first       + src[3*step];
                sums[2] = sums[1] - first       + src[4*step];
                sums[3] = sums[2] - first       + src[5*step];
                sums[4] = sums[3] - first       + src[6*step];
                sums[5] = sums[4] - src[0*step] + src[7*step];
                sums[6] = sums[5] - src[1*step] + last;
                sums[7] = sums[6] - src[2*step] + last;
                sums[8] = sums[7] - src[3*step] + last;
                sums[9] = sums[8] - src[4*step] + last;

                // in VISUALIZE mode the samples are set to 128 after the sums
                // were taken, so the 2*src[...] terms below contribute a
                // constant instead of the original sample value
                if (mode & VISUALIZE) {
                    src[0*step] =
                    src[1*step] =
                    src[2*step] =
                    src[3*step] =
                    src[4*step] =
                    src[5*step] =
                    src[6*step] =
                    src[7*step] = 128;
                }
                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
            }
        }else{
            // default filter: same computation as doHorizDefFilter_C(), with
            // samples addressed through step
            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);

            if(FFABS(middleEnergy) < 8*QP){
                const int q=(src[3*step] - src[4*step])/2;
                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);

                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);

                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);

                // limit the correction to the range between 0 and q
                if(q>0){
                    d = FFMAX(d, 0);
                    d = FFMIN(d, q);
                }else{
                    d = FFMIN(d, 0);
                    d = FFMAX(d, q);
                }

                // VISUALIZE: apply a fixed, clipped change of magnitude 32
                // instead of the computed one, then zero d so that the
                // regular update below has no further effect
                if ((mode & VISUALIZE) && d) {
                    d= (d < 0) ? 32 : -32;
                    src[3*step]= av_clip_uint8(src[3*step] - d);
                    src[4*step]= av_clip_uint8(src[4*step] + d);
                    d = 0;
                }

                src[3*step]-= d;
                src[4*step]+= d;
            }
        }

        src += stride;
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}*/
}
 523
//Note: postprocess_template.c is included once per variant, selected by the
//TEMPLATE_PP_* macro defined right before the include. Variants instantiated
//here: C, AltiVec, MMX, MMX2 (MMXEXT), 3DNow and SSE2; there is no
//3DNOW+MMX2 one.
//NOTE(review): the macros are never undefined in this file; presumably the
//template resets them itself -- confirm in postprocess_template.c
//Plain C versions
//we always compile C for testing which needs bitexactness
#define TEMPLATE_PP_C 1
#include "postprocess_template.c"

#if HAVE_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 1
#   include "postprocess_altivec_template.c"
#   include "postprocess_template.c"
#endif

#if ARCH_X86 && HAVE_INLINE_ASM
//with runtime CPU detection every x86 variant is compiled; without it only
//the first available one in the order SSE2, MMXEXT, 3DNow, MMX
#    if CONFIG_RUNTIME_CPUDETECT
#        define TEMPLATE_PP_MMX 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_MMXEXT 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_3DNOW 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_SSE2 1
#        include "postprocess_template.c"
#    else
#        if HAVE_SSE2_INLINE
#            define TEMPLATE_PP_SSE2 1
#            include "postprocess_template.c"
#        elif HAVE_MMXEXT_INLINE
#            define TEMPLATE_PP_MMXEXT 1
#            include "postprocess_template.c"
#        elif HAVE_AMD3DNOW_INLINE
#            define TEMPLATE_PP_3DNOW 1
#            include "postprocess_template.c"
#        elif HAVE_MMX_INLINE
#            define TEMPLATE_PP_MMX 1
#            include "postprocess_template.c"
#        endif
#    endif
#endif
 562
 563 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 564                       const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 565
 566 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 567         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 568 {
 569     pp_fn pp = postProcess_C;
 570     PPContext *c= (PPContext *)vc;
 571     PPMode *ppMode= (PPMode *)vm;
 572     c->ppMode= *ppMode; //FIXME
 573
 574     if (!(ppMode->lumMode & BITEXACT)) {
 575 #if CONFIG_RUNTIME_CPUDETECT
 576 #if ARCH_X86 && HAVE_INLINE_ASM
 577         // ordered per speed fastest first
 578         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 579         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 580         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 581         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 582 #elif HAVE_ALTIVEC
 583         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 584 #endif
 585 #else /* CONFIG_RUNTIME_CPUDETECT */
 586 #if     HAVE_SSE2_INLINE
 587         pp = postProcess_SSE2;
 588 #elif   HAVE_MMXEXT_INLINE
 589         pp = postProcess_MMX2;
 590 #elif HAVE_AMD3DNOW_INLINE
 591         pp = postProcess_3DNow;
 592 #elif HAVE_MMX_INLINE
 593         pp = postProcess_MMX;
 594 #elif HAVE_ALTIVEC
 595         pp = postProcess_altivec;
 596 #endif
 597 #endif /* !CONFIG_RUNTIME_CPUDETECT */
 598     }
 599
 600     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 601 }
 602
 603 /* -pp Command line Help
 604 */
 605 const char pp_help[] =
 606 "Available postprocessing filters:\n"
 607 "Filters                        Options\n"
 608 "short  long name       short   long option     Description\n"
 609 "*      *               a       autoq           CPU power dependent enabler\n"
 610 "                       c       chrom           chrominance filtering enabled\n"
 611 "                       y       nochrom         chrominance filtering disabled\n"
 612 "                       n       noluma          luma filtering disabled\n"
 613 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 614 "       1. difference factor: default=32, higher -> more deblocking\n"
 615 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 616 "                       the h & v deblocking filters share these\n"
 617 "                       so you can't set different thresholds for h / v\n"
 618 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 619 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 620 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 621 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 622 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 623 "dr     dering                                  deringing filter\n"
 624 "al     autolevels                              automatic brightness / contrast\n"
 625 "                       f        fullyrange     stretch luminance to (0..255)\n"
 626 "lb     linblenddeint                           linear blend deinterlacer\n"
 627 "li     linipoldeint                            linear interpolating deinterlace\n"
 628 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 629 "md     mediandeint                             median deinterlacer\n"
 630 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 631 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 632 "de     default                                 hb:a,vb:a,dr:a\n"
 633 "fa     fast                                    h1:a,v1:a,dr:a\n"
 634 "ac                                             ha:a:128:7,va:a,dr:a\n"
 635 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 636 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 637 "fq     forceQuant      <quantizer>             force quantizer\n"
 638 "Usage:\n"
 639 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 640 "long form example:\n"
 641 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 642 "short form example:\n"
 643 "vb:a/hb:a/lb                                   de,-vb\n"
 644 "more examples:\n"
 645 "tn:64:128:256\n"
 646 "\n"
 647 ;
 648
 649 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 650 {
 651     char temp[GET_MODE_BUFFER_SIZE];
 652     char *p= temp;
 653     static const char filterDelimiters[] = ",/";
 654     static const char optionDelimiters[] = ":|";
 655     struct PPMode *ppMode;
 656     char *filterToken;
 657
 658     if (!name)  {
 659         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 660         return NULL;
 661     }
 662
 663     if (!strcmp(name, "help")) {
 664         const char *p;
 665         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 666             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 667             av_log(NULL, AV_LOG_INFO, "%s", temp);
 668         }
 669         return NULL;
 670     }
 671
 672     ppMode= av_malloc(sizeof(PPMode));
 673     if (!ppMode)
 674         return NULL;
 675
 676     ppMode->lumMode= 0;
 677     ppMode->chromMode= 0;
 678     ppMode->maxTmpNoise[0]= 700;
 679     ppMode->maxTmpNoise[1]= 1500;
 680     ppMode->maxTmpNoise[2]= 3000;
 681     ppMode->maxAllowedY= 234;
 682     ppMode->minAllowedY= 16;
 683     ppMode->baseDcDiff= 256/8;
 684     ppMode->flatnessThreshold= 56-16-1;
 685     ppMode->maxClippedThreshold= (AVRational){1,100};
 686     ppMode->error=0;
 687
 688     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 689     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 690
 691     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 692
 693     for(;;){
 694         const char *filterName;
 695         int q= 1000000; //PP_QUALITY_MAX;
 696         int chrom=-1;
 697         int luma=-1;
 698         const char *option;
 699         const char *options[OPTIONS_ARRAY_SIZE];
 700         int i;
 701         int filterNameOk=0;
 702         int numOfUnknownOptions=0;
 703         int enable=1; //does the user want us to enabled or disabled the filter
 704         char *tokstate;
 705
 706         filterToken= av_strtok(p, filterDelimiters, &tokstate);
 707         if(!filterToken) break;
 708         p+= strlen(filterToken) + 1; // p points to next filterToken
 709         filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
 710         if (!filterName) {
 711             ppMode->error++;
 712             break;
 713         }
 714         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 715
 716         if(*filterName == '-'){
 717             enable=0;
 718             filterName++;
 719         }
 720
 721         for(;;){ //for all options
 722             option= av_strtok(NULL, optionDelimiters, &tokstate);
 723             if(!option) break;
 724
 725             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 726             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 727             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 728             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 729             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 730             else{
 731                 options[numOfUnknownOptions] = option;
 732                 numOfUnknownOptions++;
 733             }
 734             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 735         }
 736         options[numOfUnknownOptions] = NULL;
 737
 738         /* replace stuff from the replace Table */
 739         for(i=0; replaceTable[2*i]; i++){
 740             if(!strcmp(replaceTable[2*i], filterName)){
 741                 size_t newlen = strlen(replaceTable[2*i + 1]);
 742                 int plen;
 743                 int spaceLeft;
 744
 745                 p--, *p=',';
 746
 747                 plen= strlen(p);
 748                 spaceLeft= p - temp + plen;
 749                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 750                     ppMode->error++;
 751                     break;
 752                 }
 753                 memmove(p + newlen, p, plen+1);
 754                 memcpy(p, replaceTable[2*i + 1], newlen);
 755                 filterNameOk=1;
 756             }
 757         }
 758
 759         for(i=0; filters[i].shortName; i++){
 760             if(   !strcmp(filters[i].longName, filterName)
 761                || !strcmp(filters[i].shortName, filterName)){
 762                 ppMode->lumMode &= ~filters[i].mask;
 763                 ppMode->chromMode &= ~filters[i].mask;
 764
 765                 filterNameOk=1;
 766                 if(!enable) break; // user wants to disable it
 767
 768                 if(q >= filters[i].minLumQuality && luma)
 769                     ppMode->lumMode|= filters[i].mask;
 770                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 771                     if(q >= filters[i].minChromQuality)
 772                             ppMode->chromMode|= filters[i].mask;
 773
 774                 if(filters[i].mask == LEVEL_FIX){
 775                     int o;
 776                     ppMode->minAllowedY= 16;
 777                     ppMode->maxAllowedY= 234;
 778                     for(o=0; options[o]; o++){
 779                         if(  !strcmp(options[o],"fullyrange")
 780                            ||!strcmp(options[o],"f")){
 781                             ppMode->minAllowedY= 0;
 782                             ppMode->maxAllowedY= 255;
 783                             numOfUnknownOptions--;
 784                         }
 785                     }
 786                 }
 787                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 788                 {
 789                     int o;
 790                     int numOfNoises=0;
 791
 792                     for(o=0; options[o]; o++){
 793                         char *tail;
 794                         ppMode->maxTmpNoise[numOfNoises]=
 795                             strtol(options[o], &tail, 0);
 796                         if(tail!=options[o]){
 797                             numOfNoises++;
 798                             numOfUnknownOptions--;
 799                             if(numOfNoises >= 3) break;
 800                         }
 801                     }
 802                 }
 803                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 804                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 805                     int o;
 806
 807                     for(o=0; options[o] && o<2; o++){
 808                         char *tail;
 809                         int val= strtol(options[o], &tail, 0);
 810                         if(tail==options[o]) break;
 811
 812                         numOfUnknownOptions--;
 813                         if(o==0) ppMode->baseDcDiff= val;
 814                         else ppMode->flatnessThreshold= val;
 815                     }
 816                 }
 817                 else if(filters[i].mask == FORCE_QUANT){
 818                     int o;
 819                     ppMode->forcedQuant= 15;
 820
 821                     for(o=0; options[o] && o<1; o++){
 822                         char *tail;
 823                         int val= strtol(options[o], &tail, 0);
 824                         if(tail==options[o]) break;
 825
 826                         numOfUnknownOptions--;
 827                         ppMode->forcedQuant= val;
 828                     }
 829                 }
 830             }
 831         }
 832         if(!filterNameOk) ppMode->error++;
 833         ppMode->error += numOfUnknownOptions;
 834     }
 835
 836     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 837     if(ppMode->error){
 838         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 839         av_free(ppMode);
 840         return NULL;
 841     }
 842     return ppMode;
 843 }
 844
 845 void pp_free_mode(pp_mode *mode){
 846     av_free(mode);
 847 }
 848
 849 static void reallocAlign(void **p, int size){
 850     av_free(*p);
 851     *p= av_mallocz(size);
 852 }
 853
 854 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 855     int mbWidth = (width+15)>>4;
 856     int mbHeight= (height+15)>>4;
 857     int i;
 858
 859     c->stride= stride;
 860     c->qpStride= qpStride;
 861
 862     reallocAlign((void **)&c->tempDst, stride*24+32);
 863     reallocAlign((void **)&c->tempSrc, stride*24);
 864     reallocAlign((void **)&c->tempBlocks, 2*16*8);
 865     reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
 866     for(i=0; i<256; i++)
 867             c->yHistogram[i]= width*height/64*15/256;
 868
 869     for(i=0; i<3; i++){
 870         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 871         reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
 872         reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 873     }
 874
 875     reallocAlign((void **)&c->deintTemp, 2*width+32);
 876     reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 877     reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 878     reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
 879 }
 880
 881 static const char * context_to_name(void * ptr) {
 882     return "postproc";
 883 }
 884
 885 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 886
 887 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
 888     PPContext *c= av_mallocz(sizeof(PPContext));
 889     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 890     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 891
 892     if (!c)
 893         return NULL;
 894
 895     c->av_class = &av_codec_context_class;
 896     if(cpuCaps&PP_FORMAT){
 897         c->hChromaSubSample= cpuCaps&0x3;
 898         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 899     }else{
 900         c->hChromaSubSample= 1;
 901         c->vChromaSubSample= 1;
 902     }
 903     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 904         c->cpuCaps = av_get_cpu_flags();
 905     } else {
 906         c->cpuCaps = 0;
 907         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 908         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 909         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 910         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 911     }
 912
 913     reallocBuffers(c, width, height, stride, qpStride);
 914
 915     c->frameNum=-1;
 916
 917     return c;
 918 }
 919
 920 av_cold void pp_free_context(void *vc){
 921     PPContext *c = (PPContext*)vc;
 922     int i;
 923
 924     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
 925         av_free(c->tempBlurred[i]);
 926     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
 927         av_free(c->tempBlurredPast[i]);
 928
 929     av_free(c->tempBlocks);
 930     av_free(c->yHistogram);
 931     av_free(c->tempDst);
 932     av_free(c->tempSrc);
 933     av_free(c->deintTemp);
 934     av_free(c->stdQPTable);
 935     av_free(c->nonBQPTable);
 936     av_free(c->forcedQPTable);
 937
 938     memset(c, 0, sizeof(PPContext));
 939
 940     av_free(c);
 941 }
 942
 943 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 944                      uint8_t * dst[3], const int dstStride[3],
 945                      int width, int height,
 946                      const QP_STORE_T *QP_store,  int QPStride,
 947                      pp_mode *vm,  void *vc, int pict_type)
 948 {
 949     int mbWidth = (width+15)>>4;
 950     int mbHeight= (height+15)>>4;
 951     PPMode *mode = vm;
 952     PPContext *c = vc;
 953     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 954     int absQPStride = FFABS(QPStride);
 955
 956     // c->stride and c->QPStride are always positive
 957     if(c->stride < minStride || c->qpStride < absQPStride)
 958         reallocBuffers(c, width, height,
 959                        FFMAX(minStride, c->stride),
 960                        FFMAX(c->qpStride, absQPStride));
 961
 962     if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 963         int i;
 964         QP_store= c->forcedQPTable;
 965         absQPStride = QPStride = 0;
 966         if(mode->lumMode & FORCE_QUANT)
 967             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 968         else
 969             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 970     }
 971
 972     if(pict_type & PP_PICT_TYPE_QP2){
 973         int i;
 974         const int count= FFMAX(mbHeight * absQPStride, mbWidth);
 975         for(i=0; i<(count>>2); i++){
 976             AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
 977         }
 978         for(i<<=2; i<count; i++){
 979             c->stdQPTable[i] = QP_store[i]>>1;
 980         }
 981         QP_store= c->stdQPTable;
 982         QPStride= absQPStride;
 983     }
 984
 985     if(0){
 986         int x,y;
 987         for(y=0; y<mbHeight; y++){
 988             for(x=0; x<mbWidth; x++){
 989                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 990             }
 991             av_log(c, AV_LOG_INFO, "\n");
 992         }
 993         av_log(c, AV_LOG_INFO, "\n");
 994     }
 995
 996     if((pict_type&7)!=3){
 997         if (QPStride >= 0){
 998             int i;
 999             const int count= FFMAX(mbHeight * QPStride, mbWidth);
1000             for(i=0; i<(count>>2); i++){
1001                 AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
1002             }
1003             for(i<<=2; i<count; i++){
1004                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1005             }
1006         } else {
1007             int i,j;
1008             for(i=0; i<mbHeight; i++) {
1009                 for(j=0; j<absQPStride; j++) {
1010                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1011                 }
1012             }
1013         }
1014     }
1015
1016     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1017            mode->lumMode, mode->chromMode);
1018
1019     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1020                 width, height, QP_store, QPStride, 0, mode, c);
1021
1022     if (!(src[1] && src[2] && dst[1] && dst[2]))
1023         return;
1024
1025     width  = (width )>>c->hChromaSubSample;
1026     height = (height)>>c->vChromaSubSample;
1027
1028     if(mode->chromMode){
1029         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1030                     width, height, QP_store, QPStride, 1, mode, c);
1031         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1032                     width, height, QP_store, QPStride, 2, mode, c);
1033     }
1034     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1035         linecpy(dst[1], src[1], height, srcStride[1]);
1036         linecpy(dst[2], src[2], height, srcStride[2]);
1037     }else{
1038         int y;
1039         for(y=0; y<height; y++){
1040             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1041             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1042         }
1043     }
1044 }