From cd38e322ef2736ede1c59dd036db6547f132d361 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Sat, 24 Nov 2001 01:38:30 +0000 Subject: [PATCH] faster dering Originally committed as revision 3094 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc --- postproc/postprocess.c | 176 +++++++++++++++++++++++++++------------- postproc/postprocess_template.c | 176 +++++++++++++++++++++++++++------------- 2 files changed, 238 insertions(+), 114 deletions(-) diff --git a/postproc/postprocess.c b/postproc/postprocess.c index d590b01a4..d0ae70b81 100644 --- a/postproc/postprocess.c +++ b/postproc/postprocess.c @@ -47,7 +47,6 @@ c = checked against the other implementations (-vo md5) /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ border remover optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ asm volatile( "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" -#endif "paddb %%mm1, %%mm0 \n\t" +#endif "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif } diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c index d590b01a4..d0ae70b81 100644 --- a/postproc/postprocess_template.c +++ b/postproc/postprocess_template.c @@ -47,7 +47,6 @@ c = checked against the other implementations (-vo md5) /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ border remover optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ asm volatile( "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" -#endif "paddb %%mm1, %%mm0 \n\t" +#endif "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif } -- 2.11.0