OSDN Git Service

faster mmx2 / 3dnow deblocking filter
author Michael Niedermayer <michaelni@gmx.at>
Mon, 19 Nov 2001 22:20:30 +0000 (22:20 +0000)
committer Michael Niedermayer <michaelni@gmx.at>
Mon, 19 Nov 2001 22:20:30 +0000 (22:20 +0000)
brightness_debug (draws luminance histogram & autodetected white/black level)

Originally committed as revision 3014 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/postprocess.c
postproc/postprocess_template.c

index 0d20ad4..6ac10ff 100644 (file)
 isVertDC               Ec      Ec
 isVertMinMaxOk         Ec      Ec
 doVertLowPass          E               e       e
-doVertDefFilter                Ec      Ec      Ec
+doVertDefFilter                Ec      Ec      e       e
 isHorizDC              Ec      Ec
 isHorizMinMaxOk                a       E
 doHorizLowPass         E               e       e
-doHorizDefFilter       Ec      Ec      Ec
+doHorizDefFilter       Ec      Ec      e       e
 deRing                 E               e       e*
 Vertical RKAlgo1       E               a       a
 Horizontal RKAlgo1                     a       a
@@ -63,8 +63,6 @@ optimize c versions
 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 smart blur
 ...
-
-Notes:
 */
 
 //Changelog: use the CVS log
@@ -80,6 +78,7 @@ Notes:
 //#undef HAVE_MMX2
 //#define HAVE_3DNOW
 //#undef HAVE_MMX
+//#define DEBUG_BRIGHTNESS
 #include "postprocess.h"
 
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
@@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
 
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
-#ifdef HAVE_MMX
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+/*
+       uint8_t tmp[16];
+       const int l1= stride;
+       const int l2= stride + l1;
+       const int l3= stride + l2;
+       const int l4= (int)tmp - (int)src - stride*3;
+       const int l5= (int)tmp - (int)src - stride*3 + 8;
+       const int l6= stride*3 + l3;
+       const int l7= stride + l6;
+       const int l8= stride + l7;
+
+       memcpy(tmp, src+stride*7, 8);
+       memcpy(tmp+8, src+stride*8, 8);
+*/
+       src+= stride*4;
+       asm volatile(
+
+#if 0 //sligtly more accurate and slightly slower
+               "pxor %%mm7, %%mm7                              \n\t" // 0
+               "leal (%0, %1), %%eax                           \n\t"
+               "leal (%%eax, %1, 4), %%ebx                     \n\t"
+//     0       1       2       3       4       5       6       7
+//     %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
+
+
+               "movq (%0, %1, 2), %%mm0                        \n\t" // l2
+               "movq (%0), %%mm1                               \n\t" // l0
+               "movq %%mm0, %%mm2                              \n\t" // l2
+               PAVGB(%%mm7, %%mm0)                                   // ~l2/2
+               PAVGB(%%mm1, %%mm0)                                   // ~(l2 + 2l0)/4
+               PAVGB(%%mm2, %%mm0)                                   // ~(5l2 + 2l0)/8
+
+               "movq (%%eax), %%mm1                            \n\t" // l1
+               "movq (%%eax, %1, 2), %%mm3                     \n\t" // l3
+               "movq %%mm1, %%mm4                              \n\t" // l1
+               PAVGB(%%mm7, %%mm1)                                   // ~l1/2
+               PAVGB(%%mm3, %%mm1)                                   // ~(l1 + 2l3)/4
+               PAVGB(%%mm4, %%mm1)                                   // ~(5l1 + 2l3)/8
+
+               "movq %%mm0, %%mm4                              \n\t" // ~(5l2 + 2l0)/8
+               "psubusb %%mm1, %%mm0                           \n\t"
+               "psubusb %%mm4, %%mm1                           \n\t"
+               "por %%mm0, %%mm1                               \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
+// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
+
+               "movq (%0, %1, 4), %%mm0                        \n\t" // l4
+               "movq %%mm0, %%mm4                              \n\t" // l4
+               PAVGB(%%mm7, %%mm0)                                   // ~l4/2
+               PAVGB(%%mm2, %%mm0)                                   // ~(l4 + 2l2)/4
+               PAVGB(%%mm4, %%mm0)                                   // ~(5l4 + 2l2)/8
+
+               "movq (%%ebx), %%mm2                            \n\t" // l5
+               "movq %%mm3, %%mm5                              \n\t" // l3
+               PAVGB(%%mm7, %%mm3)                                   // ~l3/2
+               PAVGB(%%mm2, %%mm3)                                   // ~(l3 + 2l5)/4
+               PAVGB(%%mm5, %%mm3)                                   // ~(5l3 + 2l5)/8
+
+               "movq %%mm0, %%mm6                              \n\t" // ~(5l4 + 2l2)/8
+               "psubusb %%mm3, %%mm0                           \n\t"
+               "psubusb %%mm6, %%mm3                           \n\t"
+               "por %%mm0, %%mm3                               \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
+               "pcmpeqb %%mm7, %%mm0                           \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
+// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
+
+               "movq (%%ebx, %1), %%mm6                        \n\t" // l6
+               "movq %%mm6, %%mm5                              \n\t" // l6
+               PAVGB(%%mm7, %%mm6)                                   // ~l6/2
+               PAVGB(%%mm4, %%mm6)                                   // ~(l6 + 2l4)/4
+               PAVGB(%%mm5, %%mm6)                                   // ~(5l6 + 2l4)/8
+
+               "movq (%%ebx, %1, 2), %%mm5                     \n\t" // l7
+               "movq %%mm2, %%mm4                              \n\t" // l5
+               PAVGB(%%mm7, %%mm2)                                   // ~l5/2
+               PAVGB(%%mm5, %%mm2)                                   // ~(l5 + 2l7)/4
+               PAVGB(%%mm4, %%mm2)                                   // ~(5l5 + 2l7)/8
+
+               "movq %%mm6, %%mm4                              \n\t" // ~(5l6 + 2l4)/8
+               "psubusb %%mm2, %%mm6                           \n\t"
+               "psubusb %%mm4, %%mm2                           \n\t"
+               "por %%mm6, %%mm2                               \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
+// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
+
+
+               PMINUB(%%mm2, %%mm1, %%mm4)                           // MIN(|lenergy|,|renergy|)/8
+               "movq pQPb, %%mm4                               \n\t" // QP //FIXME QP+1 ?
+               "paddusb b01, %%mm4                             \n\t"
+               "pcmpgtb %%mm3, %%mm4                           \n\t" // |menergy|/8 < QP
+               "psubusb %%mm1, %%mm3                           \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
+               "pand %%mm4, %%mm3                              \n\t"
+
+               "movq %%mm3, %%mm1                              \n\t"
+//             "psubusb b01, %%mm3                             \n\t"
+               PAVGB(%%mm7, %%mm3)
+               PAVGB(%%mm7, %%mm3)
+               "paddusb %%mm1, %%mm3                           \n\t"
+//             "paddusb b01, %%mm3                             \n\t"
+
+               "movq (%%eax, %1, 2), %%mm6                     \n\t" //l3
+               "movq (%0, %1, 4), %%mm5                        \n\t" //l4
+               "movq (%0, %1, 4), %%mm4                        \n\t" //l4
+               "psubusb %%mm6, %%mm5                           \n\t"
+               "psubusb %%mm4, %%mm6                           \n\t"
+               "por %%mm6, %%mm5                               \n\t" // |l3-l4|
+               "pcmpeqb %%mm7, %%mm6                           \n\t" // SIGN(l3-l4)
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pand %%mm0, %%mm3                              \n\t"
+               PMINUB(%%mm5, %%mm3, %%mm0)
+
+               "psubusb b01, %%mm3                             \n\t"
+               PAVGB(%%mm7, %%mm3)
+
+               "movq (%%eax, %1, 2), %%mm0                     \n\t"
+               "movq (%0, %1, 4), %%mm2                        \n\t"
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pxor %%mm6, %%mm2                              \n\t"
+               "psubb %%mm3, %%mm0                             \n\t"
+               "paddb %%mm3, %%mm2                             \n\t"
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pxor %%mm6, %%mm2                              \n\t"
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
+               "movq %%mm2, (%0, %1, 4)                        \n\t"
+#endif
+
+               "leal (%0, %1), %%eax                           \n\t"
+               "pcmpeqb %%mm6, %%mm6                           \n\t" // -1
+//     0       1       2       3       4       5       6       7
+//     %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
+
+
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // l3
+               "movq (%0, %1, 4), %%mm0                        \n\t" // l4
+               "pxor %%mm6, %%mm1                              \n\t" // -l3-1
+               PAVGB(%%mm1, %%mm0)                                   // -q+128 = (l4-l3+256)/2
+// mm1=-l3-1, mm0=128-q
+
+               "movq (%%eax, %1, 4), %%mm2                     \n\t" // l5
+               "movq (%%eax, %1), %%mm3                        \n\t" // l2
+               "pxor %%mm6, %%mm2                              \n\t" // -l5-1
+               "movq %%mm2, %%mm5                              \n\t" // -l5-1
+               "movq b80, %%mm4                                \n\t" // 128
+               "leal (%%eax, %1, 4), %%ebx                     \n\t"
+               PAVGB(%%mm3, %%mm2)                                   // (l2-l5+256)/2
+               PAVGB(%%mm0, %%mm4)                                   // ~(l4-l3)/4 + 128
+               PAVGB(%%mm2, %%mm4)                                   // ~(l2-l5)/4 +(l4-l3)/8 + 128
+               PAVGB(%%mm0, %%mm4)                                   // ~(l2-l5)/8 +5(l4-l3)/16 + 128
+// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
+
+               "movq (%%eax), %%mm2                            \n\t" // l1
+               "pxor %%mm6, %%mm2                              \n\t" // -l1-1
+               PAVGB(%%mm3, %%mm2)                                   // (l2-l1+256)/2
+               PAVGB((%0), %%mm1)                                    // (l0-l3+256)/2
+               "movq b80, %%mm3                                \n\t" // 128
+               PAVGB(%%mm2, %%mm3)                                   // ~(l2-l1)/4 + 128
+               PAVGB(%%mm1, %%mm3)                                   // ~(l0-l3)/4 +(l2-l1)/8 + 128
+               PAVGB(%%mm2, %%mm3)                                   // ~(l0-l3)/8 +5(l2-l1)/16 + 128
+// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
+
+               PAVGB((%%ebx, %1), %%mm5)                             // (l6-l5+256)/2
+               "movq (%%ebx, %1, 2), %%mm1                     \n\t" // l7
+               "pxor %%mm6, %%mm1                              \n\t" // -l7-1
+               PAVGB((%0, %1, 4), %%mm1)                             // (l4-l7+256)/2
+               "movq b80, %%mm2                                \n\t" // 128
+               PAVGB(%%mm5, %%mm2)                                   // ~(l6-l5)/4 + 128
+               PAVGB(%%mm1, %%mm2)                                   // ~(l4-l7)/4 +(l6-l5)/8 + 128
+               PAVGB(%%mm5, %%mm2)                                   // ~(l4-l7)/8 +5(l6-l5)/16 + 128
+// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
+
+               "movq b00, %%mm1                                \n\t" // 0
+               "movq b00, %%mm5                                \n\t" // 0
+               "psubb %%mm2, %%mm1                             \n\t" // 128 - renergy/16
+               "psubb %%mm3, %%mm5                             \n\t" // 128 - lenergy/16
+               PMAXUB(%%mm1, %%mm2)                                  // 128 + |renergy/16|
+               PMAXUB(%%mm5, %%mm3)                                  // 128 + |lenergy/16|
+               PMINUB(%%mm2, %%mm3, %%mm1)                           // 128 + MIN(|lenergy|,|renergy|)/16
+
+// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
+
+               "movq b00, %%mm7                                \n\t" // 0
+               "movq pQPb, %%mm2                               \n\t" // QP
+               PAVGB(%%mm6, %%mm2)                                   // 128 + QP/2
+               "psubb %%mm6, %%mm2                             \n\t"
+
+               "movq %%mm4, %%mm1                              \n\t"
+               "pcmpgtb %%mm7, %%mm1                           \n\t" // SIGN(menergy)
+               "pxor %%mm1, %%mm4                              \n\t"
+               "psubb %%mm1, %%mm4                             \n\t" // 128 + |menergy|/16
+               "pcmpgtb %%mm4, %%mm2                           \n\t" // |menergy|/16 < QP/2
+               "psubusb %%mm3, %%mm4                           \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
+// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
+
+               "movq %%mm4, %%mm3                              \n\t" // d
+               "psubusb b01, %%mm4                             \n\t"
+               PAVGB(%%mm7, %%mm4)                                   // d/32
+               PAVGB(%%mm7, %%mm4)                                   // (d + 32)/64
+               "paddb %%mm3, %%mm4                             \n\t" // 5d/64
+               "pand %%mm2, %%mm4                              \n\t"
+
+               "movq b80, %%mm5                                \n\t" // 128
+               "psubb %%mm0, %%mm5                             \n\t" // q
+               "paddsb %%mm6, %%mm5                            \n\t" // fix bad rounding
+               "pcmpgtb %%mm5, %%mm7                           \n\t" // SIGN(q)
+               "pxor %%mm7, %%mm5                              \n\t"
+
+               PMINUB(%%mm5, %%mm4, %%mm3)                           // MIN(|q|, 5d/64)
+               "pxor %%mm1, %%mm7                              \n\t" // SIGN(d*q)
+
+               "pand %%mm7, %%mm4                              \n\t"
+               "movq (%%eax, %1, 2), %%mm0                     \n\t"
+               "movq (%0, %1, 4), %%mm2                        \n\t"
+               "pxor %%mm1, %%mm0                              \n\t"
+               "pxor %%mm1, %%mm2                              \n\t"
+               "paddb %%mm4, %%mm0                             \n\t"
+               "psubb %%mm4, %%mm2                             \n\t"
+               "pxor %%mm1, %%mm0                              \n\t"
+               "pxor %%mm1, %%mm2                              \n\t"
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
+               "movq %%mm2, (%0, %1, 4)                        \n\t"
+
+               :
+               : "r" (src), "r" (stride)
+               : "%eax", "%ebx"
+       );
+
+/*
+       {
+       int x;
+       src-= stride;
+       for(x=0; x<BLOCK_SIZE; x++)
+       {
+               const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
+               if(ABS(middleEnergy)< 8*QP)
+               {
+                       const int q=(src[l4] - src[l5])/2;
+                       const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
+                       const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+
+                       int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
+                       d= MAX(d, 0);
+
+                       d= (5*d + 32) >> 6;
+                       d*= SIGN(-middleEnergy);
+
+                       if(q>0)
+                       {
+                               d= d<0 ? 0 : d;
+                               d= d>q ? q : d;
+                       }
+                       else
+                       {
+                               d= d>0 ? 0 : d;
+                               d= d<q ? q : d;
+                       }
+
+                       src[l4]-= d;
+                       src[l5]+= d;
+               }
+               src++;
+       }
+src-=8;
+       for(x=0; x<8; x++)
+       {
+               int y;
+               for(y=4; y<6; y++)
+               {
+                       int d= src[x+y*stride] - tmp[x+(y-4)*8];
+                       int ad= ABS(d);
+                       static int max=0;
+                       static int sum=0;
+                       static int num=0;
+                       static int bias=0;
+
+                       if(max<ad) max=ad;
+                       sum+= ad>3 ? 1 : 0;
+                       if(ad>3)
+                       {
+                               src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
+                       }
+                       if(y==4) bias+=d;
+                       num++;
+                       if(num%1000000 == 0)
+                       {
+                               printf(" %d %d %d %d\n", num, sum, max, bias);
+                       }
+               }
+       }
+}
+*/
+#elif defined (HAVE_MMX)
        src+= stride*4;
-       //FIXME try pmul for *5 stuff
-//     src[0]=0;
+
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t"
                "leal (%0, %1), %%eax                           \n\t"
@@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        uint8_t *dstBlock= &(dst[y*dstStride]);
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
                }
-       }
+/*
+               for(x=0; x<width; x+=32)
+               {
+                       int i;
+                       i+=     + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+                               + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+                               + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]
+                               + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride]
+                               + dstBlock[x +15*dstStride];
+               }
+*/     }
 #ifdef HAVE_3DNOW
        asm volatile("femms");
 #elif defined (HAVE_MMX)
@@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
                        , black, white);
 #endif
+#ifdef DEBUG_BRIGHTNESS
+       if(!isColor)
+       {
+               int max=1;
+               int i;
+               for(i=0; i<256; i++)
+                       if(yHistogram[i] > max) max=yHistogram[i];
+
+               for(i=1; i<256; i++)
+               {
+                       int x;
+                       int start=yHistogram[i-1]/(max/256+1);
+                       int end=yHistogram[i]/(max/256+1);
+                       int inc= end > start ? 1 : -1;
+                       for(x=start; x!=end+inc; x+=inc)
+                               dst[ i*dstStride + x]+=128;
+               }
+
+               for(i=0; i<100; i+=2)
+               {
+                       dst[ (white)*dstStride + i]+=128;
+                       dst[ (black)*dstStride + i]+=128;
+               }
+
+       }
+#endif
+
 }
index 0d20ad4..6ac10ff 100644 (file)
 isVertDC               Ec      Ec
 isVertMinMaxOk         Ec      Ec
 doVertLowPass          E               e       e
-doVertDefFilter                Ec      Ec      Ec
+doVertDefFilter                Ec      Ec      e       e
 isHorizDC              Ec      Ec
 isHorizMinMaxOk                a       E
 doHorizLowPass         E               e       e
-doHorizDefFilter       Ec      Ec      Ec
+doHorizDefFilter       Ec      Ec      e       e
 deRing                 E               e       e*
 Vertical RKAlgo1       E               a       a
 Horizontal RKAlgo1                     a       a
@@ -63,8 +63,6 @@ optimize c versions
 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 smart blur
 ...
-
-Notes:
 */
 
 //Changelog: use the CVS log
@@ -80,6 +78,7 @@ Notes:
 //#undef HAVE_MMX2
 //#define HAVE_3DNOW
 //#undef HAVE_MMX
+//#define DEBUG_BRIGHTNESS
 #include "postprocess.h"
 
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
@@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
 
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
-#ifdef HAVE_MMX
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+/*
+       uint8_t tmp[16];
+       const int l1= stride;
+       const int l2= stride + l1;
+       const int l3= stride + l2;
+       const int l4= (int)tmp - (int)src - stride*3;
+       const int l5= (int)tmp - (int)src - stride*3 + 8;
+       const int l6= stride*3 + l3;
+       const int l7= stride + l6;
+       const int l8= stride + l7;
+
+       memcpy(tmp, src+stride*7, 8);
+       memcpy(tmp+8, src+stride*8, 8);
+*/
+       src+= stride*4;
+       asm volatile(
+
+#if 0 //sligtly more accurate and slightly slower
+               "pxor %%mm7, %%mm7                              \n\t" // 0
+               "leal (%0, %1), %%eax                           \n\t"
+               "leal (%%eax, %1, 4), %%ebx                     \n\t"
+//     0       1       2       3       4       5       6       7
+//     %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
+
+
+               "movq (%0, %1, 2), %%mm0                        \n\t" // l2
+               "movq (%0), %%mm1                               \n\t" // l0
+               "movq %%mm0, %%mm2                              \n\t" // l2
+               PAVGB(%%mm7, %%mm0)                                   // ~l2/2
+               PAVGB(%%mm1, %%mm0)                                   // ~(l2 + 2l0)/4
+               PAVGB(%%mm2, %%mm0)                                   // ~(5l2 + 2l0)/8
+
+               "movq (%%eax), %%mm1                            \n\t" // l1
+               "movq (%%eax, %1, 2), %%mm3                     \n\t" // l3
+               "movq %%mm1, %%mm4                              \n\t" // l1
+               PAVGB(%%mm7, %%mm1)                                   // ~l1/2
+               PAVGB(%%mm3, %%mm1)                                   // ~(l1 + 2l3)/4
+               PAVGB(%%mm4, %%mm1)                                   // ~(5l1 + 2l3)/8
+
+               "movq %%mm0, %%mm4                              \n\t" // ~(5l2 + 2l0)/8
+               "psubusb %%mm1, %%mm0                           \n\t"
+               "psubusb %%mm4, %%mm1                           \n\t"
+               "por %%mm0, %%mm1                               \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
+// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
+
+               "movq (%0, %1, 4), %%mm0                        \n\t" // l4
+               "movq %%mm0, %%mm4                              \n\t" // l4
+               PAVGB(%%mm7, %%mm0)                                   // ~l4/2
+               PAVGB(%%mm2, %%mm0)                                   // ~(l4 + 2l2)/4
+               PAVGB(%%mm4, %%mm0)                                   // ~(5l4 + 2l2)/8
+
+               "movq (%%ebx), %%mm2                            \n\t" // l5
+               "movq %%mm3, %%mm5                              \n\t" // l3
+               PAVGB(%%mm7, %%mm3)                                   // ~l3/2
+               PAVGB(%%mm2, %%mm3)                                   // ~(l3 + 2l5)/4
+               PAVGB(%%mm5, %%mm3)                                   // ~(5l3 + 2l5)/8
+
+               "movq %%mm0, %%mm6                              \n\t" // ~(5l4 + 2l2)/8
+               "psubusb %%mm3, %%mm0                           \n\t"
+               "psubusb %%mm6, %%mm3                           \n\t"
+               "por %%mm0, %%mm3                               \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
+               "pcmpeqb %%mm7, %%mm0                           \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
+// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
+
+               "movq (%%ebx, %1), %%mm6                        \n\t" // l6
+               "movq %%mm6, %%mm5                              \n\t" // l6
+               PAVGB(%%mm7, %%mm6)                                   // ~l6/2
+               PAVGB(%%mm4, %%mm6)                                   // ~(l6 + 2l4)/4
+               PAVGB(%%mm5, %%mm6)                                   // ~(5l6 + 2l4)/8
+
+               "movq (%%ebx, %1, 2), %%mm5                     \n\t" // l7
+               "movq %%mm2, %%mm4                              \n\t" // l5
+               PAVGB(%%mm7, %%mm2)                                   // ~l5/2
+               PAVGB(%%mm5, %%mm2)                                   // ~(l5 + 2l7)/4
+               PAVGB(%%mm4, %%mm2)                                   // ~(5l5 + 2l7)/8
+
+               "movq %%mm6, %%mm4                              \n\t" // ~(5l6 + 2l4)/8
+               "psubusb %%mm2, %%mm6                           \n\t"
+               "psubusb %%mm4, %%mm2                           \n\t"
+               "por %%mm6, %%mm2                               \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
+// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
+
+
+               PMINUB(%%mm2, %%mm1, %%mm4)                           // MIN(|lenergy|,|renergy|)/8
+               "movq pQPb, %%mm4                               \n\t" // QP //FIXME QP+1 ?
+               "paddusb b01, %%mm4                             \n\t"
+               "pcmpgtb %%mm3, %%mm4                           \n\t" // |menergy|/8 < QP
+               "psubusb %%mm1, %%mm3                           \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
+               "pand %%mm4, %%mm3                              \n\t"
+
+               "movq %%mm3, %%mm1                              \n\t"
+//             "psubusb b01, %%mm3                             \n\t"
+               PAVGB(%%mm7, %%mm3)
+               PAVGB(%%mm7, %%mm3)
+               "paddusb %%mm1, %%mm3                           \n\t"
+//             "paddusb b01, %%mm3                             \n\t"
+
+               "movq (%%eax, %1, 2), %%mm6                     \n\t" //l3
+               "movq (%0, %1, 4), %%mm5                        \n\t" //l4
+               "movq (%0, %1, 4), %%mm4                        \n\t" //l4
+               "psubusb %%mm6, %%mm5                           \n\t"
+               "psubusb %%mm4, %%mm6                           \n\t"
+               "por %%mm6, %%mm5                               \n\t" // |l3-l4|
+               "pcmpeqb %%mm7, %%mm6                           \n\t" // SIGN(l3-l4)
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pand %%mm0, %%mm3                              \n\t"
+               PMINUB(%%mm5, %%mm3, %%mm0)
+
+               "psubusb b01, %%mm3                             \n\t"
+               PAVGB(%%mm7, %%mm3)
+
+               "movq (%%eax, %1, 2), %%mm0                     \n\t"
+               "movq (%0, %1, 4), %%mm2                        \n\t"
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pxor %%mm6, %%mm2                              \n\t"
+               "psubb %%mm3, %%mm0                             \n\t"
+               "paddb %%mm3, %%mm2                             \n\t"
+               "pxor %%mm6, %%mm0                              \n\t"
+               "pxor %%mm6, %%mm2                              \n\t"
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
+               "movq %%mm2, (%0, %1, 4)                        \n\t"
+#endif
+
+               "leal (%0, %1), %%eax                           \n\t"
+               "pcmpeqb %%mm6, %%mm6                           \n\t" // -1
+//     0       1       2       3       4       5       6       7
+//     %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
+
+
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // l3
+               "movq (%0, %1, 4), %%mm0                        \n\t" // l4
+               "pxor %%mm6, %%mm1                              \n\t" // -l3-1
+               PAVGB(%%mm1, %%mm0)                                   // -q+128 = (l4-l3+256)/2
+// mm1=-l3-1, mm0=128-q
+
+               "movq (%%eax, %1, 4), %%mm2                     \n\t" // l5
+               "movq (%%eax, %1), %%mm3                        \n\t" // l2
+               "pxor %%mm6, %%mm2                              \n\t" // -l5-1
+               "movq %%mm2, %%mm5                              \n\t" // -l5-1
+               "movq b80, %%mm4                                \n\t" // 128
+               "leal (%%eax, %1, 4), %%ebx                     \n\t"
+               PAVGB(%%mm3, %%mm2)                                   // (l2-l5+256)/2
+               PAVGB(%%mm0, %%mm4)                                   // ~(l4-l3)/4 + 128
+               PAVGB(%%mm2, %%mm4)                                   // ~(l2-l5)/4 +(l4-l3)/8 + 128
+               PAVGB(%%mm0, %%mm4)                                   // ~(l2-l5)/8 +5(l4-l3)/16 + 128
+// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
+
+               "movq (%%eax), %%mm2                            \n\t" // l1
+               "pxor %%mm6, %%mm2                              \n\t" // -l1-1
+               PAVGB(%%mm3, %%mm2)                                   // (l2-l1+256)/2
+               PAVGB((%0), %%mm1)                                    // (l0-l3+256)/2
+               "movq b80, %%mm3                                \n\t" // 128
+               PAVGB(%%mm2, %%mm3)                                   // ~(l2-l1)/4 + 128
+               PAVGB(%%mm1, %%mm3)                                   // ~(l0-l3)/4 +(l2-l1)/8 + 128
+               PAVGB(%%mm2, %%mm3)                                   // ~(l0-l3)/8 +5(l2-l1)/16 + 128
+// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
+
+               PAVGB((%%ebx, %1), %%mm5)                             // (l6-l5+256)/2
+               "movq (%%ebx, %1, 2), %%mm1                     \n\t" // l7
+               "pxor %%mm6, %%mm1                              \n\t" // -l7-1
+               PAVGB((%0, %1, 4), %%mm1)                             // (l4-l7+256)/2
+               "movq b80, %%mm2                                \n\t" // 128
+               PAVGB(%%mm5, %%mm2)                                   // ~(l6-l5)/4 + 128
+               PAVGB(%%mm1, %%mm2)                                   // ~(l4-l7)/4 +(l6-l5)/8 + 128
+               PAVGB(%%mm5, %%mm2)                                   // ~(l4-l7)/8 +5(l6-l5)/16 + 128
+// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
+
+               "movq b00, %%mm1                                \n\t" // 0
+               "movq b00, %%mm5                                \n\t" // 0
+               "psubb %%mm2, %%mm1                             \n\t" // 128 - renergy/16
+               "psubb %%mm3, %%mm5                             \n\t" // 128 - lenergy/16
+               PMAXUB(%%mm1, %%mm2)                                  // 128 + |renergy/16|
+               PMAXUB(%%mm5, %%mm3)                                  // 128 + |lenergy/16|
+               PMINUB(%%mm2, %%mm3, %%mm1)                           // 128 + MIN(|lenergy|,|renergy|)/16
+
+// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
+
+               "movq b00, %%mm7                                \n\t" // 0
+               "movq pQPb, %%mm2                               \n\t" // QP
+               PAVGB(%%mm6, %%mm2)                                   // 128 + QP/2
+               "psubb %%mm6, %%mm2                             \n\t"
+
+               "movq %%mm4, %%mm1                              \n\t"
+               "pcmpgtb %%mm7, %%mm1                           \n\t" // SIGN(menergy)
+               "pxor %%mm1, %%mm4                              \n\t"
+               "psubb %%mm1, %%mm4                             \n\t" // 128 + |menergy|/16
+               "pcmpgtb %%mm4, %%mm2                           \n\t" // |menergy|/16 < QP/2
+               "psubusb %%mm3, %%mm4                           \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
+// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
+
+               "movq %%mm4, %%mm3                              \n\t" // d
+               "psubusb b01, %%mm4                             \n\t"
+               PAVGB(%%mm7, %%mm4)                                   // d/32
+               PAVGB(%%mm7, %%mm4)                                   // (d + 32)/64
+               "paddb %%mm3, %%mm4                             \n\t" // 5d/64
+               "pand %%mm2, %%mm4                              \n\t"
+
+               "movq b80, %%mm5                                \n\t" // 128
+               "psubb %%mm0, %%mm5                             \n\t" // q
+               "paddsb %%mm6, %%mm5                            \n\t" // fix bad rounding
+               "pcmpgtb %%mm5, %%mm7                           \n\t" // SIGN(q)
+               "pxor %%mm7, %%mm5                              \n\t"
+
+               PMINUB(%%mm5, %%mm4, %%mm3)                           // MIN(|q|, 5d/64)
+               "pxor %%mm1, %%mm7                              \n\t" // SIGN(d*q)
+
+               "pand %%mm7, %%mm4                              \n\t"
+               "movq (%%eax, %1, 2), %%mm0                     \n\t"
+               "movq (%0, %1, 4), %%mm2                        \n\t"
+               "pxor %%mm1, %%mm0                              \n\t"
+               "pxor %%mm1, %%mm2                              \n\t"
+               "paddb %%mm4, %%mm0                             \n\t"
+               "psubb %%mm4, %%mm2                             \n\t"
+               "pxor %%mm1, %%mm0                              \n\t"
+               "pxor %%mm1, %%mm2                              \n\t"
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
+               "movq %%mm2, (%0, %1, 4)                        \n\t"
+
+               :
+               : "r" (src), "r" (stride)
+               : "%eax", "%ebx"
+       );
+
+/*
+       {
+       int x;
+       src-= stride;
+       for(x=0; x<BLOCK_SIZE; x++)
+       {
+               const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
+               if(ABS(middleEnergy)< 8*QP)
+               {
+                       const int q=(src[l4] - src[l5])/2;
+                       const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
+                       const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+
+                       int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
+                       d= MAX(d, 0);
+
+                       d= (5*d + 32) >> 6;
+                       d*= SIGN(-middleEnergy);
+
+                       if(q>0)
+                       {
+                               d= d<0 ? 0 : d;
+                               d= d>q ? q : d;
+                       }
+                       else
+                       {
+                               d= d>0 ? 0 : d;
+                               d= d<q ? q : d;
+                       }
+
+                       src[l4]-= d;
+                       src[l5]+= d;
+               }
+               src++;
+       }
+src-=8;
+       for(x=0; x<8; x++)
+       {
+               int y;
+               for(y=4; y<6; y++)
+               {
+                       int d= src[x+y*stride] - tmp[x+(y-4)*8];
+                       int ad= ABS(d);
+                       static int max=0;
+                       static int sum=0;
+                       static int num=0;
+                       static int bias=0;
+
+                       if(max<ad) max=ad;
+                       sum+= ad>3 ? 1 : 0;
+                       if(ad>3)
+                       {
+                               src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
+                       }
+                       if(y==4) bias+=d;
+                       num++;
+                       if(num%1000000 == 0)
+                       {
+                               printf(" %d %d %d %d\n", num, sum, max, bias);
+                       }
+               }
+       }
+}
+*/
+#elif defined (HAVE_MMX)
        src+= stride*4;
-       //FIXME try pmul for *5 stuff
-//     src[0]=0;
+
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t"
                "leal (%0, %1), %%eax                           \n\t"
@@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        uint8_t *dstBlock= &(dst[y*dstStride]);
                        memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
                }
-       }
+/*
+               for(x=0; x<width; x+=32)
+               {
+                       int i;
+                       i+=     + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+                               + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+                               + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]
+                               + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride]
+                               + dstBlock[x +15*dstStride];
+               }
+*/     }
 #ifdef HAVE_3DNOW
        asm volatile("femms");
 #elif defined (HAVE_MMX)
@@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
                        , black, white);
 #endif
+#ifdef DEBUG_BRIGHTNESS // debug overlay: plots yHistogram into dst and marks the rows indexed by white and black
+       if(!isColor) // skipped when isColor is set (presumably a chroma plane - verify against caller)
+       {
+               int max=1; // largest histogram bin seen so far; lower bound of 1
+               int i;
+               for(i=0; i<256; i++) // scan all 256 bins for the maximum
+                       if(yHistogram[i] > max) max=yHistogram[i];
+
+               for(i=1; i<256; i++) // one image row per bin: row i spans the scaled values of bins i-1 and i
+               {
+                       int x;
+                       int start=yHistogram[i-1]/(max/256+1); // divisor max/256+1 scales the largest bin to below 256
+                       int end=yHistogram[i]/(max/256+1);
+                       int inc= end > start ? 1 : -1; // step direction from start towards end
+                       for(x=start; x!=end+inc; x+=inc) // visits every column from start to end inclusive
+                               dst[ i*dstStride + x]+=128; // NOTE(review): no bounds check; assumes dst has at least 256 rows and columns - confirm
+               }
+
+               for(i=0; i<100; i+=2) // dotted markers: every second pixel within the first 100 columns
+               {
+                       dst[ (white)*dstStride + i]+=128; // marker on the row whose index is white
+                       dst[ (black)*dstStride + i]+=128; // marker on the row whose index is black
+               }
+
+       }
+#endif
+
 }