author     Michael Niedermayer <michaelni@gmx.at>
           Thu, 17 Jan 2002 20:00:41 +0000 (20:00 +0000)
committer  Michael Niedermayer <michaelni@gmx.at>
           Thu, 17 Jan 2002 20:00:41 +0000 (20:00 +0000)
faster simple idct in MMX

Originally committed as revision 269 to svn://svn.ffmpeg.org/ffmpeg/trunk
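
The rewrite replaces the old generic IDCT_CORE/WRITE macros with separate row and column macros: ROW_IDCT, COL_IDCT and DC_COND_ROW_IDCT in the "alternative, simpler variant" under #if 0, and DC_COND_IDCT / Z_COND_IDCT in the active branch. All of them compute the same 8-point even/odd butterfly that the in-line comments label A0..A3 (even part) and B0..B3 (odd part), apparently two rows at a time (the upper/lower-case R/r pairs in the comments suggest two interleaved rows). As a reference, a scalar sketch of one such pass, with the fixed-point constants, rounder and shift left as parameters rather than the concrete values defined in simple_idct_mmx.c (shift 11 for rows, 20 for columns), could look like this:

#include <stdint.h>

/* Scalar sketch of one 8-point pass, following the A0..A3 (even part) and
 * B0..B3 (odd part) comments in the MMX macros.  c[1]..c[7], rnd and shift
 * stand in for the fixed-point constants and shifts used by the real code. */
static void idct_1d_sketch(int16_t dst[8], const int16_t src[8],
                           const int32_t c[8], int shift, int32_t rnd)
{
    int32_t a0 = c[4]*src[0] + c[4]*src[4] + c[2]*src[2] + c[6]*src[6] + rnd;
    int32_t a1 = c[4]*src[0] - c[4]*src[4] + c[6]*src[2] - c[2]*src[6] + rnd;
    int32_t a2 = c[4]*src[0] - c[4]*src[4] - c[6]*src[2] + c[2]*src[6] + rnd;
    int32_t a3 = c[4]*src[0] + c[4]*src[4] - c[2]*src[2] - c[6]*src[6] + rnd;

    int32_t b0 = c[1]*src[1] + c[3]*src[3] + c[5]*src[5] + c[7]*src[7];
    int32_t b1 = c[3]*src[1] - c[7]*src[3] - c[1]*src[5] - c[5]*src[7];
    int32_t b2 = c[5]*src[1] - c[1]*src[3] + c[7]*src[5] + c[3]*src[7];
    int32_t b3 = c[7]*src[1] - c[5]*src[3] + c[3]*src[5] - c[1]*src[7];

    dst[0] = (int16_t)((a0 + b0) >> shift);  dst[7] = (int16_t)((a0 - b0) >> shift);
    dst[1] = (int16_t)((a1 + b1) >> shift);  dst[6] = (int16_t)((a1 - b1) >> shift);
    dst[2] = (int16_t)((a2 + b2) >> shift);  dst[5] = (int16_t)((a2 - b2) >> shift);
    dst[3] = (int16_t)((a3 + b3) >> shift);  dst[4] = (int16_t)((a3 - b3) >> shift);
}

The destination ordering shown here is the one used by the column pass; the row pass stores the same sums and differences but packs them in the interleaved layout that the new simple_mmx_permutation table in dsputil.c accounts for.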

libavcodec/dsputil.c
libavcodec/i386/simple_idct_mmx.c

diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index f45d051..6b993d8 100644
@@ -71,16 +71,16 @@ UINT8 ff_alternate_vertical_scan[64] = {
     38, 46, 54, 62, 39, 47, 55, 63,
 };
 
-
+/* Input permutation for the simple_idct_mmx */
 static UINT8 simple_mmx_permutation[64]={
-       0x00, 0x08, 0x01, 0x09, 0x04, 0x0C, 0x05, 0x0D,
-       0x10, 0x18, 0x11, 0x19, 0x14, 0x1C, 0x15, 0x1D,
-       0x02, 0x0A, 0x03, 0x0B, 0x06, 0x0E, 0x07, 0x0F,
-       0x12, 0x1A, 0x13, 0x1B, 0x16, 0x1E, 0x17, 0x1F,
-       0x20, 0x28, 0x21, 0x29, 0x24, 0x2C, 0x25, 0x2D,
-       0x30, 0x38, 0x31, 0x39, 0x34, 0x3C, 0x35, 0x3D,
-       0x22, 0x2A, 0x23, 0x2B, 0x26, 0x2E, 0x27, 0x2F,
-       0x32, 0x3A, 0x33, 0x3B, 0x36, 0x3E, 0x37, 0x3F,
+       0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
+       0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
+       0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
+       0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
+       0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
+       0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
+       0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
+       0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 };
 
 /* used to skip zeros at the end */
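
The table above lists, for each of the 64 coefficients of a block, the position it has to be moved to so that the MMX row pass can fetch its operands with plain movq loads; the layout changed because the new code pairs coefficients differently than the old IDCT_CORE did. How such a permutation table gets applied is outside this hunk; a hedged sketch (the helper name and the scatter direction are assumptions for illustration only):

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: scatter a block's 64 coefficients into the order
 * described by an input permutation table such as simple_mmx_permutation,
 * treating entry i as the destination index of source coefficient i.
 * No scaling or rounding is involved; this is a pure reordering. */
static void permute_block(int16_t block[64], const uint8_t perm[64])
{
    int16_t tmp[64];
    int i;

    for (i = 0; i < 64; i++)
        tmp[perm[i]] = block[i];
    memcpy(block, tmp, sizeof(tmp));
}
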
diff --git a/libavcodec/i386/simple_idct_mmx.c b/libavcodec/i386/simple_idct_mmx.c
index 297f237..80ca2df 100644
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
+    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -43,26 +43,30 @@ static int16_t __attribute__((aligned(8))) coeffs[]= {
 //     0, 0, 0, 0,
 //     0, 0, 0, 0,
 
-        C4,  C2,  C4,  C2,
-        C4,  C6,  C4,  C6,
-        C1,  C3,  C1,  C3,
-        C5,  C7,  C5,  C7,
-
-        C4,  C6,  C4,  C6,
-       -C4, -C2, -C4, -C2,
-        C3, -C7,  C3, -C7,
-       -C1, -C5, -C1, -C5,
-
-        C4, -C6,  C4, -C6,
-       -C4,  C2, -C4,  C2,
-        C5, -C1,  C5, -C1,
-        C7,  C3,  C7,  C3,
-
-        C4, -C2,  C4, -C2,
-        C4, -C6,  C4, -C6,
-        C7, -C5,  C7, -C5,
-        C3, -C1,  C3, -C1
-       };
+ C4,  C4,  C4,  C4,
+ C4, -C4,  C4, -C4,
+ C2,  C6,  C2,  C6,
+ C6, -C2,  C6, -C2,
+ C1,  C3,  C1,  C3,
+ C5,  C7,  C5,  C7,
+ C3, -C7,  C3, -C7,
+-C1, -C5, -C1, -C5,
+ C5, -C1,  C5, -C1,
+ C7,  C3,  C7,  C3,
+ C7, -C5,  C7, -C5,
+ C3, -C1,  C3, -C1
+};
+
+static void unused_var_killer(){
+       int a= wm1010 + d40000;
+       temp[0]=a;
+}
+
 #if 0
 static void inline idctCol (int16_t * col, int16_t *input)
 {
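
The coefficient table is reordered so that every quadword holds exactly the pair of 16-bit constants that one pmaddwd needs, replicated for two rows at once (for example {C4, C4, C4, C4} at 16(%2) and {C4, -C4, C4, -C4} at 24(%2), per the comments in the asm below). A small sketch of the pmaddwd semantics the new layout is built around:

#include <stdint.h>

/* pmaddwd forms four 16x16->32 products and adds them pairwise into two
 * 32-bit lanes.  With coeff = {C4, C4, C4, C4} and src = {r0, r4, R0, R4}
 * (two interleaved rows) this yields C4*r0 + C4*r4 and C4*R0 + C4*R4,
 * i.e. both rows' even partial sum from a single instruction. */
static void pmaddwd_sketch(int32_t dst[2],
                           const int16_t src[4], const int16_t coeff[4])
{
    dst[0] = (int32_t)src[0] * coeff[0] + (int32_t)src[1] * coeff[1];
    dst[1] = (int32_t)src[2] * coeff[2] + (int32_t)src[3] * coeff[3];
}
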
@@ -188,107 +192,157 @@ row[7] = input[13];
 
 static inline void idct(int16_t *block)
 {
-       int i;
-//for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ];
-//for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ];
-//for(i=0; i<64; i++) block[i]= temp[i];
-//block_permute(block);
-/*
-idctRow(temp, block);
-idctRow(temp+16, block+16);
-idctRow(temp+1, block+2);
-idctRow(temp+17, block+18);
-idctRow(temp+32, block+32);
-idctRow(temp+48, block+48);
-idctRow(temp+33, block+34);
-idctRow(temp+49, block+50);
-*/
-
        asm volatile(
-//             "lea 64(%0), %%eax              \n\t"
-//r0,r2,R0,R2  r4,r6,R4,R6     r1,r3,R1,R3     r5,r7,R5,R7
-//src0         src4            src1            src5
-//r0,R0,r7,R7  r1,R1,r6,R6     r2,R2,r5,R5     r3,R3,r4,R4
-//dst0         dst1            dst2            dst3
 #if 0 //Alternative, simpler variant
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       #rounder ", %%mm0                       \n\t"\
+       "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
+       "paddd %%mm0, %%mm0                     \n\t" \
+       "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
+       "psrad $" #shift ", %%mm4               \n\t"\
+       "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
+       "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
+       "movq %%mm7, " #dst "                   \n\t"\
+       "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "movq %%mm2, 24+" #dst "                \n\t"\
+       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
+       "movq %%mm2, 8+" #dst "                 \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
+       "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
+       "movq %%mm4, 16+" #dst "                \n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
+       "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
+       "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        #rounder ", %%mm4                       \n\t"\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       #rounder ", %%mm0                       \n\t"\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
+       "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
+       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm7, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm2, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       #rounder ", %%mm0                       \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm2, 32+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm4, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"\
+
+       
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq wm1010, %%mm4                     \n\t"\
@@ -300,231 +354,103 @@ idctRow(temp+49, block+50);
        "movd %%mm4, %%eax                      \n\t"\
        "orl %%eax, %%eax                       \n\t"\
        "jz 1f                                  \n\t"\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       #rounder ", %%mm0                       \n\t"\
+       "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
+       "paddd %%mm0, %%mm0                     \n\t" \
+       "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
+       "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
+       "movq %%mm7, " #dst "                   \n\t"\
+       "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "movq %%mm2, 24+" #dst "                \n\t"\
+       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       #rounder ", %%mm0                       \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
+       "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
+       "movq %%mm2, 8+" #dst "                 \n\t"\
+       "psrad $" #shift ", %%mm4               \n\t"\
+       "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
+       "movq %%mm4, 16+" #dst "                \n\t"\
        "jmp 2f                                 \n\t"\
        "1:                                     \n\t"\
-       WRITE3(%%mm0, dst)\
-       "2:                                     \n\t"\
-
-
-#define WRITE0(s0, s7, dst)\
-       "movq " #s0 ", " #dst "                 \n\t" /* R0             r0 */\
-       "movq " #s7 ", 24+" #dst "              \n\t" /* R7             r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-       "movq " #dst ", " #tmp "                \n\t" /* R0             r0 */\
-       "packssdw " #s1 ", " #tmp "             \n\t" /* R1     r1      R0      r0*/\
-       "movq " #tmp ", " #dst "                \n\t"\
-       "movq 24+" #dst ", " #tmp "             \n\t" /* R7             r7 */\
-       "packssdw " #tmp ", " #s6 "             \n\t" /* R7     r7      R6      r6*/\
-       "movq " #s6 ", 24+" #dst "              \n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-       "packssdw " #s3 ", " #s2 "              \n\t" /* R3     r3      R2      r2*/\
-       "packssdw " #s5 ", " #s4 "              \n\t" /* R5     r5      R4      r4*/\
-       "movq " #s2 ", 8+" #dst "               \n\t"\
-       "movq " #s4 ", 16+" #dst "              \n\t"
-
-#define WRITE3(a, dst)\
-       "pslld $16, " #a "                      \n\t"\
-       "psrad $13, " #a "                      \n\t"\
-       "packssdw " #a ", " #a "                \n\t"\
-       "movq " #a ", " #dst "                  \n\t"\
-       "movq " #a ", 8+" #dst "                \n\t"\
-       "movq " #a ", 16+" #dst "               \n\t"\
-       "movq " #a ", 24+" #dst "               \n\t"\
-
-//IDCT_CORE(          src0,   src4,   src1,   src5,    dst,   rounder, shift)
-IDCT_CORE(            (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-/*
-DC_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-*/
-IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-
-#undef WRITE0
-#undef WRITE1
-#undef WRITE2
-
-#define WRITE0(s0, s7, dst)\
-       "packssdw " #s0 ", " #s0 "              \n\t" /* C0, c0, C0, c0 */\
-       "packssdw " #s7 ", " #s7 "              \n\t" /* C7, c7, C7, c7 */\
-       "movd " #s0 ", " #dst "                 \n\t" /* C0, c0 */\
-       "movd " #s7 ", 112+" #dst "             \n\t" /* C7, c7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-       "packssdw " #s1 ", " #s1 "              \n\t" /* C1, c1, C1, c1 */\
-       "packssdw " #s6 ", " #s6 "              \n\t" /* C6, c6, C6, c6 */\
-       "movd " #s1 ", 16+" #dst "              \n\t" /* C1, c1 */\
-       "movd " #s6 ", 96+" #dst "              \n\t" /* C6, c6 */
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-       "packssdw " #s2 ", " #s2 "              \n\t" /* C2, c2, C2, c2 */\
-       "packssdw " #s3 ", " #s3 "              \n\t" /* C3, c3, C3, c3 */\
-       "movd " #s2 ", 32+" #dst "              \n\t" /* C2, c2 */\
-       "movd " #s3 ", 48+" #dst "              \n\t" /* C3, c3 */\
-       "packssdw " #s4 ", " #s4 "              \n\t" /* C4, c4, C4, c4 */\
-       "packssdw " #s5 ", " #s5 "              \n\t" /* C5, c5, C5, c5 */\
-       "movd " #s4 ", 64+" #dst "              \n\t" /* C4, c4 */\
-       "movd " #s5 ", 80+" #dst "              \n\t" /* C5, c5 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "pslld $16, %%mm0                       \n\t"\
+       "#paddd d40000, %%mm0                   \n\t"\
+       "psrad $13, %%mm0                       \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t"\
+       "movq %%mm0, " #dst "                   \n\t"\
+       "movq %%mm0, 8+" #dst "                 \n\t"\
+       "movq %%mm0, 16+" #dst "                \n\t"\
+       "movq %%mm0, 24+" #dst "                \n\t"\
+       "2:                                     \n\t"
 
-#else
 
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
-       "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
-       "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       #rounder ", %%mm0                       \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
+//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
+ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
+
+DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+
+
+//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
+COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+#else
+
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq wm1010, %%mm4                     \n\t"\
@@ -536,920 +462,822 @@ IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "movd %%mm4, %%eax                      \n\t"\
        "orl %%eax, %%eax                       \n\t"\
        "jz 1f                                  \n\t"\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       #rounder ", %%mm0                       \n\t"\
+       "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
+       "paddd %%mm0, %%mm0                     \n\t" \
+       "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
+       "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
+       "movq %%mm7, " #dst "                   \n\t"\
+       "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "movq %%mm2, 24+" #dst "                \n\t"\
+       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       #rounder ", %%mm0                       \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
+       "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
+       "movq %%mm2, 8+" #dst "                 \n\t"\
+       "psrad $" #shift ", %%mm4               \n\t"\
+       "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
+       "movq %%mm4, 16+" #dst "                \n\t"\
        "jmp 2f                                 \n\t"\
-       "#.balign 16                            \n\t"\
        "1:                                     \n\t"\
-       WRITE3(%%mm0, dst)\
-       "2:                                     \n\t"\
+       "pslld $16, %%mm0                       \n\t"\
+       "paddd d40000, %%mm0                    \n\t"\
+       "psrad $13, %%mm0                       \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t"\
+       "movq %%mm0, " #dst "                   \n\t"\
+       "movq %%mm0, 8+" #dst "                 \n\t"\
+       "movq %%mm0, 16+" #dst "                \n\t"\
+       "movq %%mm0, 24+" #dst "                \n\t"\
+       "2:                                     \n\t"
 
-#define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq %%mm0, %%mm4                      \n\t"\
        "por %%mm1, %%mm4                       \n\t"\
        "por %%mm2, %%mm4                       \n\t"\
        "por %%mm3, %%mm4                       \n\t"\
-       "packssdw %%mm4, %%mm4                  \n\t"\
+       "packssdw %%mm4,%%mm4                   \n\t"\
        "movd %%mm4, %%eax                      \n\t"\
        "orl %%eax, %%eax                       \n\t"\
        "jz " #bt "                             \n\t"\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       #rounder ", %%mm0                       \n\t"\
+       "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
+       "paddd %%mm0, %%mm0                     \n\t" \
+       "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
+       "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
+       "movq %%mm7, " #dst "                   \n\t"\
+       "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "movq %%mm2, 24+" #dst "                \n\t"\
+       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-       #rounder ", %%mm4                       \n\t"\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
+       "movq %%mm2, 8+" #dst "                 \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
+       "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
+       "movq %%mm4, 16+" #dst "                \n\t"\
+
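For reference, the macros in this file implement a standard 8-point even/odd IDCT butterfly, processing two rows (the upper-case R and lower-case r lanes in the comments) per invocation. The plain-C sketch below is an editorial illustration, not part of the commit: it assumes the C1..C7 fixed-point cosine constants defined earlier in this file, omits the rounding bias that #rounder adds before the shift, and uses an illustrative function name.

static void idct_1d_sketch(int x[8], int shift)
{
    /* even part: inputs 0, 2, 4, 6 (the A0..A3 of the comments above) */
    int a0 = C4*x[0] + C4*x[4] + C2*x[2] + C6*x[6];
    int a1 = C4*x[0] - C4*x[4] + C6*x[2] - C2*x[6];
    int a2 = C4*x[0] - C4*x[4] - C6*x[2] + C2*x[6];
    int a3 = C4*x[0] + C4*x[4] - C2*x[2] - C6*x[6];

    /* odd part: inputs 1, 3, 5, 7 (the B0..B3 of the comments above) */
    int b0 = C1*x[1] + C3*x[3] + C5*x[5] + C7*x[7];
    int b1 = C3*x[1] - C7*x[3] - C1*x[5] - C5*x[7];
    int b2 = C5*x[1] - C1*x[3] + C7*x[5] + C3*x[7];
    int b3 = C7*x[1] - C5*x[3] + C3*x[5] - C1*x[7];

    /* sums fill outputs 0..3, differences fill outputs 7..4; the MMX code
       stores the same values in a permuted, interleaved order */
    x[0] = (a0 + b0) >> shift;   x[7] = (a0 - b0) >> shift;
    x[1] = (a1 + b1) >> shift;   x[6] = (a1 - b1) >> shift;
    x[2] = (a2 + b2) >> shift;   x[5] = (a2 - b2) >> shift;
    x[3] = (a3 + b3) >> shift;   x[4] = (a3 - b3) >> shift;
}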
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
+       "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
+       "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0                       \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
+       "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
+       "paddd %%mm0, %%mm0                     \n\t" /* 2(-C4R4+C4R0)  2(-C4r4+C4r0) */\
+       "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
+       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
+       "psrad $" #shift ", %%mm4               \n\t"\
+       "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
+       "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
+       "movq %%mm7, " #dst "                   \n\t"\
+       "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "movq %%mm2, 24+" #dst "                \n\t"\
+       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
-
-
-#define WRITE0(s0, s7, dst)\
-       "movq " #s0 ", " #dst "                 \n\t" /* R0             r0 */\
-       "movq " #s7 ", 24+" #dst "              \n\t" /* R7             r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-       "movq " #dst ", " #tmp "                \n\t" /* R0             r0 */\
-       "packssdw " #s1 ", " #tmp "             \n\t" /* R1     r1      R0      r0*/\
-       "movq " #tmp ", " #dst "                \n\t"\
-       "movq 24+" #dst ", " #tmp "             \n\t" /* R7             r7 */\
-       "packssdw " #tmp ", " #s6 "             \n\t" /* R7     r7      R6      r6*/\
-       "movq " #s6 ", 24+" #dst "              \n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-       "packssdw " #s3 ", " #s2 "              \n\t" /* R3     r3      R2      r2*/\
-       "packssdw " #s5 ", " #s4 "              \n\t" /* R5     r5      R4      r4*/\
-       "movq " #s2 ", 8+" #dst "               \n\t"\
-       "movq " #s4 ", 16+" #dst "              \n\t"
-
-#define WRITE3(a, dst)\
-       "pslld $16, " #a "                      \n\t"\
-       "paddd d40000, " #a "                   \n\t"\
-       "psrad $13, " #a "                      \n\t"\
-       "packssdw " #a ", " #a "                \n\t"\
-       "movq " #a ", " #dst "                  \n\t"\
-       "movq " #a ", 8+" #dst "                \n\t"\
-       "movq " #a ", 16+" #dst "               \n\t"\
-       "movq " #a ", 24+" #dst "               \n\t"\
-
-#define WRITE0b(s0, s7, dst)\
-       "packssdw " #s0 ", " #s0 "              \n\t" /* C0, c0, C0, c0 */\
-       "packssdw " #s7 ", " #s7 "              \n\t" /* C7, c7, C7, c7 */\
-       "movd " #s0 ", " #dst "                 \n\t" /* C0, c0 */\
-       "movd " #s7 ", 112+" #dst "             \n\t" /* C7, c7 */
-
-#define WRITE1b(s1, s6, dst, tmp)\
-       "packssdw " #s1 ", " #s1 "              \n\t" /* C1, c1, C1, c1 */\
-       "packssdw " #s6 ", " #s6 "              \n\t" /* C6, c6, C6, c6 */\
-       "movd " #s1 ", 16+" #dst "              \n\t" /* C1, c1 */\
-       "movd " #s6 ", 96+" #dst "              \n\t" /* C6, c6 */
-
-#define WRITE2b(s2, s5, s3, s4, dst)\
-       "packssdw " #s2 ", " #s2 "              \n\t" /* C2, c2, C2, c2 */\
-       "packssdw " #s3 ", " #s3 "              \n\t" /* C3, c3, C3, c3 */\
-       "movd " #s2 ", 32+" #dst "              \n\t" /* C2, c2 */\
-       "movd " #s3 ", 48+" #dst "              \n\t" /* C3, c3 */\
-       "packssdw " #s4 ", " #s4 "              \n\t" /* C4, c4, C4, c4 */\
-       "packssdw " #s5 ", " #s5 "              \n\t" /* C5, c5, C5, c5 */\
-       "movd " #s4 ", 64+" #dst "              \n\t" /* C4, c4 */\
-       "movd " #s5 ", 80+" #dst "              \n\t" /* C5, c5 */\
-
-
-//IDCT_CORE(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
-DC_COND_IDCT_CORE(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-Z_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
-
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
+       "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
+       "movq %%mm2, 8+" #dst "                 \n\t"\
+       "psrad $" #shift ", %%mm4               \n\t"\
+       "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
+       "movq %%mm4, 16+" #dst "                \n\t"\
+
+//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
+DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
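The four invocations above form the row pass, two input rows per call: DC_COND_IDCT covers the first pair and, judging by its name, short-circuits the common DC-only case, while each Z_COND_IDCT carries an extra branch target (4f, 2f, 1f) and presumably tests its four source qwords for all-zero, jumping to a column pass specialized for the rows already known to be empty. The #undef IDCT / #define IDCT blocks that follow are those specializations; each drops the loads and multiplies for the sources it can assume are zero. A minimal scalar sketch of the zero test being assumed here (helper name and loop form are illustrative, not from the commit):

#include <stdint.h>

/* Returns nonzero when all four source qwords of a row group are zero,
   i.e. when the row transform for that group can be skipped entirely. */
static int row_group_is_zero(const int16_t *coefs)
{
    int i;
    int16_t acc = 0;
    for (i = 0; i < 16; i++)    /* 4 qwords = 16 coefficients */
        acc |= coefs[i];
    return acc == 0;
}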
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       #rounder ", %%mm0                       \n\t"\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
+       "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm7, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm2, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm2, 32+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm4, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+
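The IDCT redefined above is the column pass: each invocation below advances dst by 4 bytes (two adjacent columns per call), and every movd lands one output row, 16 bytes, further down the 8x8 block, which is where the 16+, 32+, ... 112+ offsets come from. The scalar sketch below shows the same store pattern for a single column; it is an editorial illustration with made-up function and parameter names, a0..a3/b0..b3 standing for the butterfly results named in the comments, and plain casts where packssdw would saturate.

#include <stdint.h>

static void store_idct_column(int16_t *dst, const int32_t a[4],
                              const int32_t b[4], int shift)
{
    dst[0 * 8] = (int16_t)((a[0] + b[0]) >> shift);   /*   0+dst */
    dst[1 * 8] = (int16_t)((a[1] + b[1]) >> shift);   /*  16+dst */
    dst[2 * 8] = (int16_t)((a[2] + b[2]) >> shift);   /*  32+dst */
    dst[3 * 8] = (int16_t)((a[3] + b[3]) >> shift);   /*  48+dst */
    dst[4 * 8] = (int16_t)((a[3] - b[3]) >> shift);   /*  64+dst */
    dst[5 * 8] = (int16_t)((a[2] - b[2]) >> shift);   /*  80+dst */
    dst[6 * 8] = (int16_t)((a[1] - b[1]) >> shift);   /*  96+dst */
    dst[7 * 8] = (int16_t)((a[0] - b[0]) >> shift);   /* 112+dst */
}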
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
        "#.balign 16                            \n\t"\
        "4:                                     \n\t"
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       #rounder ", %%mm0                       \n\t"\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm7, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
+       "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm4, %%mm7                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm7, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1b(%%mm7, %%mm4, dst, %%mm6) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm4, %%mm7                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm7, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
+       "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
+       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
+       "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       "paddd %%mm0, %%mm3                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm3, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm3               \n\t"\
+       "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm1, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm2, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
+       "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
+       "psrad $" #shift ", %%mm6               \n\t"\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm2, 32+" #dst "                \n\t"\
+       "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm1, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
        "#.balign 16                            \n\t"\
        "6:                                     \n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm7, %%mm4, dst) \
-\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       #rounder ", %%mm0                       \n\t"\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-\
-       "movq 80(%2), %%mm4                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm5, %%mm7                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm5, %%mm5                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm7, %%mm5                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
-       "psrad $" #shift ", %%mm5               \n\t"\
-       WRITE1b(%%mm7, %%mm5, dst, %%mm6) \
-\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm4, %%mm7                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm7, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
+       "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
+       "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
+       "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm0, %%mm3                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm3, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm3               \n\t"\
+       "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm1, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm2, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
+       "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
+       "psrad $" #shift ", %%mm6               \n\t"\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm2, 32+" #dst "                \n\t"\
+       "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm1, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
        "#.balign 16                            \n\t"\
        "2:                                     \n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-       "movq 40(%2), %%mm7                     \n\t" /* C7     C5      C7      C5 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B0             b0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       #rounder ", %%mm0                       \n\t"\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
+       "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
+       "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
+       "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm6, %%mm4, dst) \
-\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-\
-       "movq 80(%2), %%mm4                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm5, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm5, %%mm5                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm5                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm5               \n\t"\
-       WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "movq 104(%2), %%mm7                    \n\t" /* C3     C7      C3      C7 */\
+       "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm7, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm2, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
+       "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm7, %%mm6                     \n\t" /* B1             b1 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-       "pmaddwd 136(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
+       "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+       "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
+       "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
+       "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm3, %%mm2                     \n\t" /* B3             b3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm2, 32+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm4, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
        "#.balign 16                            \n\t"\
        "3:                                     \n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       #rounder ", %%mm0                       \n\t"\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 64(%2), %%mm3                     \n\t"\
+       "pmaddwd %%mm2, %%mm3                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm6, %%mm4, dst) \
-\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-\
-       "movq 80(%2), %%mm4                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm5, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm5, %%mm5                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm5                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "movq %%mm0, %%mm1                      \n\t" /* A1             a1 */\
+       "paddd %%mm3, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm3, %%mm1                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm7, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm1, %%mm1                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm1, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "movq %%mm5, %%mm1                      \n\t" /* A2             a2 */\
+       "paddd %%mm4, %%mm1                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm5               \n\t"\
-       WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm1, %%mm1                  \n\t" /* A2+B2  a2+b2 */\
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm1, 32+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "movd %%mm4, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
        "#.balign 16                            \n\t"\
        "5:                                     \n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "movq %%mm4, %%mm6\n\t"\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "movq %%mm5, %%mm7\n\t"\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 8+" #src0 ", %%mm2                \n\t" /*2R2     R0      r2      r0 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /*2C2R2+C4R0      C2r2+C4r0 */\
-       "movq 8+" #src4 ", %%mm3                \n\t" /*2R6     R4      r6      r4 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /*2C6R6+C4R4      C6r6+C4r4 */\
-\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
+       #rounder ", %%mm0                       \n\t"\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
+       "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
+       "movq 8+" #src4 ", %%mm3                \n\t" /* R6     R2      r6      r2 */\
+       "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm3, %%mm7                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "pmaddwd 40(%2), %%mm3                  \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       #rounder ", %%mm1                       \n\t"\
+       "paddd %%mm1, %%mm7                     \n\t" /* A0             a0 */\
+       "paddd %%mm1, %%mm1                     \n\t" /* 2C0            2c0 */\
+       #rounder ", %%mm2                       \n\t"\
+       "psubd %%mm7, %%mm1                     \n\t" /* A3             a3 */\
+       "paddd %%mm2, %%mm3                     \n\t" /* A1             a1 */\
+       "paddd %%mm2, %%mm2                     \n\t" /* 2C1            2c1 */\
+       "psubd %%mm3, %%mm2                     \n\t" /* A2             a2 */\
        "psrad $" #shift ", %%mm4               \n\t"\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-\
-       "paddd %%mm7, %%mm6                     \n\t" /*2A0             a0 */\
-       "movq 56(%2), %%mm7                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "pmaddwd %%mm1, %%mm7                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-\
-       "packssdw %%mm6, %%mm4                  \n\t" /* C0, c0, C0, c0 */\
-       "movq 48(%2), %%mm6                     \n\t" /* C6     C4      C6      C4 */\
-       "movq %%mm4, " #dst "                   \n\t" /* C0, c0 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /*2C6R2+C4R0      C6r2+C4r0 */\
-\
-       "movq %%mm4, 112+" #dst "               \n\t" /* C0, c0 */\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm3, %%mm4                   \n\t" /*2-C2R6-C4R4     -C2r6-C4r4 */\
-\
-       "paddd %%mm5, %%mm7                     \n\t" /* A1             a1 */\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
        "psrad $" #shift ", %%mm7               \n\t"\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-\
-       "paddd %%mm4, %%mm6                     \n\t" /*2A1             a1 */\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "packssdw %%mm6, %%mm7                  \n\t" /* C1, c1, C1, c1 */\
-\
-       "movq 80(%2), %%mm6                     \n\t" /* -C6    C4      -C6     C4 */\
-       "movq %%mm7, 16+" #dst "                \n\t" /* C1, c1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /*2-C6R2+C4R0     -C6r2+C4r0 */\
-\
-       "movq %%mm7, 96+" #dst "                \n\t" /* C1, c1 */\
-       "movq 88(%2), %%mm7                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm3, %%mm7                   \n\t" /*2C2R6-C4R4      C2r6-C4r4 */\
-\
-       "pmaddwd 112(%2), %%mm2                 \n\t" /*2-C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-\
-       "pmaddwd 120(%2), %%mm3                 \n\t" /*2-C6R6+C4R4     -C6r6+C4r4 */\
-       "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm7, %%mm6                     \n\t" /*2A2             a2 */\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-\
-       "psrad $" #shift ", %%mm6               \n\t"\
-\
-       "packssdw %%mm6, %%mm4                  \n\t" /* C2, c2, C2, c2 */\
-       "movq %%mm4, 32+" #dst "                \n\t" /* C2, c2 */\
+       "psrad $" #shift ", %%mm3               \n\t"\
+       "packssdw %%mm7, %%mm4                  \n\t" /* A0     a0 */\
+       "movq %%mm4, " #dst "                   \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
-       "paddd %%mm3, %%mm2                     \n\t" /*2A3             a3 */\
-\
-       "movq %%mm4, 80+" #dst "                \n\t" /* C2, c2 */\
+       "packssdw %%mm3, %%mm0                  \n\t" /* A1     a1 */\
+       "movq %%mm0, 16+" #dst "                \n\t"\
+       "movq %%mm0, 96+" #dst "                \n\t"\
+       "movq %%mm4, 112+" #dst "               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
-\
-       "packssdw %%mm2, %%mm0                  \n\t" /* C3, c3, C3, c3 */\
-       "movq %%mm0, 48+" #dst "                \n\t" /* C3, c3 */\
-       "movq %%mm0, 64+" #dst "                \n\t" /* C3, c3 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm2, %%mm5                  \n\t" /* A2     a2 */\
+       "movq %%mm5, 32+" #dst "                \n\t"\
+       "psrad $" #shift ", %%mm1               \n\t"\
+       "packssdw %%mm1, %%mm6                  \n\t" /* A3     a3 */\
+       "movq %%mm6, 48+" #dst "                \n\t"\
+       "movq %%mm6, 64+" #dst "                \n\t"\
+       "movq %%mm5, 80+" #dst "                \n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
 
        "#.balign 16                            \n\t"\
        "1:                                     \n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq " #src4 ", %%mm1                  \n\t" /* R6     R4      r6      r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
-       "movq 16(%2), %%mm4                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 24(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C4R4      C6r6+C4r4 */\
-       "movq 32(%2), %%mm6                     \n\t" /* C3     C1      C3      C1 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
+       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
+       "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
+       "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+       #rounder ", %%mm4                       \n\t"\
+       "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
+       #rounder ", %%mm0                       \n\t"\
+       "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
-\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A0+B0          a0+b0 */\
+       "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
+       "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
+       "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
+       "movq 64(%2), %%mm1                     \n\t" /* -C7    C3      -C7     C3 */\
+       "pmaddwd %%mm2, %%mm1                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+       "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE0b(%%mm6, %%mm4, dst) \
-\
-       "movq 56(%2), %%mm4                     \n\t" /* -C2    -C4     -C2     -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* -C2R6-C4R4     -C2r6-C4r4 */\
-       "movq 64(%2), %%mm6                     \n\t" /* -C7    C3      -C7     C3 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A1             a1 */\
-\
-       "movq 80(%2), %%mm5                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm5                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "psrad $" #shift ", %%mm6               \n\t"\
+       "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
+       "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-       "movq 88(%2), %%mm4                     \n\t" /* C2     -C4     C2      -C4 */\
-       "pmaddwd %%mm1, %%mm4                   \n\t" /* C2R6-C4R4      C2r6-C4r4 */\
-       "movq 96(%2), %%mm6                     \n\t" /* -C1    C5      -C1     C5 */\
-       "pmaddwd %%mm2, %%mm6                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-       "paddd %%mm5, %%mm4                     \n\t" /* A2             a2 */\
-\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "paddd %%mm4, %%mm6                     \n\t" /* A1+B1          a1+b1 */\
-       "pmaddwd 120(%2), %%mm1                 \n\t" /* -C6R6+C4R4     -C6r6+C4r4 */\
-       "paddd %%mm4, %%mm4                     \n\t" /* 2A1            2a1 */\
-       "psubd %%mm6, %%mm4                     \n\t" /* A1-B1          a1-b1 */\
-       "pmaddwd 128(%2), %%mm2                 \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "movq %%mm0, %%mm3                      \n\t" /* A1             a1 */\
+       "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
+       "psubd %%mm1, %%mm3                     \n\t" /* A1-B1          a1-b1 */\
+       "psrad $" #shift ", %%mm0               \n\t"\
+       "psrad $" #shift ", %%mm3               \n\t"\
+       "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
+       "movd %%mm7, " #dst "                   \n\t"\
+       "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
+       "movd %%mm0, 16+" #dst "                \n\t"\
+       "packssdw %%mm3, %%mm3                  \n\t" /* A1-B1  a1-b1 */\
+       "movd %%mm3, 96+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
+       "movd %%mm4, 112+" #dst "               \n\t"\
+       "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
+       "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+       "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+       "movq %%mm5, %%mm3                      \n\t" /* A2             a2 */\
+       "paddd %%mm4, %%mm3                     \n\t" /* A2+B2          a2+b2 */\
+       "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
+       "psrad $" #shift ", %%mm3               \n\t"\
+       "psrad $" #shift ", %%mm5               \n\t"\
+       "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
+       "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
+       "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
+       "packssdw %%mm3, %%mm3                  \n\t" /* A2+B2  a2+b2 */\
+       "movd %%mm3, 32+" #dst "                \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-\
-       "paddd %%mm1, %%mm0                     \n\t" /* A3             a3 */\
-       "paddd %%mm0, %%mm2                     \n\t" /* A3+B3          a3+b3 */\
-       "paddd %%mm0, %%mm0                     \n\t" /* 2A3            2a3 */\
-       "psubd %%mm2, %%mm0                     \n\t" /* A3-B3          a3-b3 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm0               \n\t"\
-       WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
+       "movd %%mm6, 48+" #dst "                \n\t"\
+       "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
+       "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
+       "movd %%mm4, 64+" #dst "                \n\t"\
+       "movd %%mm5, 80+" #dst "                \n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
        "jmp 9f                                 \n\t"
 
 
        "#.balign 16                            \n\t"
        "7:                                     \n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-       "movq " #src0 ", %%mm0                  \n\t" /* R2     R0      r2      r0 */\
-       "movq 16(%2), %%mm2                     \n\t" /* C2     C4      C2      C4 */\
-       "movq 8+" #src0 ", %%mm1                \n\t" /* R2     R0      r2      r0 */\
-       "pmaddwd %%mm0, %%mm2                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-       "movq 16(%2), %%mm3                     \n\t" /* C2     C4      C2      C4 */\
-       "pmaddwd %%mm1, %%mm3                   \n\t" /* C2R2+C4R0      C2r2+C4r0 */\
-\
-       "movq 48(%2), %%mm4                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm0, %%mm4                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "movq 48(%2), %%mm5                     \n\t" /* C6     C4      C6      C4 */\
-       "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R2+C4R0      C6r2+C4r0 */\
-       "movq 80(%2), %%mm6                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm0, %%mm6                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "movq 80(%2), %%mm7                     \n\t" /* -C6    C4      -C6     C4 */\
-       "pmaddwd %%mm1, %%mm7                   \n\t" /* -C6R2+C4R0     -C6r2+C4r0 */\
-       "pmaddwd 112(%2), %%mm0                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "psrad $" #shift ", %%mm2               \n\t"\
-       "psrad $" #shift ", %%mm3               \n\t"\
-       "pmaddwd 112(%2), %%mm1                 \n\t" /* -C2R2+C4R0     -C2r2+C4r0 */\
-       "packssdw %%mm3, %%mm2                  \n\t" /* C0, c0, C0, c0 */\
-       "movq %%mm2, " #dst "                   \n\t" /* C0, c0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+       "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
+       "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       #rounder ", %%mm4                       \n\t"\
+       #rounder ", %%mm0                       \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
-       "psrad $" #shift ", %%mm5               \n\t"\
-       "movq %%mm2, 112+" #dst "               \n\t" /* C0, c0 */\
-       "packssdw %%mm5, %%mm4                  \n\t" /* C1, c1, C1, c1 */\
-       "movq %%mm4, 16+" #dst "                \n\t" /* C0, c0 */\
-       "psrad $" #shift ", %%mm7               \n\t"\
-       "psrad $" #shift ", %%mm6               \n\t"\
-       "movq %%mm4, 96+" #dst "                \n\t" /* C0, c0 */\
-       "packssdw %%mm7, %%mm6                  \n\t" /* C2, c2, C2, c2 */\
-       "movq %%mm6, 32+" #dst "                \n\t" /* C0, c0 */\
        "psrad $" #shift ", %%mm0               \n\t"\
-       "movq %%mm6, 80+" #dst "                \n\t" /* C0, c0 */\
+       "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
+       "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
+       "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
+       "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
+       "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+       "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
+       #rounder ", %%mm1                       \n\t"\
+       #rounder ", %%mm2                       \n\t"\
        "psrad $" #shift ", %%mm1               \n\t"\
-       "packssdw %%mm1, %%mm0                  \n\t" /* C3, c3, C3, c3 */\
-       "movq %%mm0, 48+" #dst "                \n\t" /* C0, c0 */\
-       "movq %%mm0, 64+" #dst "                \n\t" /* C0, c0 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+       "packssdw %%mm1, %%mm4                  \n\t" /* A0     a0 */\
+       "movq %%mm4, " #dst "                   \n\t"\
+       "psrad $" #shift ", %%mm2               \n\t"\
+       "packssdw %%mm2, %%mm0                  \n\t" /* A1     a1 */\
+       "movq %%mm0, 16+" #dst "                \n\t"\
+       "movq %%mm0, 96+" #dst "                \n\t"\
+       "movq %%mm4, 112+" #dst "               \n\t"\
+       "movq %%mm0, 32+" #dst "                \n\t"\
+       "movq %%mm4, 48+" #dst "                \n\t"\
+       "movq %%mm4, 64+" #dst "                \n\t"\
+       "movq %%mm0, 80+" #dst "                \n\t"
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
 
 #endif
 
 /*
 Input
- 00 20 02 22 40 60 42 62
- 10 30 12 32 50 70 52 72
- 01 21 03 23 41 61 43 63
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
  11 31 13 33 51 71 53 73
- 04 24 06 26 44 64 46 66
- 14 34 16 36 54 74 56 76
-...
-*/
-/*
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+
 Temp
- 00 02 10 12 20 22 30 32
- 40 42 50 52 60 62 70 72
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
  01 03 11 13 21 23 31 33
  41 43 51 53 61 63 71 73
- 04 06 14 16 24 26 34 36
- 44 46 54 56 64 66 74 76
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
  05 07 15 17 25 27 35 37
  45 47 55 57 65 67 75 77
 */
 
-/*
-Output
- 00 10 20 30 40 50 60 70
- 01 11 21 31 41 51 61 71
-...
-*/
-
 "9: \n\t"
                :: "r" (block), "r" (temp), "r" (coeffs)
                : "%eax"
        );
-/*
-idctCol(block, temp);
-idctCol(block+1, temp+2);
-idctCol(block+2, temp+4);
-idctCol(block+3, temp+6);
-idctCol(block+4, temp+8);
-idctCol(block+5, temp+10);
-idctCol(block+6, temp+12);
-idctCol(block+7, temp+14);
-*/
 }
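
For reference, the column pass above follows the usual even/odd decomposition of an 8-point IDCT: an even part A0..A3 built from rows 0/2/4/6, an odd part B0..B3 built from rows 1/3/5/7, and butterflies that store (A+B)>>shift into row x and (A-B)>>shift into row 7-x, which is exactly the pattern of the stores at byte offsets 0, 16, ..., 112 from dst. Below is a minimal scalar sketch of one column; it is illustrative only (floating point, standard JPEG normalization, function name made up here), whereas the MMX code does the same thing in fixed point with pmaddwd/psrad and folds the scaling into the shift argument (20 in the invocations above).

#include <math.h>
#include <stdint.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Illustrative only: one column (stride 8) of the 8x8 block in floating
 * point.  'a' plays the role of A0..A3 and 'b' of B0..B3 in the asm
 * comments; row x receives a+b and row 7-x receives a-b. */
static void idct_col_sketch(int16_t *col)
{
    double out[8];
    int x, u;

    for (x = 0; x < 4; x++) {
        double a = 0.0, b = 0.0;
        for (u = 0; u < 8; u += 2)      /* even inputs: rows 0,2,4,6 */
            a += (u ? 1.0 : 1.0 / sqrt(2.0)) * col[8 * u] *
                 cos((2 * x + 1) * u * M_PI / 16.0);
        for (u = 1; u < 8; u += 2)      /* odd inputs: rows 1,3,5,7 */
            b += col[8 * u] * cos((2 * x + 1) * u * M_PI / 16.0);
        out[x]     = 0.5 * (a + b);     /* top half:    A+B */
        out[7 - x] = 0.5 * (a - b);     /* bottom half: A-B, mirrored */
    }
    for (x = 0; x < 8; x++)
        col[8 * x] = (int16_t)lrint(out[x]);
}

The special-case variants after labels 5, 1 and 7 are the same computation with the rows known to be zero dropped, which is why B disappears entirely in the label 5 and 7 versions and the A+B / A-B pairs collapse into duplicated stores.
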
 
 void simple_idct_mmx(int16_t *block)
 {
-       static int imax=0, imin=0;
-       static int omax=0, omin=0;
-       int i, j;
-/*
-       for(i=0; i<64; i++)
-       {
-               if(block[i] > imax)
-               {
-                       imax= block[i];
-                       printf("Input-Max: %d\n", imax);
-                       printf("Input-Min: %d\n", imin);
-                       printf("Output-Max: %d\n", omax);
-                       printf("Output-Min: %d\n", omin);
-               }
-               if(block[i] < imin)
-               {
-                       imin= block[i];
-                       printf("Input-Max: %d\n", imax);
-                       printf("Input-Min: %d\n", imin);
-                       printf("Output-Max: %d\n", omax);
-                       printf("Output-Min: %d\n", omin);
-               }
-       }*/
-/*     static int stat[64];
-       for(j=0; j<4; j++)
-       {
-               static int line[8]={0,2,1,3,4,6,5,7};
-               for(i=0; i<16; i++)
-               {
-                       if(block[j*16+i])
-                       {
-                               stat[j*16+1]++;
-                               break;
-                       }
-               }
-               for(i=0; i<16; i++)
-               {
-                       if(block[j*16+i] && i!=0 && i!=2)
-                       {
-                               stat[j*16+2]++;
-                               break;
-                       }
-               }
-       }
-       stat[0]++;*/
-/*     for(i=1; i<8; i++)
-       {
-               if(block[i] != 0)
-               {
-                       stat[1]++;
-                       break;
-               }
-       }
-       for(i=32; i<64; i++)
-       {
-               if(block[i] != 0)
-               {
-                       stat[2]++;
-                       break;
-               }
-       }
-       stat[0]++;
-*/
-//     return;
        idct(block);
-//     memset(block, 0, 128);
-/*
-       if(stat[0] > 100000)
-               for(i=0; i<64; i++)
-               {
-                       if((i&7) == 0) printf("\n");
-                       printf("%06d ", stat[i]);
-               }
-*/
-/*
-       for(i=0; i<4; i++) printf("%d", stat[1+i*16]);
-       printf("  ");
-       for(i=0; i<4; i++) printf("%d", stat[2+i*16]);
-       printf("\n");
-*/
-//     printf("%d", stat[2]);
-
-//     memset(stat, 0, 256);
-
-/*
-       for(i=0; i<64; i++)
-       {
-               if(block[i] > omax)
-               {
-                       omax= block[i];
-                       printf("Input-Max: %d\n", imax);
-                       printf("Input-Min: %d\n", imin);
-                       printf("Output-Max: %d\n", omax);
-                       printf("Output-Min: %d\n", omin);
-               }
-               if(block[i] < omin)
-               {
-                       omin= block[i];
-                       printf("Input-Max: %d\n", imax);
-                       printf("Input-Min: %d\n", imin);
-                       printf("Output-Max: %d\n", omax);
-                       printf("Output-Min: %d\n", omin);
-               }
-       }*/
 }
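
The permuted "Input" layout documented in the comment block above means the row pass does not read coefficients in natural order: they have to be scattered into that order before the IDCT runs, which is what an input permutation table is for. A minimal sketch of applying such a permutation follows; both the helper name and the dst[perm[i]] = src[i] direction are assumptions made for illustration, since the actual direction is fixed by how the surrounding code builds its scan tables.

#include <stdint.h>

/* Hypothetical helper, not part of this patch: scatter natural-order
 * coefficients into the permuted layout the MMX row pass expects. */
static void permute_input_sketch(int16_t *dst, const int16_t *src,
                                 const uint8_t *perm)
{
    int i;
    for (i = 0; i < 64; i++)
        dst[perm[i]] = src[i];
}

In practice the permutation is usually folded into the inverse scan and the quantisation matrices, so no explicit per-block pass like this is needed.
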