OSDN Git Service

d78ae6b5083852f59bd7e3c0aba4ee615903bd45
[coroid/libav_saccubus.git] / libavcodec / x86 / simple_idct_mmx.c
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
24 #include "dsputil_mmx.h"
25
26 /* Exact values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
27 23170.475006
28 22725.260826
29 21406.727617
30 19265.545870
31 16384.000000
32 12872.826198
33 8866.956905
34 4520.335430
35 */
/*
 * Fixed-point cosine factors: Ci = (int)(cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5),
 * i.e. the exact values listed in the comment above, rounded to nearest.
 * C4 is the one exception: its exact value is 16384, but the active #else
 * branch uses 16383 (the "- 0.5" variant); the 16384 definition is compiled out.
 */
36 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40 #if 0
41 #define C4 16384 //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #else
43 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
44 #endif
45 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C6 8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C7 4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48
/*
 * Right-shift amounts for the two passes. The C0..C7 factors carry a 1<<14
 * scale. ROW_SHIFT sets the row rounder 1<<(ROW_SHIFT-1) in the coeffs table;
 * both names are otherwise used only by the compiled-out C functions
 * idctRow()/idctCol() in the visible part of this file, where the asm macro
 * invocations pass the literal values 11 and 20 instead.
 */
49 #define ROW_SHIFT 11
50 #define COL_SHIFT 20 // 6   (original note; its meaning is not evident from this part of the file)
51
/*
 * 64-bit constants referenced from the inline asm through MANGLE().
 *
 * wm1010: word mask that keeps 16-bit words 1 and 3 and clears words 0 and 2.
 *         DC_COND_IDCT ANDs it with the first source quadword (annotated
 *         "R4 R0 r4 r0") and ORs in the other three quadwords before testing
 *         for zero, so the r0/R0 words are left out of that test.
 * d40000: 0x40000 in the low 32-bit lane, 0 in the high lane. DC_COND_IDCT
 *         adds it in its all-zero shortcut between "pslld $16" and
 *         "psrad $13"; 0x40000 >> 13 is 32, the same bias that the second
 *         coeffs row folds in (see the "the 1 = ..." note in that table).
 */
52 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
53 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
54
/*
 * Constant table for the MMX code, laid out as rows of four int16_t (8 bytes
 * per row). The byte offsets noted on each row are the displacements the
 * inline asm in idct() uses as N(%2), and the register comments there list
 * the same coefficients; presumably %2 is bound to this array -- the asm
 * operand list is not visible in this part of the file.
 *
 * Each coefficient row repeats one pair twice, so a single pmaddwd against
 * four input words produces two 32-bit sums.
 */
55 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
56         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: rounder, 1<<(ROW_SHIFT-1) in each 32-bit lane
57 //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
58 //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
59         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: same rounder plus 1<<16 in the low lane only
60         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
           // i.e. (1<<19)/C4 is 32 and 32<<11 is 1<<16, which is a 1 in the upper 16-bit word of the low lane
61 //        0, 0, 0, 0,
62 //        0, 0, 0, 0,
63
64  C4,  C4,  C4,  C4, // offset 16
65  C4, -C4,  C4, -C4, // offset 24
66
67  C2,  C6,  C2,  C6, // offset 32
68  C6, -C2,  C6, -C2, // offset 40
69
70  C1,  C3,  C1,  C3, // offset 48
71  C5,  C7,  C5,  C7, // offset 56
72
73  C3, -C7,  C3, -C7, // offset 64
74 -C1, -C5, -C1, -C5, // offset 72
75
76  C5, -C1,  C5, -C1, // offset 80
77  C7,  C3,  C7,  C3, // offset 88
78
79  C7, -C5,  C7, -C5, // offset 96
80  C3, -C1,  C3, -C1  // offset 104
81 };
82
83 #if 0
/*
 * Compiled out (inside #if 0). Reads wm1010 and d40000 from C code and stores
 * their sum, truncated to int, into temp[0]. Judging by the name, this exists
 * only so that the two constants count as used.
 * NOTE(review): no `temp` is declared in this function or at file scope in the
 * visible part of the file, so this would presumably not compile if enabled.
 */
84 static void unused_var_killer(void)
85 {
86         int a= wm1010 + d40000;
87         temp[0]=a;
88 }
89
/*
 * Compiled out (inside #if 0). Plain-C 8-point column pass.
 *
 * col:   column to transform in place; its eight entries are col[8*k] for
 *        k = 0..7, i.e. 8 int16_t apart.
 * input: source the column is gathered from before the transform (entries 0
 *        and 1 of the int16_t groups starting at input[0], input[16],
 *        input[32] and input[48], in the order spelled out below).
 *
 * The results are rounded with 1<<(COL_SHIFT-1), shifted right by COL_SHIFT
 * and written back to col[8*0..8*7].
 */
90 static void inline idctCol (int16_t * col, int16_t *input)
91 {
/* Drop the file-level C0..C7 macros so the same names can be used for the
   const int locals below. Were this block enabled, the macros would stay
   undefined from here on; idctRow() below relies on that for its own locals. */
92 #undef C0
93 #undef C1
94 #undef C2
95 #undef C3
96 #undef C4
97 #undef C5
98 #undef C6
99 #undef C7
100         int a0, a1, a2, a3, b0, b1, b2, b3;
101         const int C0 = 23170; //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (not used in this function)
102         const int C1 = 22725; //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103         const int C2 = 21407; //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104         const int C3 = 19266; //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105         const int C4 = 16383; //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
106         const int C5 = 12873; //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107         const int C6 = 8867;  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
108         const int C7 = 4520;  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
109 /*
110         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
111                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
112                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
113                 return;
114         }*/
115
/* Gather the column from input[]; this overwrites col[8*0..8*7]. */
116 col[8*0] = input[8*0 + 0];
117 col[8*1] = input[8*2 + 0];
118 col[8*2] = input[8*0 + 1];
119 col[8*3] = input[8*2 + 1];
120 col[8*4] = input[8*4 + 0];
121 col[8*5] = input[8*6 + 0];
122 col[8*6] = input[8*4 + 1];
123 col[8*7] = input[8*6 + 1];
124
/* Even part: entries 0, 2, 4, 6, with the rounding bias added once here. */
125         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
126         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
127         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
128         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
129
/* Odd part: entries 1, 3, 5, 7. */
130         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
131         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
132         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
133         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
134
/* Butterfly: entry k gets a_k + b_k and entry 7-k gets a_k - b_k (k = 0..3). */
135         col[8*0] = (a0 + b0) >> COL_SHIFT;
136         col[8*1] = (a1 + b1) >> COL_SHIFT;
137         col[8*2] = (a2 + b2) >> COL_SHIFT;
138         col[8*3] = (a3 + b3) >> COL_SHIFT;
139         col[8*4] = (a3 - b3) >> COL_SHIFT;
140         col[8*5] = (a2 - b2) >> COL_SHIFT;
141         col[8*6] = (a1 - b1) >> COL_SHIFT;
142         col[8*7] = (a0 - b0) >> COL_SHIFT;
143 }
144
/*
 * Compiled out (inside #if 0). Plain-C 8-point row pass.
 *
 * output: receives the eight results at even indices output[0], output[2],
 *         ..., output[14].
 * input:  source the row is gathered from; only input[0], [1], [4], [5],
 *         [8], [9], [12] and [13] are read (mapping spelled out below).
 *
 * If every entry except row[0] is zero, all eight outputs are row[0]<<3.
 * Otherwise the results are rounded with 1<<(ROW_SHIFT-1) and shifted right
 * by ROW_SHIFT.
 *
 * The C0..C7 locals only work because idctCol() above #undef's the macros of
 * the same names earlier in the file.
 */
145 static void inline idctRow (int16_t * output, int16_t * input)
146 {
147         int16_t row[8];
148
149         int a0, a1, a2, a3, b0, b1, b2, b3;
150         const int C0 = 23170; //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (not used in this function)
151         const int C1 = 22725; //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152         const int C2 = 21407; //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153         const int C3 = 19266; //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154         const int C4 = 16383; //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
155         const int C5 = 12873; //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156         const int C6 = 8867;  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
157         const int C7 = 4520;  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
158
/* Gather the row into natural order 0..7 from the interleaved input. */
159 row[0] = input[0];
160 row[2] = input[1];
161 row[4] = input[4];
162 row[6] = input[5];
163 row[1] = input[8];
164 row[3] = input[9];
165 row[5] = input[12];
166 row[7] = input[13];
167
/* Shortcut when only row[0] is non-zero: every output is row[0]<<3.
   NOTE(review): row[0] may be negative, and left-shifting a negative value is
   undefined in ISO C; the result is also narrowed back to int16_t. */
168         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
169                 row[0] = row[1] = row[2] = row[3] = row[4] =
170                         row[5] = row[6] = row[7] = row[0]<<3;
171         output[0]  = row[0];
172         output[2]  = row[1];
173         output[4]  = row[2];
174         output[6]  = row[3];
175         output[8]  = row[4];
176         output[10] = row[5];
177         output[12] = row[6];
178         output[14] = row[7];
179                 return;
180         }
181
/* Even part: entries 0, 2, 4, 6, with the rounding bias added once here. */
182         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
183         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
184         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
185         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
186
/* Odd part: entries 1, 3, 5, 7. */
187         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
188         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
189         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
190         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
191
/* Butterfly: entry k gets a_k + b_k and entry 7-k gets a_k - b_k (k = 0..3). */
192         row[0] = (a0 + b0) >> ROW_SHIFT;
193         row[1] = (a1 + b1) >> ROW_SHIFT;
194         row[2] = (a2 + b2) >> ROW_SHIFT;
195         row[3] = (a3 + b3) >> ROW_SHIFT;
196         row[4] = (a3 - b3) >> ROW_SHIFT;
197         row[5] = (a2 - b2) >> ROW_SHIFT;
198         row[6] = (a1 - b1) >> ROW_SHIFT;
199         row[7] = (a0 - b0) >> ROW_SHIFT;
200
/* Scatter the eight results to the even positions of output[]. */
201         output[0]  = row[0];
202         output[2]  = row[1];
203         output[4]  = row[2];
204         output[6]  = row[3];
205         output[8]  = row[4];
206         output[10] = row[5];
207         output[12] = row[6];
208         output[14] = row[7];
209 }
210 #endif
211
212 static inline void idct(int16_t *block)
213 {
214         DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
215         int16_t * const temp= (int16_t*)align_tmp;
216
217         __asm__ volatile(
218 #if 0 //Alternative, simpler variant
219
220 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
221         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
222         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
223         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
224         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
225         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
226         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
227         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
228         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
229         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
230         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
231         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
232         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
233         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
234         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
235         #rounder ", %%mm4               \n\t"\
236         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
237         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
238         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
239         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
240         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
241         #rounder ", %%mm0               \n\t"\
242         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
243         "paddd %%mm0, %%mm0             \n\t" \
244         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
245         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
246         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
247         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
248         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
249         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
250         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
251         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
252         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
253         "psrad $" #shift ", %%mm7       \n\t"\
254         "psrad $" #shift ", %%mm4       \n\t"\
255         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
256         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
257         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
258         "psrad $" #shift ", %%mm1       \n\t"\
259         "psrad $" #shift ", %%mm2       \n\t"\
260         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
261         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
262         "movq %%mm7, " #dst "           \n\t"\
263         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
264         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
265         "movq %%mm2, 24+" #dst "        \n\t"\
266         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
267         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
268         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
269         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
270         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
271         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
272         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
273         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
274         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
275         "psrad $" #shift ", %%mm2       \n\t"\
276         "psrad $" #shift ", %%mm0       \n\t"\
277         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
278         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
279         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
280         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
281         "psrad $" #shift ", %%mm6       \n\t"\
282         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
283         "movq %%mm2, 8+" #dst "         \n\t"\
284         "psrad $" #shift ", %%mm4       \n\t"\
285         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
286         "movq %%mm4, 16+" #dst "        \n\t"\
287
288 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
289         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
290         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
291         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
292         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
293         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
294         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
295         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
296         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
297         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
298         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
299         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
300         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
301         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
302         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
303         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
304         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
305         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
306         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
307         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
308         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
309         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
310         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
311         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
312         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
313         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
314         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
315         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
316         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
317         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
318         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
319         "psrad $" #shift ", %%mm7       \n\t"\
320         "psrad $" #shift ", %%mm4       \n\t"\
321         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
322         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
323         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
324         "psrad $" #shift ", %%mm0       \n\t"\
325         "psrad $" #shift ", %%mm2       \n\t"\
326         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
327         "movd %%mm7, " #dst "           \n\t"\
328         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
329         "movd %%mm0, 16+" #dst "        \n\t"\
330         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
331         "movd %%mm2, 96+" #dst "        \n\t"\
332         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
333         "movd %%mm4, 112+" #dst "       \n\t"\
334         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
335         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
336         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
337         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
338         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
339         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
340         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
341         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
342         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
343         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
344         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
345         "psrad $" #shift ", %%mm2       \n\t"\
346         "psrad $" #shift ", %%mm5       \n\t"\
347         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
348         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
349         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
350         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
351         "psrad $" #shift ", %%mm6       \n\t"\
352         "psrad $" #shift ", %%mm4       \n\t"\
353         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
354         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
355         "movd %%mm2, 32+" #dst "        \n\t"\
356         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
357         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
358         "movd %%mm6, 48+" #dst "        \n\t"\
359         "movd %%mm4, 64+" #dst "        \n\t"\
360         "movd %%mm5, 80+" #dst "        \n\t"\
361
362
363 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
364         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
365         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
366         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
367         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
368         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
369         "pand %%mm0, %%mm4              \n\t"\
370         "por %%mm1, %%mm4               \n\t"\
371         "por %%mm2, %%mm4               \n\t"\
372         "por %%mm3, %%mm4               \n\t"\
373         "packssdw %%mm4,%%mm4           \n\t"\
374         "movd %%mm4, %%eax              \n\t"\
375         "orl %%eax, %%eax               \n\t"\
376         "jz 1f                          \n\t"\
377         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
378         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
379         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
380         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
381         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
382         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
383         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
384         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
385         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
386         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
387         #rounder ", %%mm4               \n\t"\
388         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
389         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
390         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
391         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
392         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
393         #rounder ", %%mm0               \n\t"\
394         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
395         "paddd %%mm0, %%mm0             \n\t" \
396         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
397         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
398         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
399         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
400         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
401         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
402         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
403         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
404         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
405         "psrad $" #shift ", %%mm7       \n\t"\
406         "psrad $" #shift ", %%mm4       \n\t"\
407         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
408         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
409         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
410         "psrad $" #shift ", %%mm1       \n\t"\
411         "psrad $" #shift ", %%mm2       \n\t"\
412         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
413         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
414         "movq %%mm7, " #dst "           \n\t"\
415         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
416         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
417         "movq %%mm2, 24+" #dst "        \n\t"\
418         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
419         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
420         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
421         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
422         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
423         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
424         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
425         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
426         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
427         "psrad $" #shift ", %%mm2       \n\t"\
428         "psrad $" #shift ", %%mm0       \n\t"\
429         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
430         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
431         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
432         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
433         "psrad $" #shift ", %%mm6       \n\t"\
434         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
435         "movq %%mm2, 8+" #dst "         \n\t"\
436         "psrad $" #shift ", %%mm4       \n\t"\
437         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
438         "movq %%mm4, 16+" #dst "        \n\t"\
439         "jmp 2f                         \n\t"\
440         "1:                             \n\t"\
441         "pslld $16, %%mm0               \n\t"\
442         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
443         "psrad $13, %%mm0               \n\t"\
444         "packssdw %%mm0, %%mm0          \n\t"\
445         "movq %%mm0, " #dst "           \n\t"\
446         "movq %%mm0, 8+" #dst "         \n\t"\
447         "movq %%mm0, 16+" #dst "        \n\t"\
448         "movq %%mm0, 24+" #dst "        \n\t"\
449         "2:                             \n\t"
450
451
452 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
453 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
454 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
455 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
456 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
457
458 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
459 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
460 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
461
462
463 //IDCT(      src0,   src4,   src1,    src5,    dst, shift)
464 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
465 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
466 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
467 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
468
469 #else
470
471 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
472         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
473         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
474         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
475         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
476         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
477         "pand %%mm0, %%mm4              \n\t"\
478         "por %%mm1, %%mm4               \n\t"\
479         "por %%mm2, %%mm4               \n\t"\
480         "por %%mm3, %%mm4               \n\t"\
481         "packssdw %%mm4,%%mm4           \n\t"\
482         "movd %%mm4, %%eax              \n\t"\
483         "orl %%eax, %%eax               \n\t"\
484         "jz 1f                          \n\t"\
485         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
486         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
487         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
488         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
489         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
490         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
491         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
492         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
493         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
494         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
495         #rounder ", %%mm4               \n\t"\
496         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
497         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
498         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
499         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
500         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
501         #rounder ", %%mm0               \n\t"\
502         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
503         "paddd %%mm0, %%mm0             \n\t" \
504         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
505         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
506         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
507         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
508         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
509         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
510         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
511         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
512         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
513         "psrad $" #shift ", %%mm7       \n\t"\
514         "psrad $" #shift ", %%mm4       \n\t"\
515         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
516         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
517         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
518         "psrad $" #shift ", %%mm1       \n\t"\
519         "psrad $" #shift ", %%mm2       \n\t"\
520         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
521         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
522         "movq %%mm7, " #dst "           \n\t"\
523         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
524         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
525         "movq %%mm2, 24+" #dst "        \n\t"\
526         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
527         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
528         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
529         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
530         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
531         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
532         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
533         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
534         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
535         "psrad $" #shift ", %%mm2       \n\t"\
536         "psrad $" #shift ", %%mm0       \n\t"\
537         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
538         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
539         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
540         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
541         "psrad $" #shift ", %%mm6       \n\t"\
542         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
543         "movq %%mm2, 8+" #dst "         \n\t"\
544         "psrad $" #shift ", %%mm4       \n\t"\
545         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
546         "movq %%mm4, 16+" #dst "        \n\t"\
547         "jmp 2f                         \n\t"\
548         "1:                             \n\t"\
549         "pslld $16, %%mm0               \n\t"\
550         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
551         "psrad $13, %%mm0               \n\t"\
552         "packssdw %%mm0, %%mm0          \n\t"\
553         "movq %%mm0, " #dst "           \n\t"\
554         "movq %%mm0, 8+" #dst "         \n\t"\
555         "movq %%mm0, 16+" #dst "        \n\t"\
556         "movq %%mm0, 24+" #dst "        \n\t"\
557         "2:                             \n\t"
558
/*
 * Z_COND_IDCT: first-pass IDCT step with an all-zero shortcut.
 *
 *   src0, src4, src1, src5  memory operands of the four input quadwords,
 *                           loaded into mm0..mm3 (lane contents as noted on
 *                           the loads below)
 *   dst      memory operand; results go to dst, 8+dst, 16+dst and 24+dst
 *   rounder  instruction text without its destination operand (the callers
 *            in this file pass: paddd (%2)); it is emitted twice, once
 *            targeting mm4 and once targeting mm0
 *   shift    immediate for the arithmetic right shift applied before packing
 *   bt       label branched to when all four input quadwords are zero; on
 *            that path nothing is stored to dst
 *
 * Coefficients are read from offsets 16..104 of operand %2.
 * Writes eax, the flags and mm0-mm7.
 * In the lane comments, upper- and lower-case names denote the two 32-bit
 * lanes that pmaddwd/paddd/psrad process in parallel.
 */
559 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
560         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
561         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
562         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
563         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
564         "movq %%mm0, %%mm4              \n\t" /* mm4 = OR of all four inputs (next 3 insns) */\
565         "por %%mm1, %%mm4               \n\t"\
566         "por %%mm2, %%mm4               \n\t"\
567         "por %%mm3, %%mm4               \n\t"\
568         "packssdw %%mm4,%%mm4           \n\t" /* fold both dwords into the low 32 bits */\
569         "movd %%mm4, %%eax              \n\t"\
570         "orl %%eax, %%eax               \n\t" /* ZF set iff every input word is 0 */\
571         "jz " #bt "                     \n\t" /* all zero: skip, dst is left unwritten */\
572         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
573         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
574         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
575         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
576         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
577         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
578         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
579         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
580         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
581         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
582         #rounder ", %%mm4               \n\t"\
583         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
584         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
585         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
586         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
587         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
588         #rounder ", %%mm0               \n\t"\
589         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
590         "paddd %%mm0, %%mm0             \n\t" \
591         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
592         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
593         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
594         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
595         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
596         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
597         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
598         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
599         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
600         "psrad $" #shift ", %%mm7       \n\t"\
601         "psrad $" #shift ", %%mm4       \n\t"\
602         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
603         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
604         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
605         "psrad $" #shift ", %%mm1       \n\t"\
606         "psrad $" #shift ", %%mm2       \n\t"\
607         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
608         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
609         "movq %%mm7, " #dst "           \n\t"\
610         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
611         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
612         "movq %%mm2, 24+" #dst "        \n\t"\
613         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
614         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
615         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
616         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
617         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
618         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
619         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
620         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
621         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
622         "psrad $" #shift ", %%mm2       \n\t"\
623         "psrad $" #shift ", %%mm0       \n\t"\
624         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
625         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
626         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
627         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
628         "psrad $" #shift ", %%mm6       \n\t"\
629         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
630         "movq %%mm2, 8+" #dst "         \n\t"\
631         "psrad $" #shift ", %%mm4       \n\t"\
632         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
633         "movq %%mm4, 16+" #dst "        \n\t"\
634
/*
 * ROW_IDCT: unconditional form of the first-pass IDCT step.
 *
 * Same arithmetic as Z_COND_IDCT, but without the all-zero test and branch,
 * so it takes no bt argument and always stores to dst.
 *
 *   src0, src4, src1, src5  memory operands of the four input quadwords
 *   dst      memory operand; results go to dst, 8+dst, 16+dst and 24+dst
 *   rounder  instruction text without its destination operand; emitted
 *            twice, once targeting mm4 and once targeting mm0
 *   shift    immediate for the arithmetic right shift applied before packing
 *
 * Coefficients are read from offsets 16..104 of operand %2.
 * Writes mm0-mm7.
 */
635 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
636         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
637         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
638         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
639         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
640         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
641         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
642         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
643         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
644         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
645         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
646         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
647         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
648         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
649         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
650         #rounder ", %%mm4               \n\t"\
651         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
652         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
653         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
654         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
655         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
656         #rounder ", %%mm0               \n\t"\
657         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
658         "paddd %%mm0, %%mm0             \n\t" \
659         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
660         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
661         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
662         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
663         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
664         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
665         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
666         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
667         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
668         "psrad $" #shift ", %%mm7       \n\t"\
669         "psrad $" #shift ", %%mm4       \n\t"\
670         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
671         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
672         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
673         "psrad $" #shift ", %%mm1       \n\t"\
674         "psrad $" #shift ", %%mm2       \n\t"\
675         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
676         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
677         "movq %%mm7, " #dst "           \n\t"\
678         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
679         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
680         "movq %%mm2, 24+" #dst "        \n\t"\
681         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
682         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
683         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
684         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
685         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
686         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
687         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
688         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
689         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
690         "psrad $" #shift ", %%mm2       \n\t"\
691         "psrad $" #shift ", %%mm0       \n\t"\
692         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
693         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
694         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
695         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
696         "psrad $" #shift ", %%mm6       \n\t"\
697         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
698         "movq %%mm2, 8+" #dst "         \n\t"\
699         "psrad $" #shift ", %%mm4       \n\t"\
700         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
701         "movq %%mm4, 16+" #dst "        \n\t"\
702
/*
 * First pass, default path: four calls, each reading 32 bytes from %0 and
 * writing 32 bytes to the same offset in %1, with a shift of 11.
 * The first group goes through DC_COND_IDCT with rounder paddd 8(%2); the
 * other three go through Z_COND_IDCT with rounder paddd (%2) and take an
 * extra last argument: the label to branch to (4, 2 or 1) when that group's
 * input is entirely zero.
 */
703 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
704 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
705 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
706 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
707 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
708
/*
 * IDCT, full variant: second-pass step that uses all four inputs.
 *
 *   src0, src4, src1, src5  memory operands of the four input quadwords
 *   dst    memory operand; eight 32-bit stores (movd of two packed 16-bit
 *          values each) go to dst + 0, 16, 32, 48, 64, 80, 96 and 112
 *   shift  immediate for the arithmetic right shift applied before packing
 *
 * No rounder is added in this macro.  Coefficients are read from offsets
 * 16..104 of operand %2.  Writes mm0-mm7.
 * IDCT is #undef'd and redefined several times in this file; this definition
 * stays in effect only until the next #undef IDCT.
 */
709 #undef IDCT
710 #define IDCT(src0, src4, src1, src5, dst, shift) \
711         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
712         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
713         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
714         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
715         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
716         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
717         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
718         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
719         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
720         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
721         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
722         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
723         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
724         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
725         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
726         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
727         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
728         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
729         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
730         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
731         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
732         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
733         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
734         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
735         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
736         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
737         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
738         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
739         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
740         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
741         "psrad $" #shift ", %%mm7       \n\t"\
742         "psrad $" #shift ", %%mm4       \n\t"\
743         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
744         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
745         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
746         "psrad $" #shift ", %%mm0       \n\t"\
747         "psrad $" #shift ", %%mm2       \n\t"\
748         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
749         "movd %%mm7, " #dst "           \n\t"\
750         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
751         "movd %%mm0, 16+" #dst "        \n\t"\
752         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
753         "movd %%mm2, 96+" #dst "        \n\t"\
754         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
755         "movd %%mm4, 112+" #dst "       \n\t"\
756         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
757         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
758         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
759         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
760         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
761         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
762         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
763         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
764         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
765         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
766         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
767         "psrad $" #shift ", %%mm2       \n\t"\
768         "psrad $" #shift ", %%mm5       \n\t"\
769         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
770         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
771         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
772         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
773         "psrad $" #shift ", %%mm6       \n\t"\
774         "psrad $" #shift ", %%mm4       \n\t"\
775         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
776         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
777         "movd %%mm2, 32+" #dst "        \n\t"\
778         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
779         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
780         "movd %%mm6, 48+" #dst "        \n\t"\
781         "movd %%mm4, 64+" #dst "        \n\t"\
782         "movd %%mm5, 80+" #dst "        \n\t"
783
784
/*
 * Second pass, default path (no Z_COND_IDCT branch was taken): four calls,
 * each reading quadwords from %1 (source offsets step by 8 bytes) and
 * writing to %0 (destination offsets step by 4 bytes) with a shift of 20,
 * then jumping to label 9.
 */
785 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
786 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
787 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
788 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
789 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
790         "jmp 9f                         \n\t"
791
/*
 * Label 4: target of the 4f branch, i.e. the 32(%0) input group was all zero
 * and 32(%1)..56(%1) were not written.  The remaining two groups are
 * transformed here, branching on to label 6 or 5 if one of them is zero too.
 * NOTE(review): the leading # appears to turn the .p2align line into an
 * assembler comment, so no alignment is emitted - confirm.
 */
792         "# .p2align 4                   \n\t"\
793         "4:                             \n\t"
794 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796
/*
 * IDCT, variant without src1: second-pass step for the path where the src1
 * input is known to be zero.
 *
 * The src1 parameter is accepted for call compatibility but never loaded,
 * and every term that would be derived from it is dropped; the B terms
 * therefore come from src5 alone.
 *
 *   src0, src4, src5  memory operands of the input quadwords actually read
 *   dst    memory operand; eight 32-bit stores go to dst + 0, 16, 32, 48,
 *          64, 80, 96 and 112
 *   shift  immediate for the arithmetic right shift applied before packing
 *
 * No rounder is added in this macro.  Writes mm0-mm7.
 */
797 #undef IDCT
798 #define IDCT(src0, src4, src1, src5, dst, shift) \
799         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
800         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
801         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
802         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
803         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
804         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
805         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
806         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
807         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
808         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
809         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
810         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
811         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
812         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
813         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
814         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
815         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
816         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
817         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
818         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
819         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
820         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
821         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
822         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
823         "psrad $" #shift ", %%mm1       \n\t"\
824         "psrad $" #shift ", %%mm4       \n\t"\
825         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
826         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
827         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
828         "psrad $" #shift ", %%mm0       \n\t"\
829         "psrad $" #shift ", %%mm2       \n\t"\
830         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
831         "movd %%mm1, " #dst "           \n\t"\
832         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
833         "movd %%mm0, 16+" #dst "        \n\t"\
834         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
835         "movd %%mm2, 96+" #dst "        \n\t"\
836         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
837         "movd %%mm4, 112+" #dst "       \n\t"\
838         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
839         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
840         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
841         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
842         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
843         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
844         "psrad $" #shift ", %%mm2       \n\t"\
845         "psrad $" #shift ", %%mm5       \n\t"\
846         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
847         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
848         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
849         "psrad $" #shift ", %%mm6       \n\t"\
850         "psrad $" #shift ", %%mm1       \n\t"\
851         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
852         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
853         "movd %%mm2, 32+" #dst "        \n\t"\
854         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
855         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
856         "movd %%mm6, 48+" #dst "        \n\t"\
857         "movd %%mm1, 64+" #dst "        \n\t"\
858         "movd %%mm5, 80+" #dst "        \n\t"
859
/*
 * Second pass for the label-4 path (32(%0) group was zero): the IDCT variant
 * in effect here does not read its src1 argument, so the unwritten
 * 32(%1)..56(%1) slots are passed but never loaded.  Ends with a jump to
 * label 9.
 */
860 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
861 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
862 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
863 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
864 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
865         "jmp 9f                         \n\t"
866
/*
 * Label 6: target of the 6f branch taken on the label-4 path, i.e. both the
 * 32(%0) and the 64(%0) input groups were all zero.  Only the last group is
 * left to transform; if it is zero as well, control goes to label 7.
 */
867         "# .p2align 4                   \n\t"\
868         "6:                             \n\t"
869 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
870
/*
 * IDCT, variant using only src0 and src5: second-pass step for the path
 * where the src4 and src1 inputs are known to be zero.
 *
 * The src4 and src1 parameters are accepted for call compatibility but never
 * loaded.  With src4 absent, A0 and A3 are both the C4R4+C4R0 product and
 * A1 and A2 are both the -C4R4+C4R0 product; the B terms come from src5
 * alone.
 *
 *   src0, src5  memory operands of the input quadwords actually read
 *   dst    memory operand; eight 32-bit stores go to dst + 0, 16, 32, 48,
 *          64, 80, 96 and 112
 *   shift  immediate for the arithmetic right shift applied before packing
 *
 * No rounder is added in this macro.  Writes mm0-mm7.
 */
871 #undef IDCT
872 #define IDCT(src0, src4, src1, src5, dst, shift) \
873         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
874         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
875         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
876         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
877         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
878         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
879         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
880         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
881         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
882         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
883         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
884         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
885         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
886         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
887         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
888         "psrad $" #shift ", %%mm1       \n\t"\
889         "psrad $" #shift ", %%mm4       \n\t"\
890         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
891         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
892         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
893         "psrad $" #shift ", %%mm0       \n\t"\
894         "psrad $" #shift ", %%mm2       \n\t"\
895         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
896         "movd %%mm1, " #dst "           \n\t"\
897         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
898         "movd %%mm0, 16+" #dst "        \n\t"\
899         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
900         "movd %%mm2, 96+" #dst "        \n\t"\
901         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
902         "movd %%mm4, 112+" #dst "       \n\t"\
903         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
904         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
905         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
906         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
907         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
908         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
909         "psrad $" #shift ", %%mm2       \n\t"\
910         "psrad $" #shift ", %%mm5       \n\t"\
911         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
912         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
913         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
914         "psrad $" #shift ", %%mm6       \n\t"\
915         "psrad $" #shift ", %%mm1       \n\t"\
916         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
917         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
918         "movd %%mm2, 32+" #dst "        \n\t"\
919         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
920         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
921         "movd %%mm6, 48+" #dst "        \n\t"\
922         "movd %%mm1, 64+" #dst "        \n\t"\
923         "movd %%mm5, 80+" #dst "        \n\t"
924
925
/*
 * Second pass for the label-6 path (32(%0) and 64(%0) groups were zero): the
 * IDCT variant in effect here reads only its src0 and src5 arguments, so the
 * unwritten 32(%1).. and 64(%1).. slots are passed but never loaded.  Ends
 * with a jump to label 9.
 */
926 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
927 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
928 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
929 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
930 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
931         "jmp 9f                         \n\t"
932
/*
 * Label 2: target of the 2f branch taken on the default path, i.e. the
 * 64(%0) input group was all zero (the 32(%0) group was not).  Only the last
 * group is left to transform; if it is zero as well, control goes to label 3.
 */
933         "# .p2align 4                   \n\t"\
934         "2:                             \n\t"
935 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
936
/*
 * IDCT, variant without src4: second-pass step for the path where the src4
 * input is known to be zero.
 *
 * The src4 parameter is accepted for call compatibility but never loaded.
 * With src4 absent, A0 and A3 are both the C4R4+C4R0 product and A1 and A2
 * are both the -C4R4+C4R0 product; the B terms use src1 and src5 as in the
 * full variant.
 *
 *   src0, src1, src5  memory operands of the input quadwords actually read
 *   dst    memory operand; eight 32-bit stores go to dst + 0, 16, 32, 48,
 *          64, 80, 96 and 112
 *   shift  immediate for the arithmetic right shift applied before packing
 *
 * No rounder is added in this macro.  Writes mm0-mm7.
 */
937 #undef IDCT
938 #define IDCT(src0, src4, src1, src5, dst, shift) \
939         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
940         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
941         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
942         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
943         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
944         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
945         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
946         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
948         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
949         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
950         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
951         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
952         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
953         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
954         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
955         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
956         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
957         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
958         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
959         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
960         "psrad $" #shift ", %%mm7       \n\t"\
961         "psrad $" #shift ", %%mm4       \n\t"\
962         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
963         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
964         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
965         "psrad $" #shift ", %%mm0       \n\t"\
966         "psrad $" #shift ", %%mm2       \n\t"\
967         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
968         "movd %%mm7, " #dst "           \n\t"\
969         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
970         "movd %%mm0, 16+" #dst "        \n\t"\
971         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
972         "movd %%mm2, 96+" #dst "        \n\t"\
973         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
974         "movd %%mm4, 112+" #dst "       \n\t"\
975         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
976         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
977         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
978         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
979         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
980         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
981         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
982         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
983         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
984         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
985         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
986         "psrad $" #shift ", %%mm2       \n\t"\
987         "psrad $" #shift ", %%mm5       \n\t"\
988         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
989         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
990         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
991         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
992         "psrad $" #shift ", %%mm6       \n\t"\
993         "psrad $" #shift ", %%mm4       \n\t"\
994         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
995         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
996         "movd %%mm2, 32+" #dst "        \n\t"\
997         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
998         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
999         "movd %%mm6, 48+" #dst "        \n\t"\
1000         "movd %%mm4, 64+" #dst "        \n\t"\
1001         "movd %%mm5, 80+" #dst "        \n\t"
1002
/*
 * Second pass for the label-2 path (64(%0) group was zero): the IDCT variant
 * in effect here does not read its src4 argument, so the unwritten
 * 64(%1)..88(%1) slots are passed but never loaded.  Ends with a jump to
 * label 9.
 */
1003 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1004 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1005 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1006 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1007 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1008         "jmp 9f                         \n\t"
1009
/*
 * Label 3: target of the 3f branch taken on the label-2 path, i.e. both the
 * 64(%0) and the 96(%0) input groups were all zero.  No first-pass work
 * remains; the second pass follows directly.
 */
1010         "# .p2align 4                   \n\t"\
1011         "3:                             \n\t"
/*
 * IDCT, variant using only src0 and src1: second-pass step for the path
 * where the src4 and src5 inputs are known to be zero.
 *
 * The src4 and src5 parameters are accepted for call compatibility but never
 * loaded.  With src4 absent, A0 and A3 are both the C4R4+C4R0 product and
 * A1 and A2 are both the -C4R4+C4R0 product; the B terms come from src1
 * alone, which is loaded once into mm2 and reused.
 *
 *   src0, src1  memory operands of the input quadwords actually read
 *   dst    memory operand; eight 32-bit stores go to dst + 0, 16, 32, 48,
 *          64, 80, 96 and 112
 *   shift  immediate for the arithmetic right shift applied before packing
 *
 * No rounder is added in this macro.  Writes mm0-mm7.
 */
1012 #undef IDCT
1013 #define IDCT(src0, src4, src1, src5, dst, shift) \
1014         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1015         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1016         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1017         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1018         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1019         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1020         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1021         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1022         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1023         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1024         "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 */\
1025         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1026         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1027         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1028         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1029         "psrad $" #shift ", %%mm7       \n\t"\
1030         "psrad $" #shift ", %%mm4       \n\t"\
1031         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1032         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1033         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1034         "psrad $" #shift ", %%mm0       \n\t"\
1035         "psrad $" #shift ", %%mm1       \n\t"\
1036         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1037         "movd %%mm7, " #dst "           \n\t"\
1038         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1039         "movd %%mm0, 16+" #dst "        \n\t"\
1040         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1041         "movd %%mm1, 96+" #dst "        \n\t"\
1042         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1043         "movd %%mm4, 112+" #dst "       \n\t"\
1044         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1045         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1046         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1047         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1048         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1049         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
1050         "psrad $" #shift ", %%mm1       \n\t"\
1051         "psrad $" #shift ", %%mm5       \n\t"\
1052         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1053         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1054         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
1055         "psrad $" #shift ", %%mm6       \n\t"\
1056         "psrad $" #shift ", %%mm4       \n\t"\
1057         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1058         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1059         "movd %%mm1, 32+" #dst "        \n\t"\
1060         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1061         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1062         "movd %%mm6, 48+" #dst "        \n\t"\
1063         "movd %%mm4, 64+" #dst "        \n\t"\
1064         "movd %%mm5, 80+" #dst "        \n\t"
1065
1066
1067 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1068 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1069 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1070 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1071 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1072         "jmp 9f                         \n\t"
1073
1074         "# .p2align 4                   \n\t"\
1075         "5:                             \n\t"
1076 #undef IDCT
1077 #define IDCT(src0, src4, src1, src5, dst, shift) \
1078         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1079         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1080         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1081         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1082         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1083         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1084         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1085         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1086         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1087         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1088         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1090         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1091         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1092         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1093         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1094         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1095         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1096         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1097         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1098         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1099         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1100         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1101         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1102         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1103         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1104         "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1105         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1106         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1107         "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1108         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1109         "psrad $" #shift ", %%mm4       \n\t"\
1110         "psrad $" #shift ", %%mm7       \n\t"\
1111         "psrad $" #shift ", %%mm3       \n\t"\
1112         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1113         "movq %%mm4, " #dst "           \n\t"\
1114         "psrad $" #shift ", %%mm0       \n\t"\
1115         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1116         "movq %%mm0, 16+" #dst "        \n\t"\
1117         "movq %%mm0, 96+" #dst "        \n\t"\
1118         "movq %%mm4, 112+" #dst "       \n\t"\
1119         "psrad $" #shift ", %%mm5       \n\t"\
1120         "psrad $" #shift ", %%mm6       \n\t"\
1121         "psrad $" #shift ", %%mm2       \n\t"\
1122         "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1123         "movq %%mm5, 32+" #dst "        \n\t"\
1124         "psrad $" #shift ", %%mm1       \n\t"\
1125         "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1126         "movq %%mm6, 48+" #dst "        \n\t"\
1127         "movq %%mm6, 64+" #dst "        \n\t"\
1128         "movq %%mm5, 80+" #dst "        \n\t"
1129
1130
1131 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1132 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1133 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1134 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1135 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1136         "jmp 9f                         \n\t"
1137
1138
1139         "# .p2align 4                   \n\t"\
1140         "1:                             \n\t"
1141 #undef IDCT
1142 #define IDCT(src0, src4, src1, src5, dst, shift) \
1143         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1144         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1145         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1146         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1147         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1148         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1149         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1150         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1151         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1152         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1153         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1154         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1155         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1156         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1157         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1158         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1159         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1160         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1161         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1162         "movq 64(%2), %%mm1             \n\t"\
1163         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1164         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1165         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1166         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1167         "psrad $" #shift ", %%mm7       \n\t"\
1168         "psrad $" #shift ", %%mm4       \n\t"\
1169         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1170         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1171         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1172         "psrad $" #shift ", %%mm0       \n\t"\
1173         "psrad $" #shift ", %%mm3       \n\t"\
1174         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1175         "movd %%mm7, " #dst "           \n\t"\
1176         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1177         "movd %%mm0, 16+" #dst "        \n\t"\
1178         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1179         "movd %%mm3, 96+" #dst "        \n\t"\
1180         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1181         "movd %%mm4, 112+" #dst "       \n\t"\
1182         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1183         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1184         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1185         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1186         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1187         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1188         "psrad $" #shift ", %%mm3       \n\t"\
1189         "psrad $" #shift ", %%mm5       \n\t"\
1190         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1191         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1192         "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1193         "psrad $" #shift ", %%mm6       \n\t"\
1194         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1195         "movd %%mm3, 32+" #dst "        \n\t"\
1196         "psrad $" #shift ", %%mm4       \n\t"\
1197         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1198         "movd %%mm6, 48+" #dst "        \n\t"\
1199         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1200         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1201         "movd %%mm4, 64+" #dst "        \n\t"\
1202         "movd %%mm5, 80+" #dst "        \n\t"
1203
1204
1205 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1206 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1207 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1208 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1209 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1210         "jmp 9f                         \n\t"
1211
1212
1213         "# .p2align 4                   \n\t"
1214         "7:                             \n\t"
1215 #undef IDCT
1216 #define IDCT(src0, src4, src1, src5, dst, shift) \
1217         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1218         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1219         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1220         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1221         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1222         "psrad $" #shift ", %%mm4       \n\t"\
1223         "psrad $" #shift ", %%mm0       \n\t"\
1224         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1225         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1226         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1227         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1228         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1229         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1230         "psrad $" #shift ", %%mm1       \n\t"\
1231         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1232         "movq %%mm4, " #dst "           \n\t"\
1233         "psrad $" #shift ", %%mm2       \n\t"\
1234         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1235         "movq %%mm0, 16+" #dst "        \n\t"\
1236         "movq %%mm0, 96+" #dst "        \n\t"\
1237         "movq %%mm4, 112+" #dst "       \n\t"\
1238         "movq %%mm0, 32+" #dst "        \n\t"\
1239         "movq %%mm4, 48+" #dst "        \n\t"\
1240         "movq %%mm4, 64+" #dst "        \n\t"\
1241         "movq %%mm0, 80+" #dst "        \n\t"
1242
1243 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1244 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1245 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1246 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1247 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1248
1249
1250 #endif
1251
1252 /*
1253 Input
1254  00 40 04 44 20 60 24 64
1255  10 30 14 34 50 70 54 74
1256  01 41 03 43 21 61 23 63
1257  11 31 13 33 51 71 53 73
1258  02 42 06 46 22 62 26 66
1259  12 32 16 36 52 72 56 76
1260  05 45 07 47 25 65 27 67
1261  15 35 17 37 55 75 57 77
1262
1263 Temp
1264  00 04 10 14 20 24 30 34
1265  40 44 50 54 60 64 70 74
1266  01 03 11 13 21 23 31 33
1267  41 43 51 53 61 63 71 73
1268  02 06 12 16 22 26 32 36
1269  42 46 52 56 62 66 72 76
1270  05 07 15 17 25 27 35 37
1271  45 47 55 57 65 67 75 77
1272 */
1273
1274 "9: \n\t"
1275                 :: "r" (block), "r" (temp), "r" (coeffs)
1276                 : "%eax"
1277         );
1278 }
1279
/**
 * Inverse-DCT entry point that only transforms the coefficients.
 *
 * Simply forwards to idct(); the last pass of idct() stores its results
 * through the same pointer, so the contents of block are overwritten
 * with the transformed values.
 *
 * @param block coefficient block, transformed in place
 */
1280 void ff_simple_idct_mmx(int16_t *block)
1281 {
1282     idct(block);
1283 }
1284
1285 //FIXME merge add/put into the idct
1286
/**
 * Transform block with idct(), then hand the transformed block to
 * ff_put_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination buffer, passed through unchanged to
 *                  ff_put_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest -- TODO confirm against the helper
 * @param block     coefficient block; overwritten by idct() before it is
 *                  given to the helper
 */
1287 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1288 {
     /* The transform must run first: the helper consumes its output. */
1289     idct(block);
     /* NOTE(review): by its name this stores the clamped values into dest;
      * the helper is defined elsewhere -- confirm. */
1290     ff_put_pixels_clamped_mmx(block, dest, line_size);
1291 }
/**
 * Transform block with idct(), then hand the transformed block to
 * ff_add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination buffer, passed through unchanged to
 *                  ff_add_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest -- TODO confirm against the helper
 * @param block     coefficient block; overwritten by idct() before it is
 *                  given to the helper
 */
1292 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1293 {
     /* The transform must run first: the helper consumes its output. */
1294     idct(block);
     /* NOTE(review): by its name this adds the values onto what is already
      * in dest and clamps the sum; the helper is defined elsewhere -- confirm. */
1295     ff_add_pixels_clamped_mmx(block, dest, line_size);
1296 }