OSDN Git Service

8d0a04da8fb898b008507a00a6c84d29c675054f
[android-x86/external-mesa.git] / src / gallium / drivers / r600 / r600_asm.c
1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 #include <stdio.h>
24 #include <errno.h>
25 #include <byteswap.h>
26 #include "util/u_format.h"
27 #include "util/u_memory.h"
28 #include "pipe/p_shader_tokens.h"
29 #include "r600_pipe.h"
30 #include "r600_sq.h"
31 #include "r600_opcodes.h"
32 #include "r600_asm.h"
33 #include "r600_formats.h"
34 #include "r600d.h"
35
36 #define NUM_OF_CYCLES 3
37 #define NUM_OF_COMPONENTS 4
38
39 static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
40 {
41         if(alu->is_op3)
42                 return 3;
43
44         switch (bc->chip_class) {
45         case R600:
46         case R700:
47                 switch (alu->inst) {
48                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
49                         return 0;
50                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
51                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
52                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
53                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
54                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
55                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
56                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
57                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
58                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
59                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
60                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
61                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
62                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
63                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
64                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
65                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
66                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
67                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
68                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
69                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
70                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
71                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
72                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
73                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
74                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
75                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
76                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
77                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
78                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
79                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
80                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
81                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
82                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
83                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
84                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
85                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
86                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
87                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
88                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
89                         return 2;
90
91                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
92                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
93                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
94                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
95                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
96                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
97                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
98                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
99                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
100                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
101                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
102                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
103                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
104                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
105                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
106                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
107                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
108                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
109                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
110                 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
111                         return 1;
112                 default: R600_ERR(
113                         "Need instruction operand number for 0x%x.\n", alu->inst);
114                 }
115                 break;
116         case EVERGREEN:
117         case CAYMAN:
118                 switch (alu->inst) {
119                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
120                         return 0;
121                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
122                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
123                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
124                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
125                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
126                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
127                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
128                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
129                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
130                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
131                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
132                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
133                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
134                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
135                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
136                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
137                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
138                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
139                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
140                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
141                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
142                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
143                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
144                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
145                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
146                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
147                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
148                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
149                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
150                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
151                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
152                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
153                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
154                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
155                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
156                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
157                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
158                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
159                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
160                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
161                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
162                         return 2;
163
164                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
165                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
166                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
167                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
168                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
169                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
170                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
171                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
172                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
173                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
174                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
175                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
176                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
177                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
178                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
179                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
180                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
181                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
182                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
183                 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_LOAD_P0:
184                         return 1;
185                 default: R600_ERR(
186                         "Need instruction operand number for 0x%x.\n", alu->inst);
187                 }
188                 break;
189         }
190
191         return 3;
192 }
193
194 int r700_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id);
195
196 static struct r600_bytecode_cf *r600_bytecode_cf(void)
197 {
198         struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
199
200         if (cf == NULL)
201                 return NULL;
202         LIST_INITHEAD(&cf->list);
203         LIST_INITHEAD(&cf->alu);
204         LIST_INITHEAD(&cf->vtx);
205         LIST_INITHEAD(&cf->tex);
206         return cf;
207 }
208
209 static struct r600_bytecode_alu *r600_bytecode_alu(void)
210 {
211         struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
212
213         if (alu == NULL)
214                 return NULL;
215         LIST_INITHEAD(&alu->list);
216         return alu;
217 }
218
219 static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
220 {
221         struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
222
223         if (vtx == NULL)
224                 return NULL;
225         LIST_INITHEAD(&vtx->list);
226         return vtx;
227 }
228
229 static struct r600_bytecode_tex *r600_bytecode_tex(void)
230 {
231         struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
232
233         if (tex == NULL)
234                 return NULL;
235         LIST_INITHEAD(&tex->list);
236         return tex;
237 }
238
239 void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
240 {
241         LIST_INITHEAD(&bc->cf);
242         bc->chip_class = chip_class;
243 }
244
245 static int r600_bytecode_add_cf(struct r600_bytecode *bc)
246 {
247         struct r600_bytecode_cf *cf = r600_bytecode_cf();
248
249         if (cf == NULL)
250                 return -ENOMEM;
251         LIST_ADDTAIL(&cf->list, &bc->cf);
252         if (bc->cf_last)
253                 cf->id = bc->cf_last->id + 2;
254         bc->cf_last = cf;
255         bc->ncf++;
256         bc->ndw += 2;
257         bc->force_add_cf = 0;
258         bc->ar_loaded = 0;
259         return 0;
260 }
261
262 int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output)
263 {
264         int r;
265
266         if (bc->cf_last && (bc->cf_last->inst == output->inst ||
267                 (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
268                 output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
269                 output->type == bc->cf_last->output.type &&
270                 output->elem_size == bc->cf_last->output.elem_size &&
271                 output->swizzle_x == bc->cf_last->output.swizzle_x &&
272                 output->swizzle_y == bc->cf_last->output.swizzle_y &&
273                 output->swizzle_z == bc->cf_last->output.swizzle_z &&
274                 output->swizzle_w == bc->cf_last->output.swizzle_w &&
275                 (output->burst_count + bc->cf_last->output.burst_count) <= 16) {
276
277                 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
278                         (output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
279
280                         bc->cf_last->output.end_of_program |= output->end_of_program;
281                         bc->cf_last->output.inst = output->inst;
282                         bc->cf_last->output.gpr = output->gpr;
283                         bc->cf_last->output.array_base = output->array_base;
284                         bc->cf_last->output.burst_count += output->burst_count;
285                         return 0;
286
287                 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
288                         output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
289
290                         bc->cf_last->output.end_of_program |= output->end_of_program;
291                         bc->cf_last->output.inst = output->inst;
292                         bc->cf_last->output.burst_count += output->burst_count;
293                         return 0;
294                 }
295         }
296
297         r = r600_bytecode_add_cf(bc);
298         if (r)
299                 return r;
300         bc->cf_last->inst = output->inst;
301         memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
302         return 0;
303 }
304
305 /* alu instructions that can ony exits once per group */
306 static int is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
307 {
308         switch (bc->chip_class) {
309         case R600:
310         case R700:
311                 return !alu->is_op3 && (
312                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
313                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
314                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
315                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
316                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
317                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
318                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
319                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
320                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
321                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
322                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
323                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
324                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
325                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
326                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
327                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
328                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
329                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
330                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
331                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
332                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
333                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
334                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
335                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
336                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
337                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
338                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
339                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
340                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
341                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
342                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
343                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
344                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
345                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
346         case EVERGREEN:
347         case CAYMAN:
348         default:
349                 return !alu->is_op3 && (
350                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
351                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
352                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
353                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
354                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
355                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
356                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
357                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
358                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
359                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
360                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
361                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
362                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
363                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
364                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
365                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
366                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
367                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
368                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
369                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
370                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
371                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
372                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
373                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
374                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
375                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
376                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
377                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
378                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
379                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
380                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
381                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
382                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
383                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
384         }
385 }
386
387 static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
388 {
389         switch (bc->chip_class) {
390         case R600:
391         case R700:
392                 return !alu->is_op3 && (
393                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
394                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
395                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
396                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
397         case EVERGREEN:
398         case CAYMAN:
399         default:
400                 return !alu->is_op3 && (
401                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
402                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
403                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
404                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
405         }
406 }
407
408 static int is_alu_cube_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
409 {
410         switch (bc->chip_class) {
411         case R600:
412         case R700:
413                 return !alu->is_op3 &&
414                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
415         case EVERGREEN:
416         case CAYMAN:
417         default:
418                 return !alu->is_op3 &&
419                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
420         }
421 }
422
423 static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
424 {
425         switch (bc->chip_class) {
426         case R600:
427         case R700:
428                 return !alu->is_op3 && (
429                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
430                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
431                         alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
432         case EVERGREEN:
433         case CAYMAN:
434         default:
435                 return !alu->is_op3 && (
436                         alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
437         }
438 }
439
440 /* alu instructions that can only execute on the vector unit */
441 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
442 {
443         switch (bc->chip_class) {
444         case R600:
445         case R700:
446                 return is_alu_reduction_inst(bc, alu) ||
447                         is_alu_mova_inst(bc, alu);
448         case EVERGREEN:
449         case CAYMAN:
450         default:
451                 return is_alu_reduction_inst(bc, alu) ||
452                         is_alu_mova_inst(bc, alu) ||
453                         (alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
454                          alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR ||
455                          alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY ||
456                          alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW);
457         }
458 }
459
460 /* alu instructions that can only execute on the trans unit */
461 static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
462 {
463         switch (bc->chip_class) {
464         case R600:
465         case R700:
466                 if (!alu->is_op3)
467                         return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
468                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
469                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
470                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
471                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
472                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
473                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
474                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
475                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
476                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
477                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
478                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
479                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
480                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
481                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
482                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
483                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
484                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
485                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
486                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
487                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
488                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
489                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
490                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
491                                 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
492                 else
493                         return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
494                                 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
495                                 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
496                                 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
497         case EVERGREEN:
498         case CAYMAN:
499         default:
500                 if (!alu->is_op3)
501                         /* Note that FLT_TO_INT_* instructions are vector-only instructions
502                          * on Evergreen, despite what the documentation says. FLT_TO_INT
503                          * can do both vector and scalar. */
504                         return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
505                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
506                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
507                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
508                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
509                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
510                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
511                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
512                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
513                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
514                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
515                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
516                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
517                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
518                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
519                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
520                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
521                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
522                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
523                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
524                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
525                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
526                                 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
527                 else
528                         return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
529         }
530 }
531
/*
 * ALU instructions that can execute on any unit: those that are neither
 * vector-only nor trans-only. Returns 1 or 0.
 */
static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return !(is_alu_vec_unit_inst(bc, alu) ||
		 is_alu_trans_unit_inst(bc, alu));
}
538
539 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
540                             struct r600_bytecode_alu *assignment[5])
541 {
542         struct r600_bytecode_alu *alu;
543         unsigned i, chan, trans;
544         int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
545
546         for (i = 0; i < max_slots; i++)
547                 assignment[i] = NULL;
548
549         for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
550                 chan = alu->dst.chan;
551                 if (max_slots == 4)
552                         trans = 0;
553                 else if (is_alu_trans_unit_inst(bc, alu))
554                         trans = 1;
555                 else if (is_alu_vec_unit_inst(bc, alu))
556                         trans = 0;
557                 else if (assignment[chan])
558                         trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
559                 else
560                         trans = 0;
561
562                 if (trans) {
563                         if (assignment[4]) {
564                                 assert(0); /* ALU.Trans has already been allocated. */
565                                 return -1;
566                         }
567                         assignment[4] = alu;
568                 } else {
569                         if (assignment[chan]) {
570                                 assert(0); /* ALU.chan has already been allocated. */
571                                 return -1;
572                         }
573                         assignment[chan] = alu;
574                 }
575
576                 if (alu->last)
577                         break;
578         }
579         return 0;
580 }
581
582 struct alu_bank_swizzle {
583         int     hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
584         int     hw_cfile_addr[4];
585         int     hw_cfile_elem[4];
586 };
587
588 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
589         [SQ_ALU_VEC_012] = { 0, 1, 2 },
590         [SQ_ALU_VEC_021] = { 0, 2, 1 },
591         [SQ_ALU_VEC_120] = { 1, 2, 0 },
592         [SQ_ALU_VEC_102] = { 1, 0, 2 },
593         [SQ_ALU_VEC_201] = { 2, 0, 1 },
594         [SQ_ALU_VEC_210] = { 2, 1, 0 }
595 };
596
597 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
598         [SQ_ALU_SCL_210] = { 2, 1, 0 },
599         [SQ_ALU_SCL_122] = { 1, 2, 2 },
600         [SQ_ALU_SCL_212] = { 2, 1, 2 },
601         [SQ_ALU_SCL_221] = { 2, 2, 1 }
602 };
603
604 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
605 {
606         int i, cycle, component;
607         /* set up gpr use */
608         for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
609                 for (component = 0; component < NUM_OF_COMPONENTS; component++)
610                          bs->hw_gpr[cycle][component] = -1;
611         for (i = 0; i < 4; i++)
612                 bs->hw_cfile_addr[i] = -1;
613         for (i = 0; i < 4; i++)
614                 bs->hw_cfile_elem[i] = -1;
615 }
616
617 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
618 {
619         if (bs->hw_gpr[cycle][chan] == -1)
620                 bs->hw_gpr[cycle][chan] = sel;
621         else if (bs->hw_gpr[cycle][chan] != (int)sel) {
622                 /* Another scalar operation has already used the GPR read port for the channel. */
623                 return -1;
624         }
625         return 0;
626 }
627
628 static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
629 {
630         int res, num_res = 4;
631         if (bc->chip_class >= R700) {
632                 num_res = 2;
633                 chan /= 2;
634         }
635         for (res = 0; res < num_res; ++res) {
636                 if (bs->hw_cfile_addr[res] == -1) {
637                         bs->hw_cfile_addr[res] = sel;
638                         bs->hw_cfile_elem[res] = chan;
639                         return 0;
640                 } else if (bs->hw_cfile_addr[res] == sel &&
641                         bs->hw_cfile_elem[res] == chan)
642                         return 0; /* Read for this scalar element already reserved, nothing to do here. */
643         }
644         /* All cfile read ports are used, cannot reference vector element. */
645         return -1;
646 }
647
/* Return 1 if sel lies in the GPR selector range 0..127, 0 otherwise.
 * sel is unsigned, so only the upper bound needs checking. */
static int is_gpr(unsigned sel)
{
	return sel <= 127;
}
652
/* CB constants start at 512 and are translated to a kcache index when ALU
 * clauses are constructed. Kcache constants are treated the same way as the
 * (now removed) cfile constants; whether that is really required is an open
 * question.
 *
 * Returns 1 if sel falls in one of the constant selector ranges, 0 otherwise.
 */
static int is_cfile(unsigned sel)
{
	if (sel > 127 && sel < 192)
		return 1; /* Kcache after translation. */
	if (sel > 255 && sel < 512)
		return 1;
	if (sel > 511 && sel < 4607)
		return 1; /* Kcache before translation. */
	return 0;
}
662
663 static int is_const(int sel)
664 {
665         return is_cfile(sel) ||
666                 (sel >= V_SQ_ALU_SRC_0 &&
667                 sel <= V_SQ_ALU_SRC_LITERAL);
668 }
669
670 static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
671                         struct alu_bank_swizzle *bs, int bank_swizzle)
672 {
673         int r, src, num_src, sel, elem, cycle;
674
675         num_src = r600_bytecode_get_num_operands(bc, alu);
676         for (src = 0; src < num_src; src++) {
677                 sel = alu->src[src].sel;
678                 elem = alu->src[src].chan;
679                 if (is_gpr(sel)) {
680                         cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
681                         if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
682                                 /* Nothing to do; special-case optimization,
683                                  * second source uses first source’s reservation. */
684                                 continue;
685                         else {
686                                 r = reserve_gpr(bs, sel, elem, cycle);
687                                 if (r)
688                                         return r;
689                         }
690                 } else if (is_cfile(sel)) {
691                         r = reserve_cfile(bc, bs, sel, elem);
692                         if (r)
693                                 return r;
694                 }
695                 /* No restrictions on PV, PS, literal or special constants. */
696         }
697         return 0;
698 }
699
700 static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
701                         struct alu_bank_swizzle *bs, int bank_swizzle)
702 {
703         int r, src, num_src, const_count, sel, elem, cycle;
704
705         num_src = r600_bytecode_get_num_operands(bc, alu);
706         for (const_count = 0, src = 0; src < num_src; ++src) {
707                 sel = alu->src[src].sel;
708                 elem = alu->src[src].chan;
709                 if (is_const(sel)) { /* Any constant, including literal and inline constants. */
710                         if (const_count >= 2)
711                                 /* More than two references to a constant in
712                                  * transcendental operation. */
713                                 return -1;
714                         else
715                                 const_count++;
716                 }
717                 if (is_cfile(sel)) {
718                         r = reserve_cfile(bc, bs, sel, elem);
719                         if (r)
720                                 return r;
721                 }
722         }
723         for (src = 0; src < num_src; ++src) {
724                 sel = alu->src[src].sel;
725                 elem = alu->src[src].chan;
726                 if (is_gpr(sel)) {
727                         cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
728                         if (cycle < const_count)
729                                 /* Cycle for GPR load conflicts with
730                                  * constant load in transcendental operation. */
731                                 return -1;
732                         r = reserve_gpr(bs, sel, elem, cycle);
733                         if (r)
734                                 return r;
735                 }
736                 /* PV PS restrictions */
737                 if (const_count && (sel == 254 || sel == 255)) {
738                         cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
739                         if (cycle < const_count)
740                                 return -1;
741                 }
742         }
743         return 0;
744 }
745
746 static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
747                                       struct r600_bytecode_alu *slots[5])
748 {
749         struct alu_bank_swizzle bs;
750         int bank_swizzle[5];
751         int i, r = 0, forced = 1;
752         boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
753         int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
754
755         for (i = 0; i < max_slots; i++) {
756                 if (slots[i]) {
757                         if (slots[i]->bank_swizzle_force) {
758                                 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
759                         } else {
760                                 forced = 0;
761                         }
762                 }
763
764                 if (i < 4 && slots[i])
765                         scalar_only = false;
766         }
767         if (forced)
768                 return 0;
769
770         /* Just check every possible combination of bank swizzle.
771          * Not very efficent, but works on the first try in most of the cases. */
772         for (i = 0; i < 4; i++)
773                 if (!slots[i] || !slots[i]->bank_swizzle_force)
774                         bank_swizzle[i] = SQ_ALU_VEC_012;
775                 else
776                         bank_swizzle[i] = slots[i]->bank_swizzle;
777
778         bank_swizzle[4] = SQ_ALU_SCL_210;
779         while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
780
781                 if (max_slots == 4) {
782                         for (i = 0; i < max_slots; i++) {
783                                 if (bank_swizzle[i] == SQ_ALU_VEC_210)
784                                   return -1;
785                         }
786                 }
787                 init_bank_swizzle(&bs);
788                 if (scalar_only == false) {
789                         for (i = 0; i < 4; i++) {
790                                 if (slots[i]) {
791                                         r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
792                                         if (r)
793                                                 break;
794                                 }
795                         }
796                 } else
797                         r = 0;
798
799                 if (!r && slots[4] && max_slots == 5) {
800                         r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
801                 }
802                 if (!r) {
803                         for (i = 0; i < max_slots; i++) {
804                                 if (slots[i])
805                                         slots[i]->bank_swizzle = bank_swizzle[i];
806                         }
807                         return 0;
808                 }
809
810                 if (scalar_only) {
811                         bank_swizzle[4]++;
812                 } else {
813                         for (i = 0; i < max_slots; i++) {
814                                 if (!slots[i] || !slots[i]->bank_swizzle_force) {
815                                         bank_swizzle[i]++;
816                                         if (bank_swizzle[i] <= SQ_ALU_VEC_210)
817                                                 break;
818                                         else
819                                                 bank_swizzle[i] = SQ_ALU_VEC_012;
820                                 }
821                         }
822                 }
823         }
824
825         /* Couldn't find a working swizzle. */
826         return -1;
827 }
828
829 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
830                                   struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
831 {
832         struct r600_bytecode_alu *prev[5];
833         int gpr[5], chan[5];
834         int i, j, r, src, num_src;
835         int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
836
837         r = assign_alu_units(bc, alu_prev, prev);
838         if (r)
839                 return r;
840
841         for (i = 0; i < max_slots; ++i) {
842                 if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
843                         gpr[i] = prev[i]->dst.sel;
844                         /* cube writes more than PV.X */
845                         if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
846                                 chan[i] = 0;
847                         else
848                                 chan[i] = prev[i]->dst.chan;
849                 } else
850                         gpr[i] = -1;
851         }
852
853         for (i = 0; i < max_slots; ++i) {
854                 struct r600_bytecode_alu *alu = slots[i];
855                 if(!alu)
856                         continue;
857
858                 num_src = r600_bytecode_get_num_operands(bc, alu);
859                 for (src = 0; src < num_src; ++src) {
860                         if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
861                                 continue;
862
863                         if (bc->chip_class < CAYMAN) {
864                                 if (alu->src[src].sel == gpr[4] &&
865                                     alu->src[src].chan == chan[4]) {
866                                         alu->src[src].sel = V_SQ_ALU_SRC_PS;
867                                         alu->src[src].chan = 0;
868                                         continue;
869                                 }
870                         }
871
872                         for (j = 0; j < 4; ++j) {
873                                 if (alu->src[src].sel == gpr[j] &&
874                                         alu->src[src].chan == j) {
875                                         alu->src[src].sel = V_SQ_ALU_SRC_PV;
876                                         alu->src[src].chan = chan[j];
877                                         break;
878                                 }
879                         }
880                 }
881         }
882
883         return 0;
884 }
885
886 void r600_bytecode_special_constants(u32 value, unsigned *sel, unsigned *neg)
887 {
888         switch(value) {
889         case 0:
890                 *sel = V_SQ_ALU_SRC_0;
891                 break;
892         case 1:
893                 *sel = V_SQ_ALU_SRC_1_INT;
894                 break;
895         case -1:
896                 *sel = V_SQ_ALU_SRC_M_1_INT;
897                 break;
898         case 0x3F800000: /* 1.0f */
899                 *sel = V_SQ_ALU_SRC_1;
900                 break;
901         case 0x3F000000: /* 0.5f */
902                 *sel = V_SQ_ALU_SRC_0_5;
903                 break;
904         case 0xBF800000: /* -1.0f */
905                 *sel = V_SQ_ALU_SRC_1;
906                 *neg ^= 1;
907                 break;
908         case 0xBF000000: /* -0.5f */
909                 *sel = V_SQ_ALU_SRC_0_5;
910                 *neg ^= 1;
911                 break;
912         default:
913                 *sel = V_SQ_ALU_SRC_LITERAL;
914                 break;
915         }
916 }
917
918 /* compute how many literal are needed */
919 static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
920                                  uint32_t literal[4], unsigned *nliteral)
921 {
922         unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
923         unsigned i, j;
924
925         for (i = 0; i < num_src; ++i) {
926                 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
927                         uint32_t value = alu->src[i].value;
928                         unsigned found = 0;
929                         for (j = 0; j < *nliteral; ++j) {
930                                 if (literal[j] == value) {
931                                         found = 1;
932                                         break;
933                                 }
934                         }
935                         if (!found) {
936                                 if (*nliteral >= 4)
937                                         return -EINVAL;
938                                 literal[(*nliteral)++] = value;
939                         }
940                 }
941         }
942         return 0;
943 }
944
945 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc,
946                                         struct r600_bytecode_alu *alu,
947                                         uint32_t literal[4], unsigned nliteral)
948 {
949         unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
950         unsigned i, j;
951
952         for (i = 0; i < num_src; ++i) {
953                 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
954                         uint32_t value = alu->src[i].value;
955                         for (j = 0; j < nliteral; ++j) {
956                                 if (literal[j] == value) {
957                                         alu->src[i].chan = j;
958                                         break;
959                                 }
960                         }
961                 }
962         }
963 }
964
965 static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
966                              struct r600_bytecode_alu *alu_prev)
967 {
968         struct r600_bytecode_alu *prev[5];
969         struct r600_bytecode_alu *result[5] = { NULL };
970
971         uint32_t literal[4], prev_literal[4];
972         unsigned nliteral = 0, prev_nliteral = 0;
973
974         int i, j, r, src, num_src;
975         int num_once_inst = 0;
976         int have_mova = 0, have_rel = 0;
977         int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
978
979         r = assign_alu_units(bc, alu_prev, prev);
980         if (r)
981                 return r;
982
983         for (i = 0; i < max_slots; ++i) {
984                 struct r600_bytecode_alu *alu;
985
986                 /* check number of literals */
987                 if (prev[i]) {
988                         if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral))
989                                 return 0;
990                         if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
991                                 return 0;
992                         if (is_alu_mova_inst(bc, prev[i])) {
993                                 if (have_rel)
994                                         return 0;
995                                 have_mova = 1;
996                         }
997                         num_once_inst += is_alu_once_inst(bc, prev[i]);
998                 }
999                 if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral))
1000                         return 0;
1001
1002                 /* Let's check used slots. */
1003                 if (prev[i] && !slots[i]) {
1004                         result[i] = prev[i];
1005                         continue;
1006                 } else if (prev[i] && slots[i]) {
1007                         if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
1008                                 /* Trans unit is still free try to use it. */
1009                                 if (is_alu_any_unit_inst(bc, slots[i])) {
1010                                         result[i] = prev[i];
1011                                         result[4] = slots[i];
1012                                 } else if (is_alu_any_unit_inst(bc, prev[i])) {
1013                                         result[i] = slots[i];
1014                                         result[4] = prev[i];
1015                                 } else
1016                                         return 0;
1017                         } else
1018                                 return 0;
1019                 } else if(!slots[i]) {
1020                         continue;
1021                 } else
1022                         result[i] = slots[i];
1023
1024                 alu = slots[i];
1025                 num_once_inst += is_alu_once_inst(bc, alu);
1026
1027                 /* Let's check dst gpr. */
1028                 if (alu->dst.rel) {
1029                         if (have_mova)
1030                                 return 0;
1031                         have_rel = 1;
1032                 }
1033
1034                 /* Let's check source gprs */
1035                 num_src = r600_bytecode_get_num_operands(bc, alu);
1036                 for (src = 0; src < num_src; ++src) {
1037                         if (alu->src[src].rel) {
1038                                 if (have_mova)
1039                                         return 0;
1040                                 have_rel = 1;
1041                         }
1042
1043                         /* Constants don't matter. */
1044                         if (!is_gpr(alu->src[src].sel))
1045                                 continue;
1046
1047                         for (j = 0; j < max_slots; ++j) {
1048                                 if (!prev[j] || !prev[j]->dst.write)
1049                                         continue;
1050
1051                                 /* If it's relative then we can't determin which gpr is really used. */
1052                                 if (prev[j]->dst.chan == alu->src[src].chan &&
1053                                         (prev[j]->dst.sel == alu->src[src].sel ||
1054                                         prev[j]->dst.rel || alu->src[src].rel))
1055                                         return 0;
1056                         }
1057                 }
1058         }
1059
1060         /* more than one PRED_ or KILL_ ? */
1061         if (num_once_inst > 1)
1062                 return 0;
1063
1064         /* check if the result can still be swizzlet */
1065         r = check_and_set_bank_swizzle(bc, result);
1066         if (r)
1067                 return 0;
1068
1069         /* looks like everything worked out right, apply the changes */
1070
1071         /* undo adding previus literals */
1072         bc->cf_last->ndw -= align(prev_nliteral, 2);
1073
1074         /* sort instructions */
1075         for (i = 0; i < max_slots; ++i) {
1076                 slots[i] = result[i];
1077                 if (result[i]) {
1078                         LIST_DEL(&result[i]->list);
1079                         result[i]->last = 0;
1080                         LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
1081                 }
1082         }
1083
1084         /* determine new last instruction */
1085         LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;
1086
1087         /* determine new first instruction */
1088         for (i = 0; i < max_slots; ++i) {
1089                 if (result[i]) {
1090                         bc->cf_last->curr_bs_head = result[i];
1091                         break;
1092                 }
1093         }
1094
1095         bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
1096         bc->cf_last->prev2_bs_head = NULL;
1097
1098         return 0;
1099 }
1100
1101 /* This code handles kcache lines as single blocks of 32 constants. We could
1102  * probably do slightly better by recognizing that we actually have two
1103  * consecutive lines of 16 constants, but the resulting code would also be
1104  * somewhat more complicated. */
1105 static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
1106 {
1107         struct r600_bytecode_kcache *kcache = bc->cf_last->kcache;
1108         unsigned int required_lines;
1109         unsigned int free_lines = 0;
1110         unsigned int cache_line[3];
1111         unsigned int count = 0;
1112         unsigned int i, j;
1113         int r;
1114
1115         /* Collect required cache lines. */
1116         for (i = 0; i < 3; ++i) {
1117                 boolean found = false;
1118                 unsigned int line;
1119
1120                 if (alu->src[i].sel < 512)
1121                         continue;
1122
1123                 line = ((alu->src[i].sel - 512) / 32) * 2;
1124
1125                 for (j = 0; j < count; ++j) {
1126                         if (cache_line[j] == line) {
1127                                 found = true;
1128                                 break;
1129                         }
1130                 }
1131
1132                 if (!found)
1133                         cache_line[count++] = line;
1134         }
1135
1136         /* This should never actually happen. */
1137         if (count >= 3) return -ENOMEM;
1138
1139         for (i = 0; i < 2; ++i) {
1140                 if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
1141                         ++free_lines;
1142                 }
1143         }
1144
1145         /* Filter lines pulled in by previous intructions. Note that this is
1146          * only for the required_lines count, we can't remove these from the
1147          * cache_line array since we may have to start a new ALU clause. */
1148         for (i = 0, required_lines = count; i < count; ++i) {
1149                 for (j = 0; j < 2; ++j) {
1150                         if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1151                             kcache[j].addr == cache_line[i]) {
1152                                 --required_lines;
1153                                 break;
1154                         }
1155                 }
1156         }
1157
1158         /* Start a new ALU clause if needed. */
1159         if (required_lines > free_lines) {
1160                 if ((r = r600_bytecode_add_cf(bc))) {
1161                         return r;
1162                 }
1163                 bc->cf_last->inst = (type << 3);
1164                 kcache = bc->cf_last->kcache;
1165         }
1166
1167         /* Setup the kcache lines. */
1168         for (i = 0; i < count; ++i) {
1169                 boolean found = false;
1170
1171                 for (j = 0; j < 2; ++j) {
1172                         if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1173                             kcache[j].addr == cache_line[i]) {
1174                                 found = true;
1175                                 break;
1176                         }
1177                 }
1178
1179                 if (found) continue;
1180
1181                 for (j = 0; j < 2; ++j) {
1182                         if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
1183                                 kcache[j].bank = 0;
1184                                 kcache[j].addr = cache_line[i];
1185                                 kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
1186                                 break;
1187                         }
1188                 }
1189         }
1190
1191         /* Alter the src operands to refer to the kcache. */
1192         for (i = 0; i < 3; ++i) {
1193                 static const unsigned int base[] = {128, 160, 256, 288};
1194                 unsigned int line;
1195
1196                 if (alu->src[i].sel < 512)
1197                         continue;
1198
1199                 alu->src[i].sel -= 512;
1200                 line = (alu->src[i].sel / 32) * 2;
1201
1202                 for (j = 0; j < 2; ++j) {
1203                         if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1204                             kcache[j].addr == line) {
1205                                 alu->src[i].sel &= 0x1f;
1206                                 alu->src[i].sel += base[j];
1207                                 break;
1208                         }
1209                 }
1210         }
1211
1212         return 0;
1213 }
1214
1215 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1216 static int load_ar(struct r600_bytecode *bc)
1217 {
1218         struct r600_bytecode_alu alu;
1219         int r;
1220
1221         if (bc->ar_loaded)
1222                 return 0;
1223
1224         /* hack to avoid making MOVA the last instruction in the clause */
1225         if ((bc->cf_last->ndw>>1) >= 110)
1226                 bc->force_add_cf = 1;
1227
1228         memset(&alu, 0, sizeof(alu));
1229         alu.inst = BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
1230         alu.src[0].sel = bc->ar_reg;
1231         alu.last = 1;
1232         r = r600_bytecode_add_alu(bc, &alu);
1233         if (r)
1234                 return r;
1235
1236         bc->cf_last->r6xx_uses_waterfall = 1;
1237         bc->ar_loaded = 1;
1238         return 0;
1239 }
1240
1241 int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, int type)
1242 {
1243         struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1244         struct r600_bytecode_alu *lalu;
1245         int i, r;
1246
1247         if (nalu == NULL)
1248                 return -ENOMEM;
1249         memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1250
1251         if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
1252                 /* check if we could add it anyway */
1253                 if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
1254                         type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
1255                         LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1256                                 if (lalu->predicate) {
1257                                         bc->force_add_cf = 1;
1258                                         break;
1259                                 }
1260                         }
1261                 } else
1262                         bc->force_add_cf = 1;
1263         }
1264
1265         /* cf can contains only alu or only vtx or only tex */
1266         if (bc->cf_last == NULL || bc->force_add_cf) {
1267                 r = r600_bytecode_add_cf(bc);
1268                 if (r) {
1269                         free(nalu);
1270                         return r;
1271                 }
1272         }
1273         bc->cf_last->inst = (type << 3);
1274
1275         /* Check AR usage and load it if required */
1276         for (i = 0; i < 3; i++)
1277                 if (nalu->src[i].rel && !bc->ar_loaded)
1278                         load_ar(bc);
1279
1280         if (nalu->dst.rel && !bc->ar_loaded)
1281                 load_ar(bc);
1282
1283         /* Setup the kcache for this ALU instruction. This will start a new
1284          * ALU clause if needed. */
1285         if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1286                 free(nalu);
1287                 return r;
1288         }
1289
1290         if (!bc->cf_last->curr_bs_head) {
1291                 bc->cf_last->curr_bs_head = nalu;
1292         }
1293         /* number of gpr == the last gpr used in any alu */
1294         for (i = 0; i < 3; i++) {
1295                 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1296                         bc->ngpr = nalu->src[i].sel + 1;
1297                 }
1298                 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1299                         r600_bytecode_special_constants(nalu->src[i].value,
1300                                 &nalu->src[i].sel, &nalu->src[i].neg);
1301         }
1302         if (nalu->dst.sel >= bc->ngpr) {
1303                 bc->ngpr = nalu->dst.sel + 1;
1304         }
1305         LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1306         /* each alu use 2 dwords */
1307         bc->cf_last->ndw += 2;
1308         bc->ndw += 2;
1309
1310         /* process cur ALU instructions for bank swizzle */
1311         if (nalu->last) {
1312                 uint32_t literal[4];
1313                 unsigned nliteral;
1314                 struct r600_bytecode_alu *slots[5];
1315                 int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1316                 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1317                 if (r)
1318                         return r;
1319
1320                 if (bc->cf_last->prev_bs_head) {
1321                         r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1322                         if (r)
1323                                 return r;
1324                 }
1325
1326                 if (bc->cf_last->prev_bs_head) {
1327                         r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1328                         if (r)
1329                                 return r;
1330                 }
1331
1332                 r = check_and_set_bank_swizzle(bc, slots);
1333                 if (r)
1334                         return r;
1335
1336                 for (i = 0, nliteral = 0; i < max_slots; i++) {
1337                         if (slots[i]) {
1338                                 r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral);
1339                                 if (r)
1340                                         return r;
1341                         }
1342                 }
1343                 bc->cf_last->ndw += align(nliteral, 2);
1344
1345                 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1346                  * worst case */
1347                 if ((bc->cf_last->ndw >> 1) >= 120) {
1348                         bc->force_add_cf = 1;
1349                 }
1350
1351                 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1352                 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1353                 bc->cf_last->curr_bs_head = NULL;
1354         }
1355         return 0;
1356 }
1357
1358 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
1359 {
1360         return r600_bytecode_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1361 }
1362
1363 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1364 {
1365         switch (bc->chip_class) {
1366         case R600:
1367                 return 8;
1368
1369         case R700:
1370         case EVERGREEN:
1371                 return 16;
1372
1373         case CAYMAN:
1374                 return 64;
1375
1376         default:
1377                 R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1378                 return 8;
1379         }
1380 }
1381
1382 static inline boolean last_inst_was_vtx_fetch(struct r600_bytecode *bc)
1383 {
1384         if (bc->chip_class == CAYMAN) {
1385                 if (bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC)
1386                         return TRUE;
1387         } else {
1388                 if (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1389                     bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC)
1390                         return TRUE;
1391         }
1392         return FALSE;
1393 }
1394
1395 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1396 {
1397         struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1398         int r;
1399
1400         if (nvtx == NULL)
1401                 return -ENOMEM;
1402         memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1403
1404         /* cf can contains only alu or only vtx or only tex */
1405         if (bc->cf_last == NULL ||
1406             last_inst_was_vtx_fetch(bc) ||
1407             bc->force_add_cf) {
1408                 r = r600_bytecode_add_cf(bc);
1409                 if (r) {
1410                         free(nvtx);
1411                         return r;
1412                 }
1413                 if (bc->chip_class == CAYMAN)
1414                         bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1415                 else
1416                         bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1417         }
1418         LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1419         /* each fetch use 4 dwords */
1420         bc->cf_last->ndw += 4;
1421         bc->ndw += 4;
1422         if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1423                 bc->force_add_cf = 1;
1424         return 0;
1425 }
1426
1427 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1428 {
1429         struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1430         int r;
1431
1432         if (ntex == NULL)
1433                 return -ENOMEM;
1434         memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1435
1436         /* we can't fetch data und use it as texture lookup address in the same TEX clause */
1437         if (bc->cf_last != NULL &&
1438                 bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
1439                 struct r600_bytecode_tex *ttex;
1440                 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1441                         if (ttex->dst_gpr == ntex->src_gpr) {
1442                                 bc->force_add_cf = 1;
1443                                 break;
1444                         }
1445                 }
1446                 /* slight hack to make gradients always go into same cf */
1447                 if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
1448                         bc->force_add_cf = 1;
1449         }
1450
1451         /* cf can contains only alu or only vtx or only tex */
1452         if (bc->cf_last == NULL ||
1453                 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
1454                 bc->force_add_cf) {
1455                 r = r600_bytecode_add_cf(bc);
1456                 if (r) {
1457                         free(ntex);
1458                         return r;
1459                 }
1460                 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
1461         }
1462         if (ntex->src_gpr >= bc->ngpr) {
1463                 bc->ngpr = ntex->src_gpr + 1;
1464         }
1465         if (ntex->dst_gpr >= bc->ngpr) {
1466                 bc->ngpr = ntex->dst_gpr + 1;
1467         }
1468         LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1469         /* each texture fetch use 4 dwords */
1470         bc->cf_last->ndw += 4;
1471         bc->ndw += 4;
1472         if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1473                 bc->force_add_cf = 1;
1474         return 0;
1475 }
1476
1477 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, int inst)
1478 {
1479         int r;
1480         r = r600_bytecode_add_cf(bc);
1481         if (r)
1482                 return r;
1483
1484         bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1485         bc->cf_last->inst = inst;
1486         return 0;
1487 }
1488
1489 int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
1490 {
1491         return r600_bytecode_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
1492 }
1493
1494 /* common to all 3 families */
1495 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
1496 {
1497         bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1498                         S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1499                         S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1500                         S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1501         if (bc->chip_class < CAYMAN)
1502                 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1503         id++;
1504         bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1505                                 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1506                                 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1507                                 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1508                                 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1509                                 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1510                                 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1511                                 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1512                                 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1513                                 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1514         bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1515                                 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1516         if (bc->chip_class < CAYMAN)
1517                 bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1518         id++;
1519         bc->bytecode[id++] = 0;
1520         return 0;
1521 }
1522
1523 /* common to all 3 families */
1524 static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
1525 {
1526         bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1527                                 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1528                                 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1529                                 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1530         bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1531                                 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1532                                 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1533                                 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1534                                 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1535                                 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1536                                 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1537                                 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1538                                 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1539                                 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1540                                 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1541         bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1542                                 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1543                                 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1544                                 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1545                                 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1546                                 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1547                                 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1548                                 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1549         bc->bytecode[id++] = 0;
1550         return 0;
1551 }
1552
1553 /* r600 only, r700/eg bits in r700_asm.c */
1554 static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
1555 {
1556         /* don't replace gpr by pv or ps for destination register */
1557         bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1558                                 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1559                                 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1560                                 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1561                                 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1562                                 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1563                                 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1564                                 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1565                                 S_SQ_ALU_WORD0_LAST(alu->last);
1566
1567         if (alu->is_op3) {
1568                 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1569                                         S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1570                                         S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1571                                         S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1572                                         S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1573                                         S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1574                                         S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1575                                         S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1576                                         S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1577                                         S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1578         } else {
1579                 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1580                                         S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1581                                         S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1582                                         S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1583                                         S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1584                                         S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1585                                         S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1586                                         S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1587                                         S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1588                                         S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1589                                         S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
1590                                         S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
1591         }
1592         return 0;
1593 }
1594
1595 static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1596 {
1597         *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1598         *bytecode++ = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1599                         S_SQ_CF_WORD1_BARRIER(1) |
1600                         S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1601 }
1602
1603 /* common for r600/r700 - eg in eg_asm.c */
1604 static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
1605 {
1606         unsigned id = cf->id;
1607
1608         switch (cf->inst) {
1609         case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1610         case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1611         case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1612         case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1613                 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1614                         S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1615                         S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1616                         S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1617
1618                 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
1619                         S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1620                         S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1621                         S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1622                                         S_SQ_CF_ALU_WORD1_BARRIER(1) |
1623                                         S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
1624                                         S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1625                 break;
1626         case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1627         case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1628         case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1629                 if (bc->chip_class == R700)
1630                         r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1631                 else
1632                         r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1633                 break;
1634         case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1635         case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1636                 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1637                         S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1638                         S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1639                         S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1640                 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1641                         S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1642                         S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1643                         S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1644                         S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1645                         S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1646                         S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
1647                         S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
1648                 break;
1649         case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1650         case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1651         case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1652         case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1653         case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1654         case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1655         case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1656         case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1657         case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1658                 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1659                 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1660                                         S_SQ_CF_WORD1_BARRIER(1) |
1661                                         S_SQ_CF_WORD1_COND(cf->cond) |
1662                                         S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1663
1664                 break;
1665         default:
1666                 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1667                 return -EINVAL;
1668         }
1669         return 0;
1670 }
1671
1672 int r600_bytecode_build(struct r600_bytecode *bc)
1673 {
1674         struct r600_bytecode_cf *cf;
1675         struct r600_bytecode_alu *alu;
1676         struct r600_bytecode_vtx *vtx;
1677         struct r600_bytecode_tex *tex;
1678         uint32_t literal[4];
1679         unsigned nliteral;
1680         unsigned addr;
1681         int i, r;
1682
1683         if (bc->callstack[0].max > 0)
1684                 bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
1685         if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
1686                 bc->nstack = 1;
1687         }
1688
1689         /* first path compute addr of each CF block */
1690         /* addr start after all the CF instructions */
1691         addr = bc->cf_last->id + 2;
1692         LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1693                 switch (cf->inst) {
1694                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1695                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1696                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1697                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1698                         break;
1699                 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1700                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1701                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1702                         /* fetch node need to be 16 bytes aligned*/
1703                         addr += 3;
1704                         addr &= 0xFFFFFFFCUL;
1705                         break;
1706                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1707                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1708                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1709                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1710                         break;
1711                 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1712                 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1713                 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1714                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1715                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1716                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1717                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1718                 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1719                 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1720                 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1721                         break;
1722                 default:
1723                         R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1724                         return -EINVAL;
1725                 }
1726                 cf->addr = addr;
1727                 addr += cf->ndw;
1728                 bc->ndw = cf->addr + cf->ndw;
1729         }
1730         free(bc->bytecode);
1731         bc->bytecode = calloc(1, bc->ndw * 4);
1732         if (bc->bytecode == NULL)
1733                 return -ENOMEM;
1734         LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1735                 addr = cf->addr;
1736                 if (bc->chip_class >= EVERGREEN)
1737                         r = eg_bytecode_cf_build(bc, cf);
1738                 else
1739                         r = r600_bytecode_cf_build(bc, cf);
1740                 if (r)
1741                         return r;
1742                 switch (cf->inst) {
1743                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1744                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1745                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1746                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1747                         nliteral = 0;
1748                         memset(literal, 0, sizeof(literal));
1749                         LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1750                                 r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
1751                                 if (r)
1752                                         return r;
1753                                 r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
1754                                 switch(bc->chip_class) {
1755                                 case R600:
1756                                         r = r600_bytecode_alu_build(bc, alu, addr);
1757                                         break;
1758                                 case R700:
1759                                 case EVERGREEN: /* eg alu is same encoding as r700 */
1760                                 case CAYMAN: /* eg alu is same encoding as r700 */
1761                                         r = r700_bytecode_alu_build(bc, alu, addr);
1762                                         break;
1763                                 default:
1764                                         R600_ERR("unknown chip class %d.\n", bc->chip_class);
1765                                         return -EINVAL;
1766                                 }
1767                                 if (r)
1768                                         return r;
1769                                 addr += 2;
1770                                 if (alu->last) {
1771                                         for (i = 0; i < align(nliteral, 2); ++i) {
1772                                                 bc->bytecode[addr++] = literal[i];
1773                                         }
1774                                         nliteral = 0;
1775                                         memset(literal, 0, sizeof(literal));
1776                                 }
1777                         }
1778                         break;
1779                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1780                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1781                         LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1782                                 r = r600_bytecode_vtx_build(bc, vtx, addr);
1783                                 if (r)
1784                                         return r;
1785                                 addr += 4;
1786                         }
1787                         break;
1788                 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1789                         if (bc->chip_class == CAYMAN) {
1790                                 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1791                                         r = r600_bytecode_vtx_build(bc, vtx, addr);
1792                                         if (r)
1793                                                 return r;
1794                                         addr += 4;
1795                                 }
1796                         }
1797                         LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1798                                 r = r600_bytecode_tex_build(bc, tex, addr);
1799                                 if (r)
1800                                         return r;
1801                                 addr += 4;
1802                         }
1803                         break;
1804                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1805                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1806                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1807                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1808                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1809                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1810                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1811                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1812                 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1813                 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1814                 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1815                 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1816                 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1817                 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1818                         break;
1819                 default:
1820                         R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1821                         return -EINVAL;
1822                 }
1823         }
1824         return 0;
1825 }
1826
1827 void r600_bytecode_clear(struct r600_bytecode *bc)
1828 {
1829         struct r600_bytecode_cf *cf = NULL, *next_cf;
1830
1831         free(bc->bytecode);
1832         bc->bytecode = NULL;
1833
1834         LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1835                 struct r600_bytecode_alu *alu = NULL, *next_alu;
1836                 struct r600_bytecode_tex *tex = NULL, *next_tex;
1837                 struct r600_bytecode_tex *vtx = NULL, *next_vtx;
1838
1839                 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1840                         free(alu);
1841                 }
1842
1843                 LIST_INITHEAD(&cf->alu);
1844
1845                 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1846                         free(tex);
1847                 }
1848
1849                 LIST_INITHEAD(&cf->tex);
1850
1851                 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1852                         free(vtx);
1853                 }
1854
1855                 LIST_INITHEAD(&cf->vtx);
1856
1857                 free(cf);
1858         }
1859
1860         LIST_INITHEAD(&cf->list);
1861 }
1862
1863 void r600_bytecode_dump(struct r600_bytecode *bc)
1864 {
1865         struct r600_bytecode_cf *cf = NULL;
1866         struct r600_bytecode_alu *alu = NULL;
1867         struct r600_bytecode_vtx *vtx = NULL;
1868         struct r600_bytecode_tex *tex = NULL;
1869
1870         unsigned i, id;
1871         uint32_t literal[4];
1872         unsigned nliteral;
1873         char chip = '6';
1874
1875         switch (bc->chip_class) {
1876         case R700:
1877                 chip = '7';
1878                 break;
1879         case EVERGREEN:
1880                 chip = 'E';
1881                 break;
1882         case CAYMAN:
1883                 chip = 'C';
1884                 break;
1885         case R600:
1886         default:
1887                 chip = '6';
1888                 break;
1889         }
1890         fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
1891         fprintf(stderr, "     %c\n", chip);
1892
1893         LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1894                 id = cf->id;
1895
1896                 switch (cf->inst) {
1897                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1898                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1899                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1900                 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1901                         fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1902                         fprintf(stderr, "ADDR:%d ", cf->addr);
1903                         fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
1904                         fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
1905                         fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
1906                         id++;
1907                         fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1908                         fprintf(stderr, "INST:0x%x ", cf->inst);
1909                         fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
1910                         fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
1911                         fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
1912                         fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
1913                         break;
1914                 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1915                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1916                 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1917                         fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1918                         fprintf(stderr, "ADDR:%d\n", cf->addr);
1919                         id++;
1920                         fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1921                         fprintf(stderr, "INST:0x%x ", cf->inst);
1922                         fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
1923                         break;
1924                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1925                 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1926                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1927                 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1928                         fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1929                         fprintf(stderr, "GPR:%X ", cf->output.gpr);
1930                         fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
1931                         fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
1932                         fprintf(stderr, "TYPE:%X\n", cf->output.type);
1933                         id++;
1934                         fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1935                         fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
1936                         fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
1937                         fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
1938                         fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
1939                         fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
1940                         fprintf(stderr, "INST:0x%x ", cf->output.inst);
1941                         fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
1942                         fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
1943                         break;
1944                 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1945                 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1946                 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1947                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1948                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1949                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1950                 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1951                 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1952                 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1953                 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1954                         fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1955                         fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
1956                         id++;
1957                         fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1958                         fprintf(stderr, "INST:0x%x ", cf->inst);
1959                         fprintf(stderr, "COND:%X ", cf->cond);
1960                         fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
1961                         break;
1962                 }
1963
1964                 id = cf->addr;
1965                 nliteral = 0;
1966                 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1967                         r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
1968
1969                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
1970                         fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
1971                         fprintf(stderr, "REL:%d ", alu->src[0].rel);
1972                         fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
1973                         fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
1974                         fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
1975                         fprintf(stderr, "REL:%d ", alu->src[1].rel);
1976                         fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
1977                         fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
1978                         fprintf(stderr, "LAST:%d)\n", alu->last);
1979                         id++;
1980                         fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
1981                         fprintf(stderr, "INST:0x%x ", alu->inst);
1982                         fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
1983                         fprintf(stderr, "CHAN:%d ", alu->dst.chan);
1984                         fprintf(stderr, "REL:%d ", alu->dst.rel);
1985                         fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
1986                         fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
1987                         if (alu->is_op3) {
1988                                 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
1989                                 fprintf(stderr, "REL:%d ", alu->src[2].rel);
1990                                 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
1991                                 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
1992                         } else {
1993                                 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
1994                                 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
1995                                 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
1996                                 fprintf(stderr, "OMOD:%d ", alu->omod);
1997                                 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
1998                                 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
1999                         }
2000
2001                         id++;
2002                         if (alu->last) {
2003                                 for (i = 0; i < nliteral; i++, id++) {
2004                                         float *f = (float*)(bc->bytecode + id);
2005                                         fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
2006                                 }
2007                                 id += nliteral & 1;
2008                                 nliteral = 0;
2009                         }
2010                 }
2011
2012                 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2013                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2014                         fprintf(stderr, "INST:0x%x ", tex->inst);
2015                         fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
2016                         fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
2017                         fprintf(stderr, "REL:%d)\n", tex->src_rel);
2018                         id++;
2019                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2020                         fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
2021                         fprintf(stderr, "REL:%d ", tex->dst_rel);
2022                         fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
2023                         fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
2024                         fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
2025                         fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
2026                         fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
2027                         fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
2028                         fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
2029                         fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
2030                         fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
2031                         id++;
2032                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2033                         fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
2034                         fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
2035                         fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
2036                         fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
2037                         fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
2038                         fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
2039                         fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
2040                         fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
2041                         id++;
2042                         fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2043                         id++;
2044                 }
2045
2046                 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2047                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2048                         fprintf(stderr, "INST:%d ", vtx->inst);
2049                         fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2050                         fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2051                         id++;
2052                         /* This assumes that no semantic fetches exist */
2053                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2054                         fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2055                         fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2056                         if (bc->chip_class < CAYMAN)
2057                                 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2058                         else
2059                                 fprintf(stderr, "SEL_Y:%d) ", 0);
2060                         fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2061                         fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2062                         fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2063                         fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2064                         fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2065                         fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2066                         fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
2067                         fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2068                         fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2069                         fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2070                         id++;
2071                         fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2072                         fprintf(stderr, "ENDIAN:%d ", vtx->endian);
2073                         fprintf(stderr, "OFFSET:%d\n", vtx->offset);
2074                         /* TODO */
2075                         id++;
2076                         fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2077                         id++;
2078                 }
2079         }
2080
2081         fprintf(stderr, "--------------------------------------\n");
2082 }
2083
2084 static void r600_vertex_data_type(enum pipe_format pformat,
2085                                   unsigned *format,
2086                                   unsigned *num_format, unsigned *format_comp, unsigned *endian)
2087 {
2088         const struct util_format_description *desc;
2089         unsigned i;
2090
2091         *format = 0;
2092         *num_format = 0;
2093         *format_comp = 0;
2094         *endian = ENDIAN_NONE;
2095
2096         desc = util_format_description(pformat);
2097         if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2098                 goto out_unknown;
2099         }
2100
2101         /* Find the first non-VOID channel. */
2102         for (i = 0; i < 4; i++) {
2103                 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2104                         break;
2105                 }
2106         }
2107
2108         *endian = r600_endian_swap(desc->channel[i].size);
2109
2110         switch (desc->channel[i].type) {
2111         /* Half-floats, floats, ints */
2112         case UTIL_FORMAT_TYPE_FLOAT:
2113                 switch (desc->channel[i].size) {
2114                 case 16:
2115                         switch (desc->nr_channels) {
2116                         case 1:
2117                                 *format = FMT_16_FLOAT;
2118                                 break;
2119                         case 2:
2120                                 *format = FMT_16_16_FLOAT;
2121                                 break;
2122                         case 3:
2123                         case 4:
2124                                 *format = FMT_16_16_16_16_FLOAT;
2125                                 break;
2126                         }
2127                         break;
2128                 case 32:
2129                         switch (desc->nr_channels) {
2130                         case 1:
2131                                 *format = FMT_32_FLOAT;
2132                                 break;
2133                         case 2:
2134                                 *format = FMT_32_32_FLOAT;
2135                                 break;
2136                         case 3:
2137                                 *format = FMT_32_32_32_FLOAT;
2138                                 break;
2139                         case 4:
2140                                 *format = FMT_32_32_32_32_FLOAT;
2141                                 break;
2142                         }
2143                         break;
2144                 default:
2145                         goto out_unknown;
2146                 }
2147                 break;
2148                 /* Unsigned ints */
2149         case UTIL_FORMAT_TYPE_UNSIGNED:
2150                 /* Signed ints */
2151         case UTIL_FORMAT_TYPE_SIGNED:
2152                 switch (desc->channel[i].size) {
2153                 case 8:
2154                         switch (desc->nr_channels) {
2155                         case 1:
2156                                 *format = FMT_8;
2157                                 break;
2158                         case 2:
2159                                 *format = FMT_8_8;
2160                                 break;
2161                         case 3:
2162                         case 4:
2163                                 *format = FMT_8_8_8_8;
2164                                 break;
2165                         }
2166                         break;
2167                 case 10:
2168                         if (desc->nr_channels != 4)
2169                                 goto out_unknown;
2170
2171                         *format = FMT_2_10_10_10;
2172                         break;
2173                 case 16:
2174                         switch (desc->nr_channels) {
2175                         case 1:
2176                                 *format = FMT_16;
2177                                 break;
2178                         case 2:
2179                                 *format = FMT_16_16;
2180                                 break;
2181                         case 3:
2182                         case 4:
2183                                 *format = FMT_16_16_16_16;
2184                                 break;
2185                         }
2186                         break;
2187                 case 32:
2188                         switch (desc->nr_channels) {
2189                         case 1:
2190                                 *format = FMT_32;
2191                                 break;
2192                         case 2:
2193                                 *format = FMT_32_32;
2194                                 break;
2195                         case 3:
2196                                 *format = FMT_32_32_32;
2197                                 break;
2198                         case 4:
2199                                 *format = FMT_32_32_32_32;
2200                                 break;
2201                         }
2202                         break;
2203                 default:
2204                         goto out_unknown;
2205                 }
2206                 break;
2207         default:
2208                 goto out_unknown;
2209         }
2210
2211         if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2212                 *format_comp = 1;
2213         }
2214
2215         *num_format = 0;
2216         if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2217             desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2218                 if (!desc->channel[i].normalized) {
2219                         if (desc->channel[i].pure_integer)
2220                                 *num_format = 1;
2221                         else
2222                                 *num_format = 2;
2223                 }
2224         }
2225         return;
2226 out_unknown:
2227         R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2228 }
2229
2230 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2231 {
2232         static int dump_shaders = -1;
2233
2234         struct r600_bytecode bc;
2235         struct r600_bytecode_vtx vtx;
2236         struct pipe_vertex_element *elements = ve->elements;
2237         const struct util_format_description *desc;
2238         unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
2239         unsigned format, num_format, format_comp, endian;
2240         u32 *bytecode;
2241         int i, r;
2242
2243         /* Vertex element offsets need special handling. If the offset is
2244          * bigger than what we can put in the fetch instruction we need to
2245          * alter the vertex resource offset. In order to simplify code we
2246          * will bind one resource per element in such cases. It's a worst
2247          * case scenario. */
2248         for (i = 0; i < ve->count; i++) {
2249                 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
2250                 if (ve->vbuffer_offset[i]) {
2251                         ve->vbuffer_need_offset = 1;
2252                 }
2253         }
2254
2255         memset(&bc, 0, sizeof(bc));
2256         r600_bytecode_init(&bc, rctx->chip_class);
2257
2258         for (i = 0; i < ve->count; i++) {
2259                 if (elements[i].instance_divisor > 1) {
2260                         struct r600_bytecode_alu alu;
2261
2262                         memset(&alu, 0, sizeof(alu));
2263                         alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2264                         alu.src[0].sel = 0;
2265                         alu.src[0].chan = 3;
2266
2267                         alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2268                         alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2269
2270                         alu.dst.sel = i + 1;
2271                         alu.dst.chan = 3;
2272                         alu.dst.write = 1;
2273                         alu.last = 1;
2274
2275                         if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2276                                 r600_bytecode_clear(&bc);
2277                                 return r;
2278                         }
2279                 }
2280         }
2281
2282         for (i = 0; i < ve->count; i++) {
2283                 unsigned vbuffer_index;
2284                 r600_vertex_data_type(ve->elements[i].src_format,
2285                                       &format, &num_format, &format_comp, &endian);
2286                 desc = util_format_description(ve->elements[i].src_format);
2287                 if (desc == NULL) {
2288                         r600_bytecode_clear(&bc);
2289                         R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2290                         return -EINVAL;
2291                 }
2292
2293                 /* see above for vbuffer_need_offset explanation */
2294                 vbuffer_index = elements[i].vertex_buffer_index;
2295                 memset(&vtx, 0, sizeof(vtx));
2296                 vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start;
2297                 vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2298                 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2299                 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2300                 vtx.mega_fetch_count = 0x1F;
2301                 vtx.dst_gpr = i + 1;
2302                 vtx.dst_sel_x = desc->swizzle[0];
2303                 vtx.dst_sel_y = desc->swizzle[1];
2304                 vtx.dst_sel_z = desc->swizzle[2];
2305                 vtx.dst_sel_w = desc->swizzle[3];
2306                 vtx.data_format = format;
2307                 vtx.num_format_all = num_format;
2308                 vtx.format_comp_all = format_comp;
2309                 vtx.srf_mode_all = 1;
2310                 vtx.offset = elements[i].src_offset;
2311                 vtx.endian = endian;
2312
2313                 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2314                         r600_bytecode_clear(&bc);
2315                         return r;
2316                 }
2317         }
2318
2319         r600_bytecode_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2320
2321         if ((r = r600_bytecode_build(&bc))) {
2322                 r600_bytecode_clear(&bc);
2323                 return r;
2324         }
2325
2326         if (dump_shaders == -1)
2327                 dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2328
2329         if (dump_shaders) {
2330                 fprintf(stderr, "--------------------------------------------------------------\n");
2331                 r600_bytecode_dump(&bc);
2332                 fprintf(stderr, "______________________________________________________________\n");
2333         }
2334
2335         ve->fs_size = bc.ndw*4;
2336
2337         ve->fetch_shader = (struct r600_resource*)
2338                         pipe_buffer_create(rctx->context.screen,
2339                                            PIPE_BIND_CUSTOM,
2340                                            PIPE_USAGE_IMMUTABLE, ve->fs_size);
2341         if (ve->fetch_shader == NULL) {
2342                 r600_bytecode_clear(&bc);
2343                 return -ENOMEM;
2344         }
2345
2346         bytecode = rctx->ws->buffer_map(ve->fetch_shader->buf, rctx->ctx.cs, PIPE_TRANSFER_WRITE);
2347         if (bytecode == NULL) {
2348                 r600_bytecode_clear(&bc);
2349                 pipe_resource_reference((struct pipe_resource**)&ve->fetch_shader, NULL);
2350                 return -ENOMEM;
2351         }
2352
2353         if (R600_BIG_ENDIAN) {
2354                 for (i = 0; i < ve->fs_size / 4; ++i) {
2355                         bytecode[i] = bswap_32(bc.bytecode[i]);
2356                 }
2357         } else {
2358                 memcpy(bytecode, bc.bytecode, ve->fs_size);
2359         }
2360
2361         rctx->ws->buffer_unmap(ve->fetch_shader->buf);
2362         r600_bytecode_clear(&bc);
2363
2364         if (rctx->chip_class >= EVERGREEN)
2365                 evergreen_fetch_shader(&rctx->context, ve);
2366         else
2367                 r600_fetch_shader(&rctx->context, ve);
2368
2369         return 0;
2370 }