1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Rob Clark <robclark@freedesktop.org>
31 #include "pipe/p_state.h"
32 #include "util/u_string.h"
33 #include "util/u_memory.h"
34 #include "util/u_inlines.h"
35 #include "tgsi/tgsi_lowering.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_ureg.h"
38 #include "tgsi/tgsi_info.h"
39 #include "tgsi/tgsi_strings.h"
40 #include "tgsi/tgsi_dump.h"
41 #include "tgsi/tgsi_scan.h"
43 #include "freedreno_util.h"
45 #include "ir3_compiler.h"
46 #include "ir3_shader.h"
48 #include "instr-a3xx.h"
/* Per-shader-variant state for the TGSI -> ir3 translation pass.
 * NOTE(review): this listing appears to have gaps (line numbers jump);
 * some members/braces are not visible here.
 */
51 struct ir3_compile_context {
52 const struct tgsi_token *tokens;
55 struct ir3_shader_variant *so;
57 struct ir3_block *block;
58 struct ir3_instruction *current_instr;
60 /* we need to defer updates to block->outputs[] until the end
61 * of an instruction (so we don't see new value until *after*
62 * the src registers are processed)
65 struct ir3_instruction *instr, **instrp;
67 unsigned num_output_updates;
69 /* are we in a sequence of "atomic" instructions?
73 /* For fragment shaders, from the hw perspective the only
74 * actual input is r0.xy position register passed to bary.f.
75 * But TGSI doesn't know that, it still declares things as
76 * IN[] registers. So we do all the input tracking normally
77 * and fix things up after compile_instructions()
79 * NOTE that frag_pos is the hardware position (possibly it
80 * is actually an index or tag or some such.. it is *not*
81 * values that can be directly used for gl_FragCoord..)
83 struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
85 struct tgsi_parse_context parser;
88 struct tgsi_shader_info info;
90 /* for calculating input/output positions/linkages: */
/* scratch temporaries handed out by get_internal_temp(): */
93 unsigned num_internal_temps;
94 struct tgsi_src_register internal_temps[8];
96 /* idx/slot for last compiler generated immediate */
97 unsigned immediate_idx;
99 /* stack of branch instructions that mark (potentially nested)
100 * branch if/else/loop/etc
103 struct ir3_instruction *instr, *cond;
104 bool inv; /* true iff in else leg of branch */
106 unsigned int branch_count;
108 /* list of kill instructions: */
109 struct ir3_instruction *kill[16];
110 unsigned int kill_count;
112 /* used when dst is same as one of the src, to avoid overwriting a
113 * src element before the remaining scalar instructions that make
114 * up the vector operation
116 struct tgsi_dst_register tmp_dst;
117 struct tgsi_src_register *tmp_src;
119 /* just for catching incorrect use of get_dst()/put_dst():
/* forward declarations for helpers used before their definitions: */
125 static void vectorize(struct ir3_compile_context *ctx,
126 struct ir3_instruction *instr, struct tgsi_dst_register *dst,
128 static void create_mov(struct ir3_compile_context *ctx,
129 struct tgsi_dst_register *dst, struct tgsi_src_register *src);
130 static type_t get_ftype(struct ir3_compile_context *ctx);
/* Initialize the compile context for one shader variant: run the TGSI
 * lowering pass (two-sided color, tex saturate workarounds, etc),
 * reject shaders the backend cannot handle (relative addressing of
 * TEMP/IN/OUT), and set up constant/immediate layout before parsing.
 * NOTE(review): lines appear to be missing from this listing (e.g.
 * the switch on shader type and several returns).
 */
133 compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
134 const struct tgsi_token *tokens)
137 struct tgsi_shader_info *info = &ctx->info;
138 struct tgsi_lowering_config lconfig = {
139 .color_two_side = so->key.color_two_side,
/* saturate lowering config differs per shader stage: */
157 case SHADER_FRAGMENT:
159 lconfig.saturate_s = so->key.fsaturate_s;
160 lconfig.saturate_t = so->key.fsaturate_t;
161 lconfig.saturate_r = so->key.fsaturate_r;
164 lconfig.saturate_s = so->key.vsaturate_s;
165 lconfig.saturate_t = so->key.vsaturate_t;
166 lconfig.saturate_r = so->key.vsaturate_r;
/* lowering returns NULL if no transform was needed; in that case we
 * use (and must not free) the caller's token array:
 */
170 ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
171 ctx->free_tokens = !!ctx->tokens;
174 ctx->tokens = tokens;
179 ctx->num_internal_temps = 0;
180 ctx->branch_count = 0;
183 ctx->current_instr = NULL;
184 ctx->num_output_updates = 0;
186 ctx->frag_pos = NULL;
187 ctx->frag_face = NULL;
189 ctx->using_tmp_dst = false;
191 memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
193 #define FM(x) (1 << TGSI_FILE_##x)
194 /* optimize can't deal with relative addressing: */
195 if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
196 return TGSI_PARSE_ERROR;
198 /* NOTE: if relative addressing is used, we set constlen in
199 * the compiler (to worst-case value) since we don't know in
200 * the assembler what the max addr reg value can be:
202 if (info->indirect_files & FM(CONSTANT))
203 so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);
205 /* Immediates go after constants: */
206 so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
207 ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
209 ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
210 if (ret != TGSI_PARSE_OK)
213 ctx->type = ctx->parser.FullHeader.Processor.Processor;
/* Report a fatal compile error: print the message and dump the TGSI
 * being compiled to aid debugging.
 */
219 compile_error(struct ir3_compile_context *ctx, const char *format, ...)
222 va_start(ap, format);
223 _debug_vprintf(format, ap);
225 tgsi_dump(ctx->tokens, 0);
/* assert-style helper that routes failures through compile_error(): */
229 #define compile_assert(ctx, cond) do { \
230 if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
/* Release resources owned by the context; tokens are only freed when
 * they were allocated by the lowering pass (see compile_init).
 */
234 compile_free(struct ir3_compile_context *ctx)
236 if (ctx->free_tokens)
237 free((void *)ctx->tokens);
238 tgsi_parse_free(&ctx->parser);
/* Table entry mapping one TGSI opcode to its translation handler. */
241 struct instr_translater {
242 void (*fxn)(const struct instr_translater *t,
243 struct ir3_compile_context *ctx,
244 struct tgsi_full_instruction *inst);
247 opc_t hopc; /* opc to use for half_precision mode, if different */
252 instr_finish(struct ir3_compile_context *ctx)
259 for (i = 0; i < ctx->num_output_updates; i++)
260 *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
262 ctx->num_output_updates = 0;
265 /* For "atomic" groups of instructions, for example the four scalar
266 * instructions to perform a vec4 operation. Basically this just
267 * blocks out handling of output_updates so the next scalar instruction
268 * still sees the result from before the start of the atomic group.
270 * NOTE: when used properly, this could probably replace get/put_dst()
274 instr_atomic_start(struct ir3_compile_context *ctx)
280 instr_atomic_end(struct ir3_compile_context *ctx)
286 static struct ir3_instruction *
287 instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
290 return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
293 static struct ir3_instruction *
294 instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
297 return (ctx->current_instr = ir3_instr_clone(instr));
/* Enter a new (possibly nested) block, sizing its temp/input/output
 * arrays from the TGSI declarations (4 scalar regs per vec4 register).
 */
300 static struct ir3_block *
301 push_block(struct ir3_compile_context *ctx)
303 struct ir3_block *block;
304 unsigned ntmp, nin, nout;
306 #define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
308 /* hmm, give ourselves room to create 8 extra temporaries (vec4):
310 ntmp = SCALAR_REGS(TEMPORARY);
313 nout = SCALAR_REGS(OUTPUT);
314 nin = SCALAR_REGS(INPUT);
316 /* for outermost block, 'inputs' are the actual shader INPUT
317 * register file. Reads from INPUT registers always go back to
318 * top block. For nested blocks, 'inputs' is used to track any
319 * TEMPORARY file register from one of the enclosing blocks that
320 * is ready in this block.
323 /* NOTE: fragment shaders actually have two inputs (r0.xy, the
326 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
328 if (ctx->info.reads_position)
330 if (ctx->info.uses_frontface)
/* reserve extra output slots for kill instructions (top block only,
 * see the adjustment below):
 */
333 nout += ARRAY_SIZE(ctx->kill);
339 block = ir3_block_create(ctx->ir, ntmp, nin, nout);
341 if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
342 block->noutputs -= ARRAY_SIZE(ctx->kill);
344 block->parent = ctx->block;
/* Leave the current block, returning to its parent; must never pop
 * the outermost block.
 */
351 pop_block(struct ir3_compile_context *ctx)
353 ctx->block = ctx->block->parent;
354 compile_assert(ctx, ctx->block);
/* Create a meta OUTPUT node binding 'instr' to output slot n. */
357 static struct ir3_instruction *
358 create_output(struct ir3_block *block, struct ir3_instruction *instr,
361 struct ir3_instruction *out;
363 out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
364 out->inout.block = block;
365 ir3_reg_create(out, n, 0);
367 ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
/* Create a meta INPUT node for slot n, optionally fed by 'instr'
 * (an enclosing block's value for nested blocks).
 */
372 static struct ir3_instruction *
373 create_input(struct ir3_block *block, struct ir3_instruction *instr,
376 struct ir3_instruction *in;
378 in = ir3_instr_create(block, -1, OPC_META_INPUT);
379 in->inout.block = block;
380 ir3_reg_create(in, n, 0);
382 ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
/* Resolve a read of INPUT register slot n: always resolved against the
 * top-level block (recursing up through parents).
 */
387 static struct ir3_instruction *
388 block_input(struct ir3_block *block, unsigned n)
390 /* references to INPUT register file always go back up to
394 return block_input(block->parent, n);
395 return block->inputs[n];
398 /* return temporary in scope, creating if needed meta-input node
399 * to track block inputs
401 static struct ir3_instruction *
402 block_temporary(struct ir3_block *block, unsigned n)
404 /* references to TEMPORARY register file, find the nearest
405 * enclosing block which has already assigned this temporary,
406 * creating meta-input instructions along the way to keep
407 * track of block inputs
409 if (block->parent && !block->temporaries[n]) {
410 /* if already have input for this block, reuse: */
411 if (!block->inputs[n])
412 block->inputs[n] = block_temporary(block->parent, n);
414 /* and create new input to return: */
415 return create_input(block, block->inputs[n], n);
417 return block->temporaries[n];
/* Emit a mov of a float immediate into a fresh SSA value.  Bypasses
 * instr_create() deliberately: the immediate mov must not become the
 * "current" instruction or participate in deferred output updates.
 */
420 static struct ir3_instruction *
421 create_immed(struct ir3_compile_context *ctx, float val)
423 /* NOTE: *don't* use instr_create() here!
425 struct ir3_instruction *instr;
426 instr = ir3_instr_create(ctx->block, 1, 0);
427 instr->cat1.src_type = get_ftype(ctx);
428 instr->cat1.dst_type = get_ftype(ctx);
429 ir3_reg_create(instr, 0, 0);
430 ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
/* Record that 'instr' produces the scalar value for dst channel 'chan'.
 * The actual update of the block's temporaries[]/outputs[]/address
 * pointer is deferred to instr_finish() so srcs of the same TGSI
 * instruction still read the previous value.
 */
435 ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
436 const struct tgsi_dst_register *dst, unsigned chan)
438 unsigned n = regid(dst->Index, chan);
439 unsigned idx = ctx->num_output_updates;
441 compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
443 /* NOTE: defer update of temporaries[idx] or output[idx]
444 * until instr_finish(), so that if the current instruction
445 * reads the same TEMP/OUT[] it gets the old value:
447 * bleh.. this might be a bit easier to just figure out
448 * in instr_finish(). But at that point we've already
449 * lost information about OUTPUT vs TEMPORARY register
454 case TGSI_FILE_OUTPUT:
455 compile_assert(ctx, n < ctx->block->noutputs);
456 ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
457 ctx->output_updates[idx].instr = instr;
458 ctx->num_output_updates++;
460 case TGSI_FILE_TEMPORARY:
461 compile_assert(ctx, n < ctx->block->ntemporaries);
462 ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
463 ctx->output_updates[idx].instr = instr;
464 ctx->num_output_updates++;
/* address register (a0) is a single scalar slot: */
466 case TGSI_FILE_ADDRESS:
467 compile_assert(ctx, n < 1);
468 ctx->output_updates[idx].instrp = &ctx->block->address;
469 ctx->output_updates[idx].instr = instr;
470 ctx->num_output_updates++;
/* Resolve an SSA src: point 'reg' at the instruction that last wrote
 * the given TGSI register channel.  Undefined reads are patched with a
 * 0.0 immediate rather than left dangling.
 */
476 ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
477 const struct tgsi_src_register *src, unsigned chan)
479 struct ir3_block *block = ctx->block;
480 unsigned n = regid(src->Index, chan);
483 case TGSI_FILE_INPUT:
484 reg->flags |= IR3_REG_SSA;
485 reg->instr = block_input(ctx->block, n);
487 case TGSI_FILE_OUTPUT:
488 /* really this should just happen in case of 'MOV_SAT OUT[n], ..',
489 * for the following clamp instructions:
491 reg->flags |= IR3_REG_SSA;
492 reg->instr = block->outputs[n];
493 /* we don't have to worry about read from an OUTPUT that was
494 * assigned outside of the current block, because the _SAT
495 * clamp instructions will always be in the same block as
496 * the original instruction which wrote the OUTPUT
498 compile_assert(ctx, reg->instr);
500 case TGSI_FILE_TEMPORARY:
501 reg->flags |= IR3_REG_SSA;
502 reg->instr = block_temporary(ctx->block, n);
506 if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
507 /* this can happen when registers (or components of a TGSI
508 * register) are used as src before they have been assigned
509 * (undefined contents). To avoid confusing the rest of the
510 * compiler, and to generally keep things peachy, substitute
511 * an instruction that sets the src to 0.0. Or to keep
512 * things undefined, I could plug in a random number? :-P
514 * NOTE: *don't* use instr_create() here!
516 reg->instr = create_immed(ctx, 0.0);
/* Add a dst register to 'instr' for the given TGSI dst/channel, with a
 * write-mask.  For multi-component writes, per-channel OPC_META_FO
 * "fan-out" nodes are created so each scalar result has its own SSA
 * producer for ssa_dst() tracking.
 */
520 static struct ir3_register *
521 add_dst_reg_wrmask(struct ir3_compile_context *ctx,
522 struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
523 unsigned chan, unsigned wrmask)
525 unsigned flags = 0, num = 0;
526 struct ir3_register *reg;
529 case TGSI_FILE_OUTPUT:
530 case TGSI_FILE_TEMPORARY:
533 case TGSI_FILE_ADDRESS:
534 flags |= IR3_REG_ADDR;
538 compile_error(ctx, "unsupported dst register file: %s\n",
539 tgsi_file_name(dst->File));
544 flags |= IR3_REG_RELATIV;
546 reg = ir3_reg_create(instr, regid(num, chan), flags);
548 /* NOTE: do not call ssa_dst() if atomic.. vectorize()
549 * itself will call ssa_dst(). This is to filter out
550 * the (initially bogus) .x component dst which is
551 * created (but not necessarily used, ie. if the net
552 * result of the vector operation does not write to
556 reg->wrmask = wrmask;
560 ssa_dst(ctx, instr, dst, chan);
561 } else if ((dst->File == TGSI_FILE_TEMPORARY) ||
562 (dst->File == TGSI_FILE_OUTPUT) ||
563 (dst->File == TGSI_FILE_ADDRESS)) {
566 /* if instruction writes multiple, we need to create
567 * some place-holder collect the registers:
569 for (i = 0; i < 4; i++) {
570 if (wrmask & (1 << i)) {
571 struct ir3_instruction *collect =
572 ir3_instr_create(ctx->block, -1, OPC_META_FO);
574 /* unused dst reg: */
575 ir3_reg_create(collect, 0, 0);
576 /* and src reg used to hold original instr */
577 ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
579 ssa_dst(ctx, collect, dst, chan+i);
/* single-channel convenience wrapper (wrmask == .x only): */
587 static struct ir3_register *
588 add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
589 const struct tgsi_dst_register *dst, unsigned chan)
591 return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
/* Add a src register to 'instr' for the given TGSI src/channel, with a
 * read-mask.  Handles const/immediate files, relative addressing (via
 * an OPC_META_DEREF wrapper), and multi-component reads (via an
 * OPC_META_FI "fan-in" collect node).
 */
594 static struct ir3_register *
595 add_src_reg_wrmask(struct ir3_compile_context *ctx,
596 struct ir3_instruction *instr, const struct tgsi_src_register *src,
597 unsigned chan, unsigned wrmask)
599 unsigned flags = 0, num = 0;
600 struct ir3_register *reg;
601 struct ir3_instruction *orig = NULL;
604 case TGSI_FILE_IMMEDIATE:
605 /* TODO if possible, use actual immediate instead of const.. but
606 * TGSI has vec4 immediates, we can only embed scalar (of limited
607 * size, depending on instruction..)
609 flags |= IR3_REG_CONST;
610 num = src->Index + ctx->so->first_immediate;
612 case TGSI_FILE_CONSTANT:
613 flags |= IR3_REG_CONST;
616 case TGSI_FILE_OUTPUT:
617 /* NOTE: we should only end up w/ OUTPUT file for things like
618 * clamp()'ing saturated dst instructions
620 case TGSI_FILE_INPUT:
621 case TGSI_FILE_TEMPORARY:
625 compile_error(ctx, "unsupported src register file: %s\n",
626 tgsi_file_name(src->File));
630 /* We seem to have 8 bits (6.2) for dst register always, so I think
631 * it is safe to assume GPR cannot be >=64
633 * cat3 instructions only have 8 bits for src2, but cannot take a
636 * cat5 and cat6 in some cases only has 8 bits, but cannot take a
639 * Other than that we seem to have 12 bits to encode const src,
640 * except for cat1 which may only have 11 bits (but that seems like
643 if (flags & IR3_REG_CONST)
644 compile_assert(ctx, src->Index < (1 << 9));
646 compile_assert(ctx, src->Index < (1 << 6));
649 flags |= IR3_REG_ABS;
651 flags |= IR3_REG_NEGATE;
654 flags |= IR3_REG_RELATIV;
656 /* shouldn't happen, and we can't cope with it below: */
657 compile_assert(ctx, wrmask == 0x1);
659 /* wrap in a meta-deref to track both the src and address: */
662 instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
663 ir3_reg_create(instr, 0, 0);
664 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
667 reg = ir3_reg_create(instr, regid(num, chan), flags);
669 reg->wrmask = wrmask;
672 ssa_src(ctx, reg, src, chan);
673 } else if ((src->File == TGSI_FILE_TEMPORARY) ||
674 (src->File == TGSI_FILE_OUTPUT) ||
675 (src->File == TGSI_FILE_INPUT)) {
676 struct ir3_instruction *collect;
679 compile_assert(ctx, !src->Indirect);
681 /* if instruction reads multiple, we need to create
682 * some place-holder collect the registers:
684 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
685 ir3_reg_create(collect, 0, 0); /* unused dst reg */
687 for (i = 0; i < 4; i++) {
688 if (wrmask & (1 << i)) {
689 /* and src reg used point to the original instr */
690 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
/* NOTE(review): '(i << i)' below looks like a typo for '(1 << i)' --
 * as written, no trailing placeholder is emitted for i==0 and the mask
 * test is wrong for other i; verify against upstream before relying
 * on the remaining-components check.
 */
692 } else if (wrmask & ~((i << i) - 1)) {
693 /* if any remaining components, then dummy
694 * placeholder src reg to fill in the blanks:
696 ir3_reg_create(collect, 0, 0);
700 reg->flags |= IR3_REG_SSA;
701 reg->instr = collect;
705 reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
/* single-channel convenience wrapper (wrmask == .x only): */
711 static struct ir3_register *
712 add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
713 const struct tgsi_src_register *src, unsigned chan)
715 return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
/* Build a src register referring to the same TGSI register as 'dst',
 * with identity swizzle.
 */
719 src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
721 src->File = dst->File;
722 src->Indirect = dst->Indirect;
723 src->Dimension = dst->Dimension;
724 src->Index = dst->Index;
727 src->SwizzleX = TGSI_SWIZZLE_X;
728 src->SwizzleY = TGSI_SWIZZLE_Y;
729 src->SwizzleZ = TGSI_SWIZZLE_Z;
730 src->SwizzleW = TGSI_SWIZZLE_W;
733 /* Get internal-temp src/dst to use for a sequence of instructions
734 * generated by a single TGSI op.
736 static struct tgsi_src_register *
737 get_internal_temp(struct ir3_compile_context *ctx,
738 struct tgsi_dst_register *tmp_dst)
740 struct tgsi_src_register *tmp_src;
743 tmp_dst->File = TGSI_FILE_TEMPORARY;
744 tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
745 tmp_dst->Indirect = 0;
746 tmp_dst->Dimension = 0;
748 /* assign next temporary: */
749 n = ctx->num_internal_temps++;
750 compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
751 tmp_src = &ctx->internal_temps[n];
/* temps are allocated just past the shader's declared TEMPORARYs: */
753 tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
755 src_from_dst(tmp_src, tmp_dst);
/* predicate: src reads from a constant-like file (CONST or IMMED): */
761 is_const(struct tgsi_src_register *src)
763 return (src->File == TGSI_FILE_CONSTANT) ||
764 (src->File == TGSI_FILE_IMMEDIATE);
/* predicate: src uses relative (indirect) addressing: */
768 is_relative(struct tgsi_src_register *src)
770 return src->Indirect;
/* predicate: src cannot be used directly by instrs needing a GPR: */
774 is_rel_or_const(struct tgsi_src_register *src)
776 return is_relative(src) || is_const(src);
/* type helpers: float/unsigned/signed type for full vs half precision
 * (bodies not visible in this listing):
 */
780 get_ftype(struct ir3_compile_context *ctx)
786 get_utype(struct ir3_compile_context *ctx)
792 get_stype(struct ir3_compile_context *ctx)
/* return the swizzle selector of 'src' for channel 'chan' (0..3): */
798 src_swiz(struct tgsi_src_register *src, int chan)
801 case 0: return src->SwizzleX;
802 case 1: return src->SwizzleY;
803 case 2: return src->SwizzleZ;
804 case 3: return src->SwizzleW;
810 /* for instructions that cannot take a const register as src, if needed
811 * generate a move to temporary gpr:
813 static struct tgsi_src_register *
814 get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
816 struct tgsi_dst_register tmp_dst;
817 struct tgsi_src_register *tmp_src;
819 compile_assert(ctx, is_rel_or_const(src));
821 tmp_src = get_internal_temp(ctx, &tmp_dst);
823 create_mov(ctx, &tmp_dst, src);
/* Find (or allocate) a slot in the variant's immediate table holding
 * 'val', and point 'reg' at it as an IMMEDIATE src with the matching
 * swizzle.  NOTE(review): the '-val' comparison suggests negated
 * immediates can be reused via a negate flag -- surrounding lines not
 * visible here to confirm.
 */
829 get_immediate(struct ir3_compile_context *ctx,
830 struct tgsi_src_register *reg, uint32_t val)
832 unsigned neg, swiz, idx, i;
833 /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
834 static const unsigned swiz2tgsi[] = {
835 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
838 for (i = 0; i < ctx->immediate_idx; i++) {
842 if (ctx->so->immediates[idx].val[swiz] == val) {
847 if (ctx->so->immediates[idx].val[swiz] == -val) {
853 if (i == ctx->immediate_idx) {
854 /* need to generate a new immediate: */
858 ctx->so->immediates[idx].val[swiz] = val;
859 ctx->so->immediates_count = idx + 1;
860 ctx->immediate_idx++;
863 reg->File = TGSI_FILE_IMMEDIATE;
869 reg->SwizzleX = swiz2tgsi[swiz];
870 reg->SwizzleY = swiz2tgsi[swiz];
871 reg->SwizzleZ = swiz2tgsi[swiz];
872 reg->SwizzleW = swiz2tgsi[swiz];
/* Emit per-channel mov(s) of src into dst for every channel enabled in
 * dst->WriteMask.  abs/neg modifiers are handled via absneg.f since
 * plain mov cannot encode them.
 */
876 create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
877 struct tgsi_src_register *src)
879 type_t type_mov = get_ftype(ctx);
882 for (i = 0; i < 4; i++) {
883 /* move to destination: */
884 if (dst->WriteMask & (1 << i)) {
885 struct ir3_instruction *instr;
887 if (src->Absolute || src->Negate) {
888 /* can't have abs or neg on a mov instr, so use
889 * absneg.f instead to handle these cases:
891 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
893 instr = instr_create(ctx, 1, 0);
894 instr->cat1.src_type = type_mov;
895 instr->cat1.dst_type = type_mov;
898 add_dst_reg(ctx, instr, dst, i);
899 add_src_reg(ctx, instr, src, src_swiz(src, i));
/* clamp dst to [minval, maxval]: expands to max.f then min.f: */
905 create_clamp(struct ir3_compile_context *ctx,
906 struct tgsi_dst_register *dst, struct tgsi_src_register *val,
907 struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
909 struct ir3_instruction *instr;
911 instr = instr_create(ctx, 2, OPC_MAX_F);
912 vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
914 instr = instr_create(ctx, 2, OPC_MIN_F);
915 vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
/* clamp dst in place between two immediate (bit-pattern) bounds: */
919 create_clamp_imm(struct ir3_compile_context *ctx,
920 struct tgsi_dst_register *dst,
921 uint32_t minval, uint32_t maxval)
923 struct tgsi_src_register minconst, maxconst;
924 struct tgsi_src_register src;
926 src_from_dst(&src, dst);
928 get_immediate(ctx, &minconst, minval);
929 get_immediate(ctx, &maxconst, maxval);
931 create_clamp(ctx, dst, &src, &minconst, &maxconst);
/* Return the dst register to write for 'inst'.  If any src aliases the
 * dst (same file+index with a non-identity overlap), an internal temp
 * is substituted so the vector expansion doesn't clobber srcs; put_dst()
 * later movs the temp into the real dst.  Must be paired with put_dst().
 */
934 static struct tgsi_dst_register *
935 get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
937 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
940 compile_assert(ctx, !ctx->using_tmp_dst);
941 ctx->using_tmp_dst = true;
943 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
944 struct tgsi_src_register *src = &inst->Src[i].Register;
945 if ((src->File == dst->File) && (src->Index == dst->Index)) {
/* full write with identity swizzle cannot self-clobber: */
946 if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
947 (src->SwizzleX == TGSI_SWIZZLE_X) &&
948 (src->SwizzleY == TGSI_SWIZZLE_Y) &&
949 (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
950 (src->SwizzleW == TGSI_SWIZZLE_W))
952 ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
953 ctx->tmp_dst.WriteMask = dst->WriteMask;
/* Counterpart of get_dst(): if a temp dst was substituted, copy it
 * back into the instruction's real dst.
 */
962 put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
963 struct tgsi_dst_register *dst)
965 compile_assert(ctx, ctx->using_tmp_dst);
966 ctx->using_tmp_dst = false;
968 /* if necessary, add mov back into original dst: */
969 if (dst != &inst->Dst[0].Register) {
970 create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
974 /* helper to generate the necessary repeat and/or additional instructions
975 * to turn a scalar instruction into a vector operation:
/* Variadic: nsrcs pairs of (struct tgsi_src_register *, unsigned flags)
 * follow 'nsrcs'.  IR3_REG_IMMED in flags means the "src" pointer is
 * actually an integer immediate (see the cast note below).  The scalar
 * template 'instr' is cloned once per enabled dst channel, with dst and
 * src components fixed up per channel.  Wrapped in an atomic group so
 * deferred output updates are only applied after all clones are emitted.
 */
978 vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
979 struct tgsi_dst_register *dst, int nsrcs, ...)
984 instr_atomic_start(ctx);
986 add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
989 for (j = 0; j < nsrcs; j++) {
990 struct tgsi_src_register *src =
991 va_arg(ap, struct tgsi_src_register *);
992 unsigned flags = va_arg(ap, unsigned);
993 struct ir3_register *reg;
994 if (flags & IR3_REG_IMMED) {
995 reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
996 /* this is an ugly cast.. should have put flags first! */
997 reg->iim_val = *(int *)&src;
999 reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
/* merge caller flags; NEGATE toggles rather than sets, so a negated
 * src combined with IR3_REG_NEGATE cancels out:
 */
1001 reg->flags |= flags & ~IR3_REG_NEGATE;
1002 if (flags & IR3_REG_NEGATE)
1003 reg->flags ^= IR3_REG_NEGATE;
1007 for (i = 0; i < 4; i++) {
1008 if (dst->WriteMask & (1 << i)) {
1009 struct ir3_instruction *cur;
1014 cur = instr_clone(ctx, instr);
1017 ssa_dst(ctx, cur, dst, i);
1019 /* fix-up dst register component: */
1020 cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
1022 /* fix-up src register component: */
1023 va_start(ap, nsrcs);
1024 for (j = 0; j < nsrcs; j++) {
1025 struct ir3_register *reg = cur->regs[j+1];
1026 struct tgsi_src_register *src =
1027 va_arg(ap, struct tgsi_src_register *);
1028 unsigned flags = va_arg(ap, unsigned);
1029 if (reg->flags & IR3_REG_SSA) {
1030 ssa_src(ctx, reg, src, src_swiz(src, i));
1031 } else if (!(flags & IR3_REG_IMMED)) {
1032 reg->num = regid(reg->num >> 2, src_swiz(src, i));
1039 instr_atomic_end(ctx);
1043 * Handlers for TGSI instructions which do not have a 1:1 mapping to
1044 * native instructions:
/* CLAMP dst, src0, src1, src2  ->  max.f + min.f (via create_clamp) */
1048 trans_clamp(const struct instr_translater *t,
1049 struct ir3_compile_context *ctx,
1050 struct tgsi_full_instruction *inst)
1052 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1053 struct tgsi_src_register *src0 = &inst->Src[0].Register;
1054 struct tgsi_src_register *src1 = &inst->Src[1].Register;
1055 struct tgsi_src_register *src2 = &inst->Src[2].Register;
1057 create_clamp(ctx, dst, src0, src1, src2);
1059 put_dst(ctx, inst, dst);
1062 /* ARL(x) = x, but mova from hrN.x to a0.. */
/* ARL/UARL: convert src to s16, pre-shift left by 2 (address units are
 * scalar components), then mova into the address register a0.
 */
1064 trans_arl(const struct instr_translater *t,
1065 struct ir3_compile_context *ctx,
1066 struct tgsi_full_instruction *inst)
1068 struct ir3_instruction *instr;
1069 struct tgsi_dst_register tmp_dst;
1070 struct tgsi_src_register *tmp_src;
1071 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1072 struct tgsi_src_register *src = &inst->Src[0].Register;
1073 unsigned chan = src->SwizzleX;
1075 compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
1077 /* NOTE: we allocate a temporary from a flat register
1078 * namespace (ignoring half vs full). It turns out
1079 * not to really matter since registers get reassigned
1080 * later in ir3_ra which (hopefully!) can deal a bit
1081 * better with mixed half and full precision.
1083 tmp_src = get_internal_temp(ctx, &tmp_dst);
1085 /* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
1086 instr = instr_create(ctx, 1, 0);
1087 instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
1088 get_ftype(ctx) : get_utype(ctx);
1089 instr->cat1.dst_type = TYPE_S16;
1090 add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
1091 add_src_reg(ctx, instr, src, chan);
1093 /* shl.b Rtmp, Rtmp, 2 */
1094 instr = instr_create(ctx, 2, OPC_SHL_B);
1095 add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
1096 add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
1097 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
/* mova a0, Rtmp (s16 -> s16 mov into ADDRESS file): */
1100 instr = instr_create(ctx, 1, 0);
1101 instr->cat1.src_type = TYPE_S16;
1102 instr->cat1.dst_type = TYPE_S16;
1103 add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
1104 add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
1108 * texture fetch/sample instructions:
1114 unsigned src_wrmask, flags;
/* per-target coordinate layout: dims / cube / array / shadow flags
 * (field names not all visible in this listing):
 */
1117 struct target_info {
1124 static const struct target_info tex_targets[] = {
1125 [TGSI_TEXTURE_1D] = { 1, 0, 0, 0 },
1126 [TGSI_TEXTURE_2D] = { 2, 0, 0, 0 },
1127 [TGSI_TEXTURE_3D] = { 3, 0, 0, 0 },
1128 [TGSI_TEXTURE_CUBE] = { 3, 1, 0, 0 },
1129 [TGSI_TEXTURE_RECT] = { 2, 0, 0, 0 },
1130 [TGSI_TEXTURE_SHADOW1D] = { 1, 0, 0, 1 },
1131 [TGSI_TEXTURE_SHADOW2D] = { 2, 0, 0, 1 },
1132 [TGSI_TEXTURE_SHADOWRECT] = { 2, 0, 0, 1 },
1133 [TGSI_TEXTURE_1D_ARRAY] = { 1, 0, 1, 0 },
1134 [TGSI_TEXTURE_2D_ARRAY] = { 2, 0, 1, 0 },
1135 [TGSI_TEXTURE_SHADOW1D_ARRAY] = { 1, 0, 1, 1 },
1136 [TGSI_TEXTURE_SHADOW2D_ARRAY] = { 2, 0, 1, 1 },
1137 [TGSI_TEXTURE_SHADOWCUBE] = { 3, 1, 0, 1 },
1138 [TGSI_TEXTURE_2D_MSAA] = { 2, 0, 0, 0 },
1139 [TGSI_TEXTURE_2D_ARRAY_MSAA] = { 2, 0, 1, 0 },
1140 [TGSI_TEXTURE_CUBE_ARRAY] = { 3, 1, 1, 0 },
1141 [TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
/* Compute the sam/tex instruction flags and the per-channel source
 * argument ordering (coords, then array index, then shadow ref /
 * projector) for the given TGSI texture instruction.
 */
1145 fill_tex_info(struct ir3_compile_context *ctx,
1146 struct tgsi_full_instruction *inst,
1147 struct tex_info *info)
1149 const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
1152 info->flags |= IR3_INSTR_3D;
1154 info->flags |= IR3_INSTR_A;
1156 info->flags |= IR3_INSTR_S;
1158 switch (inst->Instruction.Opcode) {
1159 case TGSI_OPCODE_TXB:
1160 case TGSI_OPCODE_TXB2:
1161 case TGSI_OPCODE_TXL:
1162 case TGSI_OPCODE_TXF:
1165 case TGSI_OPCODE_TXP:
1166 info->flags |= IR3_INSTR_P;
1168 case TGSI_OPCODE_TEX:
1169 case TGSI_OPCODE_TXD:
1175 * lay out the first argument in the proper order:
1176 * - actual coordinates first
1178 * - shadow reference
1181 * bias/lod go into the second arg
1184 for (arg = 0; arg < tgt->dims; arg++)
1185 info->order[arg] = pos++;
1187 info->order[pos++] = -1;
1189 info->order[pos++] = MAX2(arg + tgt->array, 2);
1191 info->order[pos++] = arg++;
1192 if (info->flags & IR3_INSTR_P)
1193 info->order[pos++] = 3;
1195 info->src_wrmask = (1 << pos) - 1;
/* mark unused trailing components: */
1197 for (; pos < 4; pos++)
1198 info->order[pos] = -1;
/* true if the src swizzle already matches the required arg order
 * (so no shuffling movs are needed):
 */
1203 static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
1206 for (i = 1; (i < 4) && order[i] >= 0; i++)
1207 if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
/* true for 1D (single coordinate) texture targets: */
1212 static bool is_1d(unsigned tex)
1214 return tex_targets[tex].dims == 1;
/* Return a src register holding the texture coordinate shuffled into
 * the order required by the hw (per tinf->order), copying into an
 * internal temp when the incoming swizzle doesn't already match.
 * 1D targets additionally get a synthesized .y coordinate.
 * NOTE(review): tail of this function is not visible in this listing.
 */
1217 static struct tgsi_src_register *
1218 get_tex_coord(struct ir3_compile_context *ctx,
1219 struct tgsi_full_instruction *inst,
1220 const struct tex_info *tinf)
1222 struct tgsi_src_register *coord = &inst->Src[0].Register;
1223 struct ir3_instruction *instr;
1224 unsigned tex = inst->Texture.Texture;
1225 struct tgsi_dst_register tmp_dst;
1226 struct tgsi_src_register *tmp_src;
1227 type_t type_mov = get_ftype(ctx);
1230 /* need to move things around: */
1231 tmp_src = get_internal_temp(ctx, &tmp_dst);
1233 for (j = 0; j < 4; j++) {
1234 if (tinf->order[j] < 0)
1236 instr = instr_create(ctx, 1, 0); /* mov */
1237 instr->cat1.src_type = type_mov;
1238 instr->cat1.dst_type = type_mov;
1239 add_dst_reg(ctx, instr, &tmp_dst, j);
1240 add_src_reg(ctx, instr, coord,
1241 src_swiz(coord, tinf->order[j]));
1244 /* fix up .y coord: */
1246 struct ir3_register *imm;
1247 instr = instr_create(ctx, 1, 0); /* mov */
1248 instr->cat1.src_type = type_mov;
1249 instr->cat1.dst_type = type_mov;
1250 add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */
1251 imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
/* TXF (texel fetch) wants an integer 0 .y; sampling uses 0.5 --
 * value assignments not visible in this listing:
 */
1252 if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
/*
 * Translate TGSI texture-sample opcodes (TEX/TXP/TXB/TXB2/TXL/TXD/TXF,
 * selected via t->opc) into a single cat5 sample instruction.  The
 * coordinate (and, for TXD, the derivatives) are gathered into a
 * meta:fi "collect" fan-in feeding src0; offsets and lod/bias go into
 * a second fan-in for src1.
 * NOTE(review): the visible text is missing some lines (breaks/braces);
 * comments describe the visible intent only.
 */
1262 trans_samp(const struct instr_translater *t,
1263 struct ir3_compile_context *ctx,
1264 struct tgsi_full_instruction *inst)
1266 struct ir3_instruction *instr, *collect;
1267 struct ir3_register *reg;
1268 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1269 struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
1270 struct tgsi_src_register zero;
1271 const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
1272 struct tex_info tinf;
1275 memset(&tinf, 0, sizeof(tinf));
1276 fill_tex_info(ctx, inst, &tinf);
1277 coord = get_tex_coord(ctx, inst, &tinf);
1278 get_immediate(ctx, &zero, 0);
/* pick apart the per-opcode source operand layout (which src holds the
 * lod/bias "orig" arg, the sampler, and for TXD the derivatives):
 */
1280 switch (inst->Instruction.Opcode) {
1281 case TGSI_OPCODE_TXB2:
1282 orig = &inst->Src[1].Register;
1283 samp = &inst->Src[2].Register;
1285 case TGSI_OPCODE_TXD:
1286 orig = &inst->Src[0].Register;
1287 dpdx = &inst->Src[1].Register;
1288 dpdy = &inst->Src[2].Register;
1289 samp = &inst->Src[3].Register;
/* derivatives cannot be relative/const srcs, copy to a temp if needed: */
1290 if (is_rel_or_const(dpdx))
1291 dpdx = get_unconst(ctx, dpdx);
1292 if (is_rel_or_const(dpdy))
1293 dpdy = get_unconst(ctx, dpdy);
1296 orig = &inst->Src[0].Register;
1297 samp = &inst->Src[1].Register;
1300 if (tinf.args > 1 && is_rel_or_const(orig))
1301 orig = get_unconst(ctx, orig);
1303 /* scale up integer coords for TXF based on the LOD */
1304 if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
1305 struct tgsi_dst_register tmp_dst;
1306 struct tgsi_src_register *tmp_src;
1307 type_t type_mov = get_utype(ctx);
1309 tmp_src = get_internal_temp(ctx, &tmp_dst);
1310 for (i = 0; i < tgt->dims; i++) {
/* tmp.i = coord.i << lod  (lod taken from orig's .w component) */
1311 instr = instr_create(ctx, 2, OPC_SHL_B);
1312 add_dst_reg(ctx, instr, &tmp_dst, i);
1313 add_src_reg(ctx, instr, coord, src_swiz(coord, i));
1314 add_src_reg(ctx, instr, orig, orig->SwizzleW);
1316 if (tgt->dims < 2) {
/* pad 1D coord out: mov zero into the unused component */
1317 instr = instr_create(ctx, 1, 0);
1318 instr->cat1.src_type = type_mov;
1319 instr->cat1.dst_type = type_mov;
1320 add_dst_reg(ctx, instr, &tmp_dst, i);
1321 add_src_reg(ctx, instr, &zero, 0);
/* copy through the remaining (array-index/lod) coord components: */
1325 instr = instr_create(ctx, 1, 0);
1326 instr->cat1.src_type = type_mov;
1327 instr->cat1.dst_type = type_mov;
1328 add_dst_reg(ctx, instr, &tmp_dst, i);
1329 add_src_reg(ctx, instr, coord, src_swiz(coord, i));
/* texel offsets: build a src register from the TexOffsets info and
 * force it into a GPR (get_unconst), marking the instr with the O flag:
 */
1334 if (inst->Texture.NumOffsets) {
1335 struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
1336 struct tgsi_src_register offset_src = {0};
1338 offset_src.File = tex_offset->File;
1339 offset_src.Index = tex_offset->Index;
1340 offset_src.SwizzleX = tex_offset->SwizzleX;
1341 offset_src.SwizzleY = tex_offset->SwizzleY;
1342 offset_src.SwizzleZ = tex_offset->SwizzleZ;
1343 offset = get_unconst(ctx, &offset_src);
1344 tinf.flags |= IR3_INSTR_O;
/* the cat5 sample instruction itself: */
1347 instr = instr_create(ctx, 5, t->opc);
1348 instr->cat5.type = get_ftype(ctx);
1349 instr->cat5.samp = samp->Index;
1350 instr->cat5.tex = samp->Index;
1351 instr->flags |= tinf.flags;
1353 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
/* first src: fan-in of the coordinate components (holes padded): */
1355 reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
1357 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
1358 ir3_reg_create(collect, 0, 0);
1359 for (i = 0; i < 4; i++)
1360 if (tinf.src_wrmask & (1 << i))
1361 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
1362 coord, src_swiz(coord, i));
1363 else if (tinf.src_wrmask & ~((1 << i) - 1))
1364 ir3_reg_create(collect, 0, 0);
1366 /* Attach derivatives onto the end of the fan-in. Derivatives start after
1367 * the 4th argument, so make sure that fi is padded up to 4 first.
1369 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
1370 while (collect->regs_count < 5)
1371 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
1372 for (i = 0; i < tgt->dims; i++)
1373 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
1375 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
1376 for (i = 0; i < tgt->dims; i++)
1377 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
1379 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
/* extend wrmask to cover the derivative slots just appended: */
1380 tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
1383 reg->instr = collect;
1384 reg->wrmask = tinf.src_wrmask;
1386 /* The second argument contains the offsets, followed by the lod/bias
1387 * argument. This is constructed more manually due to the dynamic nature.
1389 if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
1392 reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
1394 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
1395 ir3_reg_create(collect, 0, 0);
1397 if (inst->Texture.NumOffsets) {
1398 for (i = 0; i < tgt->dims; i++)
1399 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
1402 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
/* TXB2 takes the bias from orig.x; other multi-arg opcodes from orig.w: */
1404 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
1405 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
1406 orig, orig->SwizzleX);
1407 else if (tinf.args > 1)
1408 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
1409 orig, orig->SwizzleW);
1411 reg->instr = collect;
1412 reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
/*
 * TXQ: query texture dimensions.  Emits a cat5 getsize instruction with
 * the requested mip level as its single source.
 */
1416 trans_txq(const struct instr_translater *t,
1417 struct ir3_compile_context *ctx,
1418 struct tgsi_full_instruction *inst)
1420 struct ir3_instruction *instr;
1421 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1422 struct tgsi_src_register *level = &inst->Src[0].Register;
1423 struct tgsi_src_register *samp = &inst->Src[1].Register;
1424 struct tex_info tinf;
1426 memset(&tinf, 0, sizeof(tinf));
1427 fill_tex_info(ctx, inst, &tinf);
/* getsize can't take a relative/const src directly; copy to a GPR: */
1428 if (is_rel_or_const(level))
1429 level = get_unconst(ctx, level);
1431 instr = instr_create(ctx, 5, OPC_GETSIZE);
/* result is integer (width/height/etc), not float: */
1432 instr->cat5.type = get_utype(ctx);
1433 instr->cat5.samp = samp->Index;
1434 instr->cat5.tex = samp->Index;
1435 instr->flags |= tinf.flags;
1437 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
/* single scalar src: the lod (x component only, wrmask 0x1): */
1438 add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
/*
 * DDX/DDY: screen-space derivatives (cat5, opcode from t->opc).
 * The src must be un-swizzled first (derivatives operate on raw
 * components), and the op is split into two 2-component halves.
 */
1443 trans_deriv(const struct instr_translater *t,
1444 struct ir3_compile_context *ctx,
1445 struct tgsi_full_instruction *inst)
1447 struct ir3_instruction *instr;
1448 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1449 struct tgsi_src_register *src = &inst->Src[0].Register;
1450 static const int8_t order[4] = {0, 1, 2, 3};
/* if src has any non-identity swizzle, resolve it through a temp mov: */
1452 if (!check_swiz(src, order)) {
1453 struct tgsi_dst_register tmp_dst;
1454 struct tgsi_src_register *tmp_src;
1456 tmp_src = get_internal_temp(ctx, &tmp_dst);
1457 create_mov(ctx, &tmp_dst, src);
1462 /* This might be a workaround for hw bug? Blob compiler always
1463 * seems to work two components at a time for dsy/dsx. It does
1464 * actually seem to work in some cases (or at least some piglit
1465 * tests) for four components at a time. But seems more reliable
1466 * to split this into two instructions like the blob compiler
/* first half: components .xy (WriteMask & 0x3): */
1470 instr = instr_create(ctx, 5, t->opc);
1471 instr->cat5.type = get_ftype(ctx);
1472 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
1473 add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);
/* second half: components .zw (WriteMask >> 2): */
1475 instr = instr_create(ctx, 5, t->opc);
1476 instr->cat5.type = get_ftype(ctx);
1477 add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
1478 add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
1482 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
1483 * cmps.f.eq tmp0, a, b
1484 * cov.u16f16 dst, tmp0
1486 * SNE(a,b) = (a != b) ? 1.0 : 0.0
1487 * cmps.f.ne tmp0, a, b
1488 * cov.u16f16 dst, tmp0
1490 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
1491 * cmps.f.ge tmp0, a, b
1492 * cov.u16f16 dst, tmp0
1494 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
1495 * cmps.f.le tmp0, a, b
1496 * cov.u16f16 dst, tmp0
1498 * SGT(a,b) = (a > b) ? 1.0 : 0.0
1499 * cmps.f.gt tmp0, a, b
1500 * cov.u16f16 dst, tmp0
1502 * SLT(a,b) = (a < b) ? 1.0 : 0.0
1503 * cmps.f.lt tmp0, a, b
1504 * cov.u16f16 dst, tmp0
1506 * CMP(a,b,c) = (a < 0.0) ? b : c
1507 * cmps.f.lt tmp0, a, {0.0}
1508 * sel.b16 dst, b, tmp0, c
/*
 * Float comparison opcodes (SEQ/SNE/SGE/SLE/SGT/SLT, their FS* integer-
 * result variants, and CMP).  Lowered to cmps.f.<cond> into a temp,
 * then either converted to float 0.0/1.0 (S*), negated to 0/~0 (FS*),
 * or used as the selector of a sel.b (CMP).  See the comment block
 * above for the per-opcode expansion.
 */
1511 trans_cmp(const struct instr_translater *t,
1512 struct ir3_compile_context *ctx,
1513 struct tgsi_full_instruction *inst)
1515 struct ir3_instruction *instr;
1516 struct tgsi_dst_register tmp_dst;
1517 struct tgsi_src_register *tmp_src;
1518 struct tgsi_src_register constval0;
1519 /* final instruction for CMP() uses orig src1 and src2: */
1520 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1521 struct tgsi_src_register *a0, *a1, *a2;
1524 tmp_src = get_internal_temp(ctx, &tmp_dst);
1526 a0 = &inst->Src[0].Register; /* a */
1527 a1 = &inst->Src[1].Register; /* b */
/* map the TGSI opcode to the cmps condition code: */
1529 switch (t->tgsi_opc) {
1530 case TGSI_OPCODE_SEQ:
1531 case TGSI_OPCODE_FSEQ:
1532 condition = IR3_COND_EQ;
1534 case TGSI_OPCODE_SNE:
1535 case TGSI_OPCODE_FSNE:
1536 condition = IR3_COND_NE;
1538 case TGSI_OPCODE_SGE:
1539 case TGSI_OPCODE_FSGE:
1540 condition = IR3_COND_GE;
1542 case TGSI_OPCODE_SLT:
1543 case TGSI_OPCODE_FSLT:
1544 condition = IR3_COND_LT;
1546 case TGSI_OPCODE_SLE:
1547 condition = IR3_COND_LE;
1549 case TGSI_OPCODE_SGT:
1550 condition = IR3_COND_GT;
1552 case TGSI_OPCODE_CMP:
/* CMP(a,b,c) compares a against 0.0: */
1553 get_immediate(ctx, &constval0, fui(0.0));
1554 a0 = &inst->Src[0].Register; /* a */
1555 a1 = &constval0; /* {0.0} */
1556 condition = IR3_COND_LT;
1559 compile_assert(ctx, 0);
/* hw can't take const for both srcs; move one to a GPR: */
1563 if (is_const(a0) && is_const(a1))
1564 a0 = get_unconst(ctx, a0);
1566 /* cmps.f.<cond> tmp, a0, a1 */
1567 instr = instr_create(ctx, 2, OPC_CMPS_F);
1568 instr->cat2.condition = condition;
1569 vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
/* second stage depends on the required result encoding: */
1571 switch (t->tgsi_opc) {
1572 case TGSI_OPCODE_SEQ:
1573 case TGSI_OPCODE_SGE:
1574 case TGSI_OPCODE_SLE:
1575 case TGSI_OPCODE_SNE:
1576 case TGSI_OPCODE_SGT:
1577 case TGSI_OPCODE_SLT:
1578 /* cov.u16f16 dst, tmp0 */
1579 instr = instr_create(ctx, 1, 0);
1580 instr->cat1.src_type = get_utype(ctx);
1581 instr->cat1.dst_type = get_ftype(ctx);
1582 vectorize(ctx, instr, dst, 1, tmp_src, 0);
1584 case TGSI_OPCODE_FSEQ:
1585 case TGSI_OPCODE_FSGE:
1586 case TGSI_OPCODE_FSNE:
1587 case TGSI_OPCODE_FSLT:
1588 /* absneg.s dst, (neg)tmp0 */
1589 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
1590 vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
1592 case TGSI_OPCODE_CMP:
1593 a1 = &inst->Src[1].Register;
1594 a2 = &inst->Src[2].Register;
1595 /* sel.{b32,b16} dst, src2, tmp, src1 */
1596 instr = instr_create(ctx, 3, OPC_SEL_B32);
1597 vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
1602 put_dst(ctx, inst, dst);
1606 * USNE(a,b) = (a != b) ? ~0 : 0
1607 * cmps.u32.ne dst, a, b
1609 * USEQ(a,b) = (a == b) ? ~0 : 0
1610 * cmps.u32.eq dst, a, b
1612 * ISGE(a,b) = (a > b) ? ~0 : 0
1613 * cmps.s32.ge dst, a, b
1615 * USGE(a,b) = (a > b) ? ~0 : 0
1616 * cmps.u32.ge dst, a, b
1618 * ISLT(a,b) = (a < b) ? ~0 : 0
1619 * cmps.s32.lt dst, a, b
1621 * USLT(a,b) = (a < b) ? ~0 : 0
1622 * cmps.u32.lt dst, a, b
/*
 * Integer comparison opcodes (USNE/USEQ/ISGE/USGE/ISLT/USLT): emit
 * cmps.{u32,s32}.<cond> (opcode from t->opc) into a temp producing 0/1,
 * then negate to the TGSI-required 0/~0 with absneg.s.  See the
 * comment block above for the per-opcode expansion.
 */
1626 trans_icmp(const struct instr_translater *t,
1627 struct ir3_compile_context *ctx,
1628 struct tgsi_full_instruction *inst)
1630 struct ir3_instruction *instr;
1631 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1632 struct tgsi_dst_register tmp_dst;
1633 struct tgsi_src_register *tmp_src;
1634 struct tgsi_src_register *a0, *a1;
1637 a0 = &inst->Src[0].Register; /* a */
1638 a1 = &inst->Src[1].Register; /* b */
1640 switch (t->tgsi_opc) {
1641 case TGSI_OPCODE_USNE:
1642 condition = IR3_COND_NE;
1644 case TGSI_OPCODE_USEQ:
1645 condition = IR3_COND_EQ;
1647 case TGSI_OPCODE_ISGE:
1648 case TGSI_OPCODE_USGE:
1649 condition = IR3_COND_GE;
1651 case TGSI_OPCODE_ISLT:
1652 case TGSI_OPCODE_USLT:
1653 condition = IR3_COND_LT;
1657 compile_assert(ctx, 0);
/* hw can't take const for both srcs; move one to a GPR: */
1661 if (is_const(a0) && is_const(a1))
1662 a0 = get_unconst(ctx, a0);
1664 tmp_src = get_internal_temp(ctx, &tmp_dst);
1665 /* cmps.{u32,s32}.<cond> tmp, a0, a1 */
1666 instr = instr_create(ctx, 2, t->opc);
1667 instr->cat2.condition = condition;
1668 vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
1670 /* absneg.s dst, (neg)tmp */
1671 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
1672 vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
1674 put_dst(ctx, inst, dst);
1678 * UCMP(a,b,c) = a ? b : c
1679 * sel.b16 dst, b, a, c
/*
 * UCMP(a,b,c) = a ? b : c — lowered to a single sel.b with the integer
 * condition a as the selector (middle operand).
 */
1682 trans_ucmp(const struct instr_translater *t,
1683 struct ir3_compile_context *ctx,
1684 struct tgsi_full_instruction *inst)
1686 struct ir3_instruction *instr;
1687 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1688 struct tgsi_src_register *a0, *a1, *a2;
1690 a0 = &inst->Src[0].Register; /* a */
1691 a1 = &inst->Src[1].Register; /* b */
1692 a2 = &inst->Src[2].Register; /* c */
/* the selector src can't be relative/const; copy to a GPR if needed: */
1694 if (is_rel_or_const(a0))
1695 a0 = get_unconst(ctx, a0);
1697 /* sel.{b32,b16} dst, b, a, c */
1698 instr = instr_create(ctx, 3, OPC_SEL_B32);
1699 vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
1700 put_dst(ctx, inst, dst);
1704 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
1705 * cmps.s.lt tmp_neg, a, 0 # 1 if a is negative
1706 * cmps.s.gt tmp_pos, a, 0 # 1 if a is positive
1707 * sub.u dst, tmp_pos, tmp_neg
/*
 * ISSG(a): integer sign — compute (a > 0) and (a < 0) as 0/1 flags,
 * then subtract to yield -1 / 0 / +1 (see expansion comment above).
 */
1710 trans_issg(const struct instr_translater *t,
1711 struct ir3_compile_context *ctx,
1712 struct tgsi_full_instruction *inst)
1714 struct ir3_instruction *instr;
1715 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1716 struct tgsi_src_register *a = &inst->Src[0].Register;
1717 struct tgsi_dst_register neg_dst, pos_dst;
1718 struct tgsi_src_register *neg_src, *pos_src;
1720 neg_src = get_internal_temp(ctx, &neg_dst);
1721 pos_src = get_internal_temp(ctx, &pos_dst);
1723 /* cmps.s.lt neg, a, 0 */
1724 instr = instr_create(ctx, 2, OPC_CMPS_S);
1725 instr->cat2.condition = IR3_COND_LT;
/* second operand is immediate 0 (src pointer 0 + IR3_REG_IMMED flag): */
1726 vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);
1728 /* cmps.s.gt pos, a, 0 */
1729 instr = instr_create(ctx, 2, OPC_CMPS_S);
1730 instr->cat2.condition = IR3_COND_GT;
1731 vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);
1733 /* sub.u dst, pos, neg */
1734 instr = instr_create(ctx, 2, OPC_SUB_U);
1735 vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);
1737 put_dst(ctx, inst, dst);
1743 * Conditional / Flow control
/*
 * Push a branch (meta:flow instr + its condition) onto the compile-time
 * branch stack; 'inv' marks the else (inverted) side of the branch.
 */
1747 push_branch(struct ir3_compile_context *ctx, bool inv,
1748 struct ir3_instruction *instr, struct ir3_instruction *cond)
1750 unsigned int idx = ctx->branch_count++;
/* fixed-size stack; deeper nesting than ARRAY_SIZE(ctx->branch) is a bug: */
1751 compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
1752 ctx->branch[idx].instr = instr;
1753 ctx->branch[idx].inv = inv;
1754 /* else side of branch has same condition: */
1756 ctx->branch[idx].cond = cond;
/* Pop the innermost branch off the stack, returning its meta:flow instr. */
1759 static struct ir3_instruction *
1760 pop_branch(struct ir3_compile_context *ctx)
1762 unsigned int idx = --ctx->branch_count;
1763 return ctx->branch[idx].instr;
/*
 * IF / UIF: compare the condition src against zero (float or unsigned
 * cmps per t->opc), emit a meta:flow instruction, push it on the branch
 * stack, and open a new block for the if-side.
 */
1767 trans_if(const struct instr_translater *t,
1768 struct ir3_compile_context *ctx,
1769 struct tgsi_full_instruction *inst)
1771 struct ir3_instruction *instr, *cond;
1772 struct tgsi_src_register *src = &inst->Src[0].Register;
1773 struct tgsi_dst_register tmp_dst;
1774 struct tgsi_src_register *tmp_src;
1775 struct tgsi_src_register constval;
1777 get_immediate(ctx, &constval, fui(0.0));
1778 tmp_src = get_internal_temp(ctx, &tmp_dst);
/* force condition into a GPR so its producing instr is visible as SSA: */
1781 src = get_unconst(ctx, src);
1783 /* cmps.{f,u}.ne tmp0, b, {0.0} */
1784 instr = instr_create(ctx, 2, t->opc);
1785 add_dst_reg(ctx, instr, &tmp_dst, 0);
1786 add_src_reg(ctx, instr, src, src->SwizzleX);
1787 add_src_reg(ctx, instr, &constval, constval.SwizzleX);
1788 instr->cat2.condition = IR3_COND_NE;
1790 compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
/* remember the SSA producer of the condition for later kill/phi use: */
1791 cond = instr->regs[1]->instr;
1793 /* meta:flow tmp0 */
1794 instr = instr_create(ctx, -1, OPC_META_FLOW);
1795 ir3_reg_create(instr, 0, 0); /* dummy dst */
1796 add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
1798 push_branch(ctx, false, instr, cond);
1799 instr->flow.if_block = push_block(ctx);
/*
 * ELSE: pop the matching IF's meta:flow instr, re-push it with inv=true
 * (same condition, inverted side), and open the else block.
 */
1803 trans_else(const struct instr_translater *t,
1804 struct ir3_compile_context *ctx,
1805 struct tgsi_full_instruction *inst)
1807 struct ir3_instruction *instr;
1811 instr = pop_branch(ctx);
/* sanity: top of branch stack must be a meta:flow from trans_if(): */
1813 compile_assert(ctx, (instr->category == -1) &&
1814 (instr->opc == OPC_META_FLOW));
1816 push_branch(ctx, true, instr, NULL);
1817 instr->flow.else_block = push_block(ctx);
/* Walk up the block hierarchy to find the closest write of temporary n. */
1820 static struct ir3_instruction *
1821 find_temporary(struct ir3_block *block, unsigned n)
1823 if (block->parent && !block->temporaries[n])
1824 return find_temporary(block->parent, n);
1825 return block->temporaries[n];
/* Walk up the block hierarchy to find the closest write of output n. */
1828 static struct ir3_instruction *
1829 find_output(struct ir3_block *block, unsigned n)
1831 if (block->parent && !block->outputs[n])
1832 return find_output(block->parent, n)
1833 return block->outputs[n];
/*
 * Create a meta:phi selecting between 'a' (if-side value) and 'b'
 * (else-side value) based on 'cond'.  A NULL side is replaced with
 * imm{0.0} for well-defined undefined behavior (see comment below).
 */
1836 static struct ir3_instruction *
1837 create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
1838 struct ir3_instruction *a, struct ir3_instruction *b)
1840 struct ir3_instruction *phi;
1842 compile_assert(ctx, cond);
1844 /* Either side of the condition could be null.. which
1845 * indicates a variable written on only one side of the
1846 * branch. Normally this should only be variables not
1847 * used outside of that side of the branch. So we could
1848 * just 'return a ? a : b;' in that case. But for better
1849 * defined undefined behavior we just stick in imm{0.0}.
1850 * In the common case of a value only used within the
1851 * one side of the branch, the PHI instruction will not
1855 a = create_immed(ctx, 0.0);
1857 b = create_immed(ctx, 0.0);
1859 phi = instr_create(ctx, -1, OPC_META_PHI);
1860 ir3_reg_create(phi, 0, 0); /* dummy dst */
/* phi srcs: condition, then the two candidate values: */
1861 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
1862 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
1863 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
/*
 * ENDIF: close the innermost branch.  Pops the meta:flow instr, then
 * for every temporary and shader output written in either arm, creates
 * a PHI merging the two sides into the parent block.  Values from each
 * arm are also re-registered as that block's "outputs" so later passes
 * can see them.  If there was no else arm, the parent block supplies
 * the branch-not-taken values.
 */
1869 trans_endif(const struct instr_translater *t,
1870 struct ir3_compile_context *ctx,
1871 struct tgsi_full_instruction *inst)
1873 struct ir3_instruction *instr;
1874 struct ir3_block *ifb, *elseb;
1875 struct ir3_instruction **ifout, **elseout;
1876 unsigned i, ifnout = 0, elsenout = 0;
1880 instr = pop_branch(ctx);
1882 compile_assert(ctx, (instr->category == -1) &&
1883 (instr->opc == OPC_META_FLOW));
1885 ifb = instr->flow.if_block;
1886 elseb = instr->flow.else_block;
1887 /* if there is no else block, the parent block is used for the
1888 * branch-not-taken src of the PHI instructions:
1891 elseb = ifb->parent;
1893 /* worst case sizes: */
1894 ifnout = ifb->ntemporaries + ifb->noutputs;
1895 elsenout = elseb->ntemporaries + elseb->noutputs;
1897 ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
1898 if (elseb != ifb->parent)
1899 elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
1904 /* generate PHI instructions for any temporaries written: */
1905 for (i = 0; i < ifb->ntemporaries; i++) {
1906 struct ir3_instruction *a = ifb->temporaries[i];
1907 struct ir3_instruction *b = elseb->temporaries[i];
1909 /* if temporary written in if-block, or if else block
1910 * is present and temporary written in else-block:
1912 if (a || ((elseb != ifb->parent) && b)) {
1913 struct ir3_instruction *phi;
1915 /* if only written on one side, find the closest
1916 * enclosing update on other side:
1919 a = find_temporary(ifb, i);
1921 b = find_temporary(elseb, i);
/* promote each side's value to a block output so it survives the block: */
1924 a = create_output(ifb, a, ifnout++);
1926 if (elseb != ifb->parent) {
1927 elseout[elsenout] = b;
1928 b = create_output(elseb, b, elsenout++);
1931 phi = create_phi(ctx, instr, a, b);
1932 ctx->block->temporaries[i] = phi;
/* both arms must agree on the output count for index-matched merging: */
1936 compile_assert(ctx, ifb->noutputs == elseb->noutputs);
1938 /* .. and any outputs written: */
1939 for (i = 0; i < ifb->noutputs; i++) {
1940 struct ir3_instruction *a = ifb->outputs[i];
1941 struct ir3_instruction *b = elseb->outputs[i];
1943 /* if output written in if-block, or if else block
1944 * is present and output written in else-block:
1946 if (a || ((elseb != ifb->parent) && b)) {
1947 struct ir3_instruction *phi;
1949 /* if only written on one side, find the closest
1950 * enclosing update on other side:
1953 a = find_output(ifb, i);
1955 b = find_output(elseb, i);
1958 a = create_output(ifb, a, ifnout++);
1960 if (elseb != ifb->parent) {
1961 elseout[elsenout] = b;
1962 b = create_output(elseb, b, elsenout++);
1965 phi = create_phi(ctx, instr, a, b);
1966 ctx->block->outputs[i] = phi;
/* install the (possibly grown) output arrays back on the arm blocks: */
1970 ifb->noutputs = ifnout;
1971 ifb->outputs = ifout;
1973 if (elseb != ifb->parent) {
1974 elseb->noutputs = elsenout;
1975 elseb->outputs = elseout;
1978 // TODO maybe we want to compact block->inputs?
/*
 * KILL: unconditional fragment discard.  Since kill is predicated on
 * p0.x in hw, synthesize the predicate from the enclosing branch's
 * condition (or a constant-true immediate at top level).
 */
1986 trans_kill(const struct instr_translater *t,
1987 struct ir3_compile_context *ctx,
1988 struct tgsi_full_instruction *inst)
1990 struct ir3_instruction *instr, *immed, *cond = NULL;
1993 switch (t->tgsi_opc) {
1994 case TGSI_OPCODE_KILL:
1995 /* unconditional kill, use enclosing if condition: */
1996 if (ctx->branch_count > 0) {
1997 unsigned int idx = ctx->branch_count - 1;
1998 cond = ctx->branch[idx].cond;
1999 inv = ctx->branch[idx].inv;
/* not inside any if: predicate on constant 1.0 (always kill): */
2001 cond = create_immed(ctx, 1.0);
2007 compile_assert(ctx, cond);
2009 immed = create_immed(ctx, 0.0);
2011 /* cmps.f.ne p0.x, cond, {0.0} */
2012 instr = instr_create(ctx, 2, OPC_CMPS_F);
2013 instr->cat2.condition = IR3_COND_NE;
2014 ir3_reg_create(instr, regid(REG_P0, 0), 0);
2015 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2016 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
/* the kill itself, inverted if we are on the else side of the branch: */
2020 instr = instr_create(ctx, 0, OPC_KILL);
2021 instr->cat0.inv = inv;
2022 ir3_reg_create(instr, 0, 0); /* dummy dst */
2023 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
/* track kills so later passes can keep them correctly ordered: */
2025 ctx->kill[ctx->kill_count++] = instr;
2027 ctx->so->has_kill = true;
/*
 * KILL_IF: conditional fragment discard.  Compares the src operand
 * against 0.0 into predicate p0.x, then emits the predicated kill.
 */
2035 trans_killif(const struct instr_translater *t,
2036 struct ir3_compile_context *ctx,
2037 struct tgsi_full_instruction *inst)
2039 struct tgsi_src_register *src = &inst->Src[0].Register;
2040 struct ir3_instruction *instr, *immed, *cond = NULL;
2043 immed = create_immed(ctx, 0.0);
2045 /* cmps.f.ne p0.x, cond, {0.0} */
2046 instr = instr_create(ctx, 2, OPC_CMPS_F);
2047 instr->cat2.condition = IR3_COND_NE;
2048 ir3_reg_create(instr, regid(REG_P0, 0), 0);
2049 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2050 add_src_reg(ctx, instr, src, src->SwizzleX);
2055 instr = instr_create(ctx, 0, OPC_KILL);
2056 instr->cat0.inv = inv;
2057 ir3_reg_create(instr, 0, 0); /* dummy dst */
2058 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
/* track kills so later passes can keep them correctly ordered: */
2060 ctx->kill[ctx->kill_count++] = instr;
2062 ctx->so->has_kill = true;
2066 * I2F / U2F / F2I / F2U
/*
 * I2F / U2F / F2I / F2U: type conversion, lowered to a single cat1
 * mov (cov) with src/dst types chosen per TGSI opcode.
 */
2070 trans_cov(const struct instr_translater *t,
2071 struct ir3_compile_context *ctx,
2072 struct tgsi_full_instruction *inst)
2074 struct ir3_instruction *instr;
2075 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2076 struct tgsi_src_register *src = &inst->Src[0].Register;
2078 // cov.<src_type><dst_type> dst, src  (types set per-opcode below)
2079 instr = instr_create(ctx, 1, 0);
2080 switch (t->tgsi_opc) {
2081 case TGSI_OPCODE_U2F:
2082 instr->cat1.src_type = TYPE_U32;
2083 instr->cat1.dst_type = TYPE_F32;
2085 case TGSI_OPCODE_I2F:
2086 instr->cat1.src_type = TYPE_S32;
2087 instr->cat1.dst_type = TYPE_F32;
2089 case TGSI_OPCODE_F2U:
2090 instr->cat1.src_type = TYPE_F32;
2091 instr->cat1.dst_type = TYPE_U32;
2093 case TGSI_OPCODE_F2I:
2094 instr->cat1.src_type = TYPE_F32;
2095 instr->cat1.dst_type = TYPE_S32;
2099 vectorize(ctx, instr, dst, 1, src, 0);
2100 put_dst(ctx, inst, dst);
2106 * There is no 32-bit multiply instruction, so splitting a and b into high and
2107 * low components, we get that
2109 * dst = al * bl + ah * bl << 16 + al * bh << 16
2111 * mull.u tmp0, a, b (mul low, i.e. al * bl)
2112 * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
2113 * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
2115 * For UMAD, replace first mull.u with mad.u16.
/*
 * UMUL / UMAD: 32-bit unsigned multiply (optionally plus addend).
 * No native 32-bit multiply exists, so build it from 16-bit halves:
 * mull.u then two madsh.m16 steps (see expansion comment above).
 */
2118 trans_umul(const struct instr_translater *t,
2119 struct ir3_compile_context *ctx,
2120 struct tgsi_full_instruction *inst)
2122 struct ir3_instruction *instr;
2123 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2124 struct tgsi_src_register *a = &inst->Src[0].Register;
2125 struct tgsi_src_register *b = &inst->Src[1].Register;
2127 struct tgsi_dst_register tmp0_dst, tmp1_dst;
2128 struct tgsi_src_register *tmp0_src, *tmp1_src;
2130 tmp0_src = get_internal_temp(ctx, &tmp0_dst);
2131 tmp1_src = get_internal_temp(ctx, &tmp1_dst);
/* both operands are used in multiple cat2/cat3 positions, so they
 * must live in GPRs:
 */
2133 if (is_rel_or_const(a))
2134 a = get_unconst(ctx, a);
2135 if (is_rel_or_const(b))
2136 b = get_unconst(ctx, b);
2138 if (t->tgsi_opc == TGSI_OPCODE_UMUL) {
2139 /* mull.u tmp0, a, b */
2140 instr = instr_create(ctx, 2, OPC_MULL_U);
2141 vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
/* UMAD: fold the addend c into the low-product step: */
2143 struct tgsi_src_register *c = &inst->Src[2].Register;
2145 /* mad.u16 tmp0, a, b, c */
2146 instr = instr_create(ctx, 3, OPC_MAD_U16);
2147 vectorize(ctx, instr, &tmp0_dst, 3, a, 0, b, 0, c, 0);
2150 /* madsh.m16 tmp1, a, b, tmp0 */
2151 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2152 vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
2154 /* madsh.m16 dst, b, a, tmp1 */
2155 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2156 vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
2157 put_dst(ctx, inst, dst);
2161 * IDIV / UDIV / MOD / UMOD
2163 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
2164 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
/*
 * IDIV / UDIV / MOD / UMOD: integer division via float reciprocal plus
 * fix-up (ported from NV50LegalizeSSA::handleDIV, see comment above).
 * Outline: convert to float, rcp, one Newton-style correction round
 * done in integer space, then a final remainder-compare fix-up.  For
 * signed opcodes the math runs on absolute values and the sign is
 * re-applied at the end; MOD/UMOD additionally compute a - q*b.
 */
2167 trans_idiv(const struct instr_translater *t,
2168 struct ir3_compile_context *ctx,
2169 struct tgsi_full_instruction *inst)
2171 struct ir3_instruction *instr;
2172 struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
2173 struct tgsi_src_register *a = &inst->Src[0].Register;
2174 struct tgsi_src_register *b = &inst->Src[1].Register;
2176 struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
2177 struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;
2179 struct tgsi_src_register negative_2, thirty_one;
/* signed vs unsigned flavor, per opcode: */
2182 if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
2183 src_type = get_stype(ctx);
2185 src_type = get_utype(ctx);
/* temps: af/bf = float copies, q = quotient, r = remainder/scratch,
 * a/b = integer (abs) copies of the operands:
 */
2187 af_src = get_internal_temp(ctx, &af_dst);
2188 bf_src = get_internal_temp(ctx, &bf_dst);
2189 q_src = get_internal_temp(ctx, &q_dst);
2190 r_src = get_internal_temp(ctx, &r_dst);
2191 a_src = get_internal_temp(ctx, &a_dst);
2192 b_src = get_internal_temp(ctx, &b_dst);
2194 get_immediate(ctx, &negative_2, -2);
2195 get_immediate(ctx, &thirty_one, 31);
/* for MOD/UMOD the division result stays in q; dst gets a - q*b later: */
2197 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
2198 premod_dst = &q_dst;
2200 /* cov.[us]32f32 af, numerator */
2201 instr = instr_create(ctx, 1, 0);
2202 instr->cat1.src_type = src_type;
2203 instr->cat1.dst_type = get_ftype(ctx);
2204 vectorize(ctx, instr, &af_dst, 1, a, 0);
2206 /* cov.[us]32f32 bf, denominator */
2207 instr = instr_create(ctx, 1, 0);
2208 instr->cat1.src_type = src_type;
2209 instr->cat1.dst_type = get_ftype(ctx);
2210 vectorize(ctx, instr, &bf_dst, 1, b, 0);
2212 /* Get the absolute values for IDIV */
2213 if (type_sint(src_type)) {
2214 /* absneg.f af, (abs)af */
2215 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2216 vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_ABS);
2218 /* absneg.f bf, (abs)bf */
2219 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2220 vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_ABS);
2222 /* absneg.s a, (abs)numerator */
2223 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2224 vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_ABS);
2226 /* absneg.s b, (abs)denominator */
2227 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2228 vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_ABS);
2230 /* mov.u32u32 a, numerator */
2231 instr = instr_create(ctx, 1, 0);
2232 instr->cat1.src_type = src_type;
2233 instr->cat1.dst_type = src_type;
2234 vectorize(ctx, instr, &a_dst, 1, a, 0);
2236 /* mov.u32u32 b, denominator */
2237 instr = instr_create(ctx, 1, 0);
2238 instr->cat1.src_type = src_type;
2239 instr->cat1.dst_type = src_type;
2240 vectorize(ctx, instr, &b_dst, 1, b, 0);
/* rcp.f bf, bf — approximate 1/denominator */
2244 instr = instr_create(ctx, 4, OPC_RCP);
2245 vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);
2247 /* That's right, subtract 2 as an integer from the float */
2248 /* add.u bf, bf, -2 */
2249 instr = instr_create(ctx, 2, OPC_ADD_U);
2250 vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);
2252 /* mul.f q, af, bf */
2253 instr = instr_create(ctx, 2, OPC_MUL_F);
2254 vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);
2256 /* cov.f32[us]32 q, q */
2257 instr = instr_create(ctx, 1, 0);
2258 instr->cat1.src_type = get_ftype(ctx);
2259 instr->cat1.dst_type = src_type;
2260 vectorize(ctx, instr, &q_dst, 1, q_src, 0);
2262 /* integer multiply q by b */
2263 /* mull.u r, q, b */
2264 instr = instr_create(ctx, 2, OPC_MULL_U);
2265 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2267 /* madsh.m16 r, q, b, r */
2268 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2269 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2271 /* madsh.m16, r, b, q, r */
2272 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2273 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
/* sub.u r, a, r — current remainder estimate */
2276 instr = instr_create(ctx, 2, OPC_SUB_U);
2277 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2279 /* cov.u32f32, r, r */
2280 instr = instr_create(ctx, 1, 0);
2281 instr->cat1.src_type = get_utype(ctx);
2282 instr->cat1.dst_type = get_ftype(ctx);
2283 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
2285 /* mul.f r, r, bf */
2286 instr = instr_create(ctx, 2, OPC_MUL_F);
2287 vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
2289 /* cov.f32u32 r, r */
2290 instr = instr_create(ctx, 1, 0);
2291 instr->cat1.src_type = get_ftype(ctx);
2292 instr->cat1.dst_type = get_utype(ctx);
2293 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
/* add.u q, q, r — fold correction into the quotient */
2296 instr = instr_create(ctx, 2, OPC_ADD_U);
2297 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
/* recompute q*b and the remainder with the corrected quotient: */
2299 /* mull.u r, q, b */
2300 instr = instr_create(ctx, 2, OPC_MULL_U);
2301 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2303 /* madsh.m16 r, q, b, r */
2304 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2305 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2307 /* madsh.m16 r, b, q, r */
2308 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2309 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
/* sub.u r, a, r */
2312 instr = instr_create(ctx, 2, OPC_SUB_U);
2313 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2315 /* cmps.u.ge r, r, b */
2316 instr = instr_create(ctx, 2, OPC_CMPS_U);
2317 instr->cat2.condition = IR3_COND_GE;
2318 vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
2320 if (type_uint(src_type)) {
2321 /* add.u dst, q, r */
2322 instr = instr_create(ctx, 2, OPC_ADD_U);
2323 vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
/* signed: finish magnitude, then restore the sign: */
2326 instr = instr_create(ctx, 2, OPC_ADD_U);
2327 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
2329 /* negate result based on the original arguments */
2330 if (is_const(a) && is_const(b))
2331 a = get_unconst(ctx, a);
2333 /* xor.b r, numerator, denominator */
2334 instr = instr_create(ctx, 2, OPC_XOR_B);
2335 vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
2337 /* shr.b r, r, 31 */
2338 instr = instr_create(ctx, 2, OPC_SHR_B);
2339 vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
2341 /* absneg.s b, (neg)q */
2342 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2343 vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_NEGATE);
2345 /* sel.b dst, b, r, q */
2346 instr = instr_create(ctx, 3, OPC_SEL_B32);
2347 vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
2350 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
2351 /* The division result will have ended up in q. */
2353 if (is_rel_or_const(b))
2354 b = get_unconst(ctx, b)
2356 /* mull.u r, q, b */
2357 instr = instr_create(ctx, 2, OPC_MULL_U);
2358 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
2360 /* madsh.m16 r, q, b, r */
2361 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2362 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
2364 /* madsh.m16 r, b, q, r */
2365 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2366 vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
2368 /* sub.u dst, a, r */
2369 instr = instr_create(ctx, 2, OPC_SUB_U);
2370 vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
2373 put_dst(ctx, inst, dst);
2377 * Handlers for TGSI instructions which do have 1:1 mapping to native
/* Generic cat0 translater: a bare flow-control op with no operands. */
2382 instr_cat0(const struct instr_translater *t,
2383 struct ir3_compile_context *ctx,
2384 struct tgsi_full_instruction *inst)
2386 instr_create(ctx, 0, t->opc);
/* Generic cat1 translater: a plain mov from src0 to dst. */
2390 instr_cat1(const struct instr_translater *t,
2391 struct ir3_compile_context *ctx,
2392 struct tgsi_full_instruction *inst)
2394 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2395 struct tgsi_src_register *src = &inst->Src[0].Register;
2396 create_mov(ctx, dst, src);
2397 put_dst(ctx, inst, dst);
/*
 * Generic cat2 translater (1:1 ALU ops).  A few TGSI opcodes are
 * expressed via source modifier flags on the underlying op (ABS/IABS
 * via (abs), INEG via (neg), SUB as add with negated src1).
 */
2401 instr_cat2(const struct instr_translater *t,
2402 struct ir3_compile_context *ctx,
2403 struct tgsi_full_instruction *inst)
2405 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2406 struct tgsi_src_register *src0 = &inst->Src[0].Register;
2407 struct tgsi_src_register *src1 = &inst->Src[1].Register;
2408 struct ir3_instruction *instr;
2409 unsigned src0_flags = 0, src1_flags = 0;
2411 switch (t->tgsi_opc) {
2412 case TGSI_OPCODE_ABS:
2413 case TGSI_OPCODE_IABS:
2414 src0_flags = IR3_REG_ABS;
2416 case TGSI_OPCODE_INEG:
2417 src0_flags = IR3_REG_NEGATE;
2419 case TGSI_OPCODE_SUB:
2420 src1_flags = IR3_REG_NEGATE;
2439 /* these only have one src reg */
2440 instr = instr_create(ctx, 2, t->opc);
2441 vectorize(ctx, instr, dst, 1, src0, src0_flags);
/* two-src case; hw can't take const for both srcs: */
2444 if (is_const(src0) && is_const(src1))
2445 src0 = get_unconst(ctx, src0);
2447 instr = instr_create(ctx, 2, t->opc);
2448 vectorize(ctx, instr, dst, 2, src0, src0_flags,
2453 put_dst(ctx, inst, dst);
/* Translate a three-source TGSI opcode (e.g. MAD) into a native
 * cat3 instruction.
 *
 * NOTE(review): the src0<->src1 swap statements inside the is_mad()
 * branch appear to be missing from this copy — confirm against
 * upstream before editing.
 */
instr_cat3(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;

	/* in particular, can't handle const for src1 for cat3..
	 * for mad, we can swap first two src's if needed:
	 */
	if (is_rel_or_const(src1)) {
		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
			/* mad's first two srcs commute, so swap them to get
			 * the const/relative operand into the src0 slot:
			 */
			struct tgsi_src_register *tmp;
			/* otherwise copy src1 through a temporary GPR: */
			src1 = get_unconst(ctx, src1);

	instr = instr_create(ctx, 3, t->opc);
	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
			&inst->Src[2].Register, 0);
	put_dst(ctx, inst, dst);
/* Translate a TGSI opcode into a native cat4 (sfu: rcp/rsq/sqrt/
 * sin/cos/exp2/log2) instruction.  The op is emitted once per
 * enabled dst component, each replica reading the channel selected
 * by the src's X swizzle.
 *
 * NOTE(review): the declaration of loop variable 'i' and the
 * 'if (is_rel_or_const(src))' guard in front of get_unconst()
 * appear to be missing from this copy — confirm against upstream.
 */
instr_cat4(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr;

	/* seems like blob compiler avoids const as src.. */
		src = get_unconst(ctx, src);

	/* we need to replicate into each component: */
	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			instr = instr_create(ctx, 4, t->opc);
			add_dst_reg(ctx, instr, dst, i);
			/* scalar src: always read the channel picked by .x swizzle */
			add_src_reg(ctx, instr, src, src->SwizzleX);

	put_dst(ctx, inst, dst);
2512 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2513 #define INSTR(n, f, ...) \
2514 [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2516 INSTR(MOV, instr_cat1),
2517 INSTR(RCP, instr_cat4, .opc = OPC_RCP),
2518 INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
2519 INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
2520 INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
2521 INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
2522 INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
2523 INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
2524 INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
2525 INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
2526 INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
2527 INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
2528 INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
2529 INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
2530 INSTR(AND, instr_cat2, .opc = OPC_AND_B),
2531 INSTR(OR, instr_cat2, .opc = OPC_OR_B),
2532 INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
2533 INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
2534 INSTR(UMUL, trans_umul),
2535 INSTR(UMAD, trans_umul),
2536 INSTR(UDIV, trans_idiv),
2537 INSTR(IDIV, trans_idiv),
2538 INSTR(MOD, trans_idiv),
2539 INSTR(UMOD, trans_idiv),
2540 INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
2541 INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
2542 INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
2543 INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
2544 INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
2545 INSTR(AND, instr_cat2, .opc = OPC_AND_B),
2546 INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2547 INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
2548 INSTR(CLAMP, trans_clamp),
2549 INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
2550 INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
2551 INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
2552 INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
2553 INSTR(ARL, trans_arl),
2554 INSTR(UARL, trans_arl),
2555 INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
2556 INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
2557 INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
2558 INSTR(COS, instr_cat4, .opc = OPC_COS),
2559 INSTR(SIN, instr_cat4, .opc = OPC_SIN),
2560 INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
2561 INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
2562 INSTR(TXB, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB),
2563 INSTR(TXB2, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB2),
2564 INSTR(TXL, trans_samp, .opc = OPC_SAML, .arg = TGSI_OPCODE_TXL),
2565 INSTR(TXD, trans_samp, .opc = OPC_SAMGQ, .arg = TGSI_OPCODE_TXD),
2566 INSTR(TXF, trans_samp, .opc = OPC_ISAML, .arg = TGSI_OPCODE_TXF),
2567 INSTR(TXQ, trans_txq),
2568 INSTR(DDX, trans_deriv, .opc = OPC_DSX),
2569 INSTR(DDY, trans_deriv, .opc = OPC_DSY),
2570 INSTR(SGT, trans_cmp),
2571 INSTR(SLT, trans_cmp),
2572 INSTR(FSLT, trans_cmp),
2573 INSTR(SGE, trans_cmp),
2574 INSTR(FSGE, trans_cmp),
2575 INSTR(SLE, trans_cmp),
2576 INSTR(SNE, trans_cmp),
2577 INSTR(FSNE, trans_cmp),
2578 INSTR(SEQ, trans_cmp),
2579 INSTR(FSEQ, trans_cmp),
2580 INSTR(CMP, trans_cmp),
2581 INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
2582 INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
2583 INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
2584 INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
2585 INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
2586 INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
2587 INSTR(UCMP, trans_ucmp),
2588 INSTR(ISSG, trans_issg),
2589 INSTR(IF, trans_if, .opc = OPC_CMPS_F),
2590 INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
2591 INSTR(ELSE, trans_else),
2592 INSTR(ENDIF, trans_endif),
2593 INSTR(END, instr_cat0, .opc = OPC_END),
2594 INSTR(KILL, trans_kill, .opc = OPC_KILL),
2595 INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
2596 INSTR(I2F, trans_cov),
2597 INSTR(U2F, trans_cov),
2598 INSTR(F2I, trans_cov),
2599 INSTR(F2U, trans_cov),
/* Pack a TGSI declaration's semantic Name + Index into the single
 * ir3 semantic value stored in so->inputs[]/so->outputs[].
 * NOTE(review): the return-type line is missing from this copy.
 */
decl_semantic(const struct tgsi_declaration_semantic *sem)
	return ir3_semantic_name(sem->Name, sem->Index);
/* Declare one component of a normal (interpolated) fragment-shader
 * varying input: emits a bary.f that fetches varying slot 'inloc'
 * using the barycentric position in ctx->frag_pos (r0.x/r0.y).
 *
 * NOTE(review): the 'return instr;' tail (and possibly a wrmask
 * setup on src) is missing from this copy — confirm upstream.
 */
static struct ir3_instruction *
decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j, unsigned inloc)
	struct ir3_instruction *instr;
	struct ir3_register *src;

	/* bary.f dst, #inloc, r0.x */
	instr = instr_create(ctx, 2, OPC_BARY_F);
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
	/* interpolation coords come from the frag_pos meta instr: */
	src->instr = ctx->frag_pos;
2626 /* TGSI_SEMANTIC_POSITION
2627 * """"""""""""""""""""""
2629 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
2630 * fragment shader input contains the fragment's window position. The X
2631 * component starts at zero and always increases from left to right.
2632 * The Y component starts at zero and always increases but Y=0 may either
2633 * indicate the top of the window or the bottom depending on the fragment
2634 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
2635 * The Z coordinate ranges from 0 to 1 to represent depth from the front
* to the back of the Z buffer. The W component contains the reciprocal
2637 * of the interpolated vertex position W component.
/* Declare one channel (j = 0..3) of the fragment-shader
 * TGSI_SEMANTIC_POSITION (gl_FragCoord) input.  The hw delivers
 * x/y as fixed-point unsigned values that must be converted to
 * float; z/w can be used as-is.
 *
 * NOTE(review): the switch-over-j structure ('unsigned j)' param
 * line, case labels, intermediate 'src = instr;' assignments and
 * the final return) is missing from this copy — confirm upstream.
 */
static struct ir3_instruction *
decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
	struct ir3_instruction *instr, *src;

	/* each channel's raw input may only be declared once: */
	compile_assert(ctx, !ctx->frag_coord[j]);

	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);

	/* for frag_coord, we get unsigned values.. we need
	 * to subtract (integer) 8 and divide by 16 (right-
	 * shift by 4) then convert to float:
	 */

	/* add.s tmp, src, -8 */
	instr = instr_create(ctx, 2, OPC_ADD_S);
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;

	/* shr.b tmp, tmp, 4 */
	instr = instr_create(ctx, 2, OPC_SHR_B);
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;

	/* mov.u32f32 dst, tmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_U32;
	instr->cat1.dst_type = TYPE_F32;
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

	/* seems that we can use these as-is: */
	instr = ctx->frag_coord[j];

	compile_error(ctx, "invalid channel\n");
	instr = create_immed(ctx, 0.0);
2694 /* TGSI_SEMANTIC_FACE
2695 * """"""""""""""""""
2697 * This label applies to fragment shader inputs only and indicates that
2698 * the register contains front/back-face information of the form (F, 0,
2699 * 0, 1). The first component will be positive when the fragment belongs
2700 * to a front-facing polygon, and negative when the fragment belongs to a
2701 * back-facing polygon.
/* Declare one channel of the fragment-shader TGSI_SEMANTIC_FACE
 * input.  The hw gives -1/0 (int) for back/front; TGSI wants a
 * float of the form (F, 0, 0, 1) with F's sign carrying faceness,
 * so channel 0 is computed as float(face * 2 + 1) and channels
 * 1..3 are constants.
 *
 * NOTE(review): the switch-over-channel structure (case labels,
 * 'src = instr;' assignments, return) is missing from this copy —
 * confirm against upstream.
 */
static struct ir3_instruction *
decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
	struct ir3_instruction *instr, *src;

	/* the raw face input may only be declared once: */
	compile_assert(ctx, !ctx->frag_face);

	ctx->frag_face = create_input(ctx->block, NULL, 0);

	/* for faceness, we always get -1 or 0 (int).. but TGSI expects
	 * positive vs negative float.. and piglit further seems to
	 * expect -1.0 or 1.0:
	 *
	 * mul.s tmp, hr0.x, 2
	 * mov.s16f32, dst, tmp
	 */

	instr = instr_create(ctx, 2, OPC_MUL_S);
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	instr = instr_create(ctx, 2, OPC_ADD_S);
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;

	/* int -> float conversion of the 2*face+1 result: */
	instr = instr_create(ctx, 1, 0); /* mov */
	instr->cat1.src_type = TYPE_S32;
	instr->cat1.dst_type = TYPE_F32;
	ir3_reg_create(instr, regid, 0); /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

	instr = create_immed(ctx, 0.0);

	instr = create_immed(ctx, 1.0);

	compile_error(ctx, "invalid channel\n");
	instr = create_immed(ctx, 0.0);
/* Handle a TGSI input declaration: record per-input metadata in the
 * shader variant (semantic, regid, inloc, interpolation) and create
 * the SSA-level input instructions for each component — bary.f
 * varyings for normal FS inputs, special paths for POSITION/FACE,
 * plain inputs for VS.
 *
 * NOTE(review): declarations of i/j/ncomp (and the else-arm braces)
 * are missing from this copy — confirm against upstream.
 */
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;

	/* I don't think we should get frag shader input without
	 * semantic info? Otherwise how do inputs get linked to
	 * outputs of the previous stage?  TODO confirm original wording.
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;
		unsigned r = regid(i, 0);

		/* we'll figure out the actual components used after scheduling */

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;
		so->inputs[n].interpolate = decl->Interp.Interpolate;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
					/* ordinary varying: interpolated via bary.f */
					so->inputs[n].bary = true;
					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8);
				/* vertex shader inputs are plain hw inputs: */
				instr = create_input(ctx->block, NULL, (i * 4) + j);

			ctx->block->inputs[(i * 4) + j] = instr;

		/* only bary'd FS inputs and VS inputs consume inloc space
		 * and count toward total_in:
		 */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
/* Handle a TGSI output declaration: validate the semantic for the
 * current shader stage, record metadata in so->outputs[], and seed
 * each output component with a dummy imm{0.0} mov so no output is
 * left undefined.
 *
 * NOTE(review): the switch openers ('switch (name) {'), breaks,
 * the VS/FS else-arm, and the i/j/comp/ncomp declarations are
 * missing from this copy — confirm against upstream.
 */
decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;

	compile_assert(ctx, decl->Declaration.Semantic);

	DBG("decl out[%d] -> r%d", name, decl->Range.First);

	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
		/* these VS semantics need no extra bookkeeping: */
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[name]);
		/* fragment-shader outputs: */
		case TGSI_SEMANTIC_POSITION:
			comp = 2; /* tgsi will write to .z component */
			so->writes_pos = true;
		case TGSI_SEMANTIC_COLOR:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[name]);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->outputs_count++;

		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
		so->outputs[n].regid = regid(i, comp);

		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
		 * which if the output is actually assigned will be over-
		 * written:
		 */
		for (j = 0; j < ncomp; j++)
			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
2890 /* from TGSI perspective, we actually have inputs. But most of the "inputs"
2891 * for a fragment shader are just bary.f instructions. The *actual* inputs
* from the hw perspective are the frag_pos and optionally frag_coord and
* frag_face.
/* Rebuild block->inputs for a fragment shader to reflect the *hw*
 * inputs (frag_face, frag_coord x4, and the frag_pos x/y pair)
 * rather than the TGSI-level varyings, and assign their register
 * numbers.
 *
 * NOTE(review): the declarations of n/regid and the initial
 * 'block->ninputs = 0;' style setup are missing from this copy —
 * confirm against upstream.
 */
fixup_frag_inputs(struct ir3_compile_context *ctx)
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;

	/* size of the new inputs array, in vec4 groups: */
	n = 4; /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it. But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x (or whichever regid we landed on): */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y: */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
/* Main TGSI token walk: set up the frag_pos meta instruction for
 * fragment shaders, then iterate every TGSI token, dispatching
 * declarations, immediates, and instructions (via the translaters
 * table), and applying per-instruction saturate clamps.
 *
 * NOTE(review): case/brace structure, break statements, and the
 * 'if (t->fxn)' guard around the dispatch appear to be missing
 * from this copy — confirm against upstream.
 */
compile_instructions(struct ir3_compile_context *ctx)
	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
		ctx->frag_pos = instr;

	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
				decl_out(ctx, decl);
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			/* TODO: if we know the immediate is small enough, and only
			 * used with instructions that can embed an immediate, we
			 * could avoid the immediate pool entirely.
			 */
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->immediates_count++;
			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
			/* 16 bytes = one vec4 immediate: */
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
		case TGSI_TOKEN_TYPE_INSTRUCTION: {
			struct tgsi_full_instruction *inst =
					&ctx->parser.FullToken.FullInstruction;
			unsigned opc = inst->Instruction.Opcode;
			const struct instr_translater *t = &translaters[opc];
			t->fxn(t, ctx, inst);
			/* per-instruction temporaries are recycled: */
			ctx->num_internal_temps = 0;
			compile_assert(ctx, !ctx->using_tmp_dst);
			compile_error(ctx, "unknown TGSI opc: %s\n",
					tgsi_get_opcode_name(opc));

			/* apply TGSI saturate modes as explicit clamps on dst: */
			switch (inst->Instruction.Saturate) {
			case TGSI_SAT_ZERO_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(0.0), fui(1.0));
			case TGSI_SAT_MINUS_PLUS_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(-1.0), fui(1.0));
/* Debug helper: dump the current IR to a sequentially-numbered
 * .dot file ("vert-NNNN.dot" / "frag-NNNN.dot") in the cwd.
 *
 * NOTE(review): the fname/f declarations, the fopen NULL-check and
 * the fclose(f) appear to be missing from this copy — confirm
 * against upstream (missing fclose would leak a FILE* per dump).
 */
compile_dump(struct ir3_compile_context *ctx)
	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
	/* monotonically increasing dump counter across calls: */
	static unsigned n = 0;
	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
	f = fopen(fname, "w");
	ir3_block_depth(ctx->block);
	ir3_dump(ctx->ir, name, ctx->block, f);
3059 ir3_compile_shader(struct ir3_shader_variant *so,
3060 const struct tgsi_token *tokens, struct ir3_shader_key key,
3063 struct ir3_compile_context ctx;
3064 struct ir3_block *block;
3065 struct ir3_instruction **inputs;
3066 unsigned i, j, actual_in;
3067 int ret = 0, max_bary;
3071 so->ir = ir3_create();
3075 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
3076 DBG("INIT failed!");
3081 compile_instructions(&ctx);
3084 so->ir->block = block;
3086 /* keep track of the inputs from TGSI perspective.. */
3087 inputs = block->inputs;
3089 /* but fixup actual inputs for frag shader: */
3090 if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
3091 fixup_frag_inputs(&ctx);
3093 /* at this point, for binning pass, throw away unneeded outputs: */
3094 if (key.binning_pass) {
3095 for (i = 0, j = 0; i < so->outputs_count; i++) {
3096 unsigned name = sem2name(so->outputs[i].semantic);
3097 unsigned idx = sem2name(so->outputs[i].semantic);
3099 /* throw away everything but first position/psize */
3100 if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
3101 (name == TGSI_SEMANTIC_PSIZE))) {
3103 so->outputs[j] = so->outputs[i];
3104 block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
3105 block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
3106 block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
3107 block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
3112 so->outputs_count = j;
3113 block->noutputs = j * 4;
3116 /* for rendering to alpha format, we only need the .w component,
3117 * and we need it to be in the .x position:
3120 for (i = 0, j = 0; i < so->outputs_count; i++) {
3121 unsigned name = sem2name(so->outputs[i].semantic);
3123 /* move .w component to .x and discard others: */
3124 if (name == TGSI_SEMANTIC_COLOR) {
3125 block->outputs[(i*4)+0] = block->outputs[(i*4)+3];
3126 block->outputs[(i*4)+1] = NULL;
3127 block->outputs[(i*4)+2] = NULL;
3128 block->outputs[(i*4)+3] = NULL;
3133 /* at this point, we want the kill's in the outputs array too,
3134 * so that they get scheduled (since they have no dst).. we've
3135 * already ensured that the array is big enough in push_block():
3137 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
3138 for (i = 0; i < ctx.kill_count; i++)
3139 block->outputs[block->noutputs++] = ctx.kill[i];
3142 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3145 ret = ir3_block_flatten(block);
3147 DBG("FLATTEN failed!");
3150 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
3153 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3154 printf("BEFORE CP:\n");
3155 ir3_dump_instr_list(block->head);
3158 if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
3159 ir3_block_cp(block);
3161 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3164 ir3_block_depth(block);
3166 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3167 printf("AFTER DEPTH:\n");
3168 ir3_dump_instr_list(block->head);
3171 ret = ir3_block_sched(block);
3173 DBG("SCHED failed!");
3177 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3178 printf("AFTER SCHED:\n");
3179 ir3_dump_instr_list(block->head);
3182 ret = ir3_block_ra(block, so->type, key.half_precision,
3183 so->frag_coord, so->frag_face, &so->has_samp, &max_bary);
3189 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3190 printf("AFTER RA:\n");
3191 ir3_dump_instr_list(block->head);
3194 /* fixup input/outputs: */
3195 for (i = 0; i < so->outputs_count; i++) {
3196 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
3197 /* preserve hack for depth output.. tgsi writes depth to .z,
3198 * but what we give the hw is the scalar register:
3200 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
3201 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
3202 so->outputs[i].regid += 2;
3204 /* Note that some or all channels of an input may be unused: */
3206 for (i = 0; i < so->inputs_count; i++) {
3207 unsigned j, regid = ~0, compmask = 0;
3208 so->inputs[i].ncomp = 0;
3209 for (j = 0; j < 4; j++) {
3210 struct ir3_instruction *in = inputs[(i*4) + j];
3212 compmask |= (1 << j);
3213 regid = in->regs[0]->num - j;
3215 so->inputs[i].ncomp++;
3218 so->inputs[i].regid = regid;
3219 so->inputs[i].compmask = compmask;
3222 /* fragment shader always gets full vec4's even if it doesn't
3223 * fetch all components, but vertex shader we need to update
* with the actual number of components fetched, otherwise things
* will hang due to mismatch between VFD_DECODE's and
3228 if (so->type == SHADER_VERTEX)
3229 so->total_in = actual_in;
3231 so->total_in = align(max_bary + 1, 4);
3235 ir3_destroy(so->ir);