From 2e35981d4d625d951328ef5b8f95798112997fb3 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 28 Jun 2014 17:26:15 +0100 Subject: [PATCH] vc4: Add support for SNE/SEQ/SGE/SLT. --- src/gallium/drivers/vc4/vc4_program.c | 4 +++ src/gallium/drivers/vc4/vc4_qir.c | 6 ++++ src/gallium/drivers/vc4/vc4_qir.h | 6 ++++ src/gallium/drivers/vc4/vc4_qpu_defines.h | 2 ++ src/gallium/drivers/vc4/vc4_qpu_disasm.c | 34 ++++++++++++++++--- src/gallium/drivers/vc4/vc4_qpu_emit.c | 55 +++++++++++++++++++++++++++---- 6 files changed, 96 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 4755ea0db90..4742c54772a 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -254,6 +254,10 @@ emit_tgsi_instruction(struct tgsi_to_qir *trans, [TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu }, [TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu }, [TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu }, + [TGSI_OPCODE_SEQ] = { QOP_SEQ, tgsi_to_qir_alu }, + [TGSI_OPCODE_SNE] = { QOP_SNE, tgsi_to_qir_alu }, + [TGSI_OPCODE_SGE] = { QOP_SGE, tgsi_to_qir_alu }, + [TGSI_OPCODE_SLT] = { QOP_SLT, tgsi_to_qir_alu }, [TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad }, [TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 }, [TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 }, diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index a4bb6cd1fd1..4ee1f018fc5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -43,6 +43,12 @@ static const struct qir_op_info qir_op_info[] = { [QOP_FMAX] = { "fmax", 1, 2 }, [QOP_FMINABS] = { "fminabs", 1, 2 }, [QOP_FMAXABS] = { "fmaxabs", 1, 2 }, + + [QOP_SEQ] = { "seq", 1, 2 }, + [QOP_SNE] = { "sne", 1, 2 }, + [QOP_SGE] = { "sge", 1, 2 }, + [QOP_SLT] = { "slt", 1, 2 }, + + [QOP_FTOI] = { "ftoi", 1, 1 }, [QOP_RCP] = { "rcp", 1, 1 }, [QOP_RSQ] = { "rsq", 1, 1 }, diff --git a/src/gallium/drivers/vc4/vc4_qir.h 
b/src/gallium/drivers/vc4/vc4_qir.h index ae9e1796b90..4263adcddd1 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -50,6 +50,12 @@ enum qop { QOP_FMAX, QOP_FMINABS, QOP_FMAXABS, + + QOP_SEQ, + QOP_SNE, + QOP_SGE, + QOP_SLT, + + QOP_FTOI, QOP_RCP, QOP_RSQ, diff --git a/src/gallium/drivers/vc4/vc4_qpu_defines.h b/src/gallium/drivers/vc4/vc4_qpu_defines.h index 13c940c0f8e..bdd5d94708f 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_defines.h +++ b/src/gallium/drivers/vc4/vc4_qpu_defines.h @@ -223,6 +223,8 @@ enum qpu_pack_a { #define QPU_COND_MUL_SHIFT 46 #define QPU_COND_MUL_MASK QPU_MASK(48, 46) +#define QPU_SF ((uint64_t)1 << 45) + #define QPU_WADDR_ADD_SHIFT 38 #define QPU_WADDR_ADD_MASK QPU_MASK(43, 38) #define QPU_WADDR_MUL_SHIFT 32 diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 0aea2970f68..4ec6d9657b7 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -199,6 +199,17 @@ static const char *qpu_pack_a[] = { [QPU_PACK_A_8D_SAT] = ".8d.sat", }; +static const char *qpu_condflags[] = { + [QPU_COND_NEVER] = ".never", + [QPU_COND_ALWAYS] = "", + [QPU_COND_ZS] = ".zs", + [QPU_COND_ZC] = ".zc", + [QPU_COND_NS] = ".ns", + [QPU_COND_NC] = ".nc", + [QPU_COND_CS] = ".cs", + [QPU_COND_CC] = ".cc", +}; + #define DESC(array, index) \ ((index > ARRAY_SIZE(array) || !(array)[index]) ? \ "???" : (array)[index]) @@ -282,11 +293,15 @@ static void print_add_op(uint64_t inst) { uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD); + uint32_t cond = QPU_GET_FIELD(inst, QPU_COND_ADD); bool is_mov = (op_add == QPU_A_OR && QPU_GET_FIELD(inst, QPU_ADD_A) == QPU_GET_FIELD(inst, QPU_ADD_B)); - fprintf(stderr, "%s ", is_mov ? "mov" : DESC(qpu_add_opcodes, op_add)); + fprintf(stderr, "%s%s%s ", + is_mov ? "mov" : DESC(qpu_add_opcodes, op_add), + ((inst & QPU_SF) && op_add != QPU_A_NOP) ? ".sf" : "", + op_add != QPU_A_NOP ? 
DESC(qpu_condflags, cond) : ""); print_alu_dst(inst, false); fprintf(stderr, ", "); @@ -303,12 +318,17 @@ print_add_op(uint64_t inst) static void print_mul_op(uint64_t inst) { + uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD); uint32_t op_mul = QPU_GET_FIELD(inst, QPU_OP_MUL); + uint32_t cond = QPU_GET_FIELD(inst, QPU_COND_MUL); bool is_mov = (op_mul == QPU_M_V8MIN && QPU_GET_FIELD(inst, QPU_MUL_A) == QPU_GET_FIELD(inst, QPU_MUL_B)); - fprintf(stderr, "%s ", is_mov ? "mov" : DESC(qpu_mul_opcodes, op_mul)); + fprintf(stderr, "%s%s%s ", + is_mov ? "mov" : DESC(qpu_mul_opcodes, op_mul), + ((inst & QPU_SF) && op_add == QPU_A_NOP) ? ".sf" : "", + op_mul != QPU_M_NOP ? DESC(qpu_condflags, cond) : ""); print_alu_dst(inst, true); fprintf(stderr, ", "); @@ -325,12 +345,18 @@ static void print_load_imm(uint64_t inst) { uint32_t imm = inst; + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD); + uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL); fprintf(stderr, "load_imm "); print_alu_dst(inst, false); - fprintf(stderr, ", "); + fprintf(stderr, "%s, ", (waddr_add != QPU_W_NOP ? + DESC(qpu_condflags, cond_add) : "")); print_alu_dst(inst, true); - fprintf(stderr, ", "); + fprintf(stderr, "%s, ", (waddr_mul != QPU_W_NOP ? + DESC(qpu_condflags, cond_mul) : "")); fprintf(stderr, "0x%08x (%f)", imm, uif(imm)); } diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 330876734d1..d0f7f894182 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -40,6 +40,28 @@ vc4_dump_program(struct qcompile *c) } } +/** + * Resolves a register-file read conflict between two source operands: + * register allocation may place two different operands of an instruction in + * the same physical register file even though instructions have only one + * field for the register file source address. 
+ * + * If src0 and *src1 both use mux A or B with different addresses, *src1 is + * MOVed into qpu_r5() at insts[*ni] (advancing *ni) and becomes qpu_r5(). + */ +static void +fixup_raddr_conflict(uint64_t *insts, uint32_t *ni, + struct qpu_reg src0, struct qpu_reg *src1) +{ + if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) && + (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) && + src0.addr != src1->addr) { + insts[(*ni)++] = qpu_inst(qpu_a_MOV(qpu_r5(), *src1), + qpu_m_NOP()); + *src1 = qpu_r5(); + } +} + void vc4_generate_code(struct qcompile *c) { @@ -110,6 +132,13 @@ vc4_generate_code(struct qcompile *c) M(FMUL), }; + static const uint32_t compareflags[] = { + [QOP_SEQ - QOP_SEQ] = QPU_COND_ZS, + [QOP_SNE - QOP_SEQ] = QPU_COND_ZC, + [QOP_SLT - QOP_SEQ] = QPU_COND_NS, + [QOP_SGE - QOP_SEQ] = QPU_COND_NC, + }; + struct qpu_reg src[4]; for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { int index = qinst->src[i].index; @@ -184,6 +213,24 @@ vc4_generate_code(struct qcompile *c) } break; + case QOP_SEQ: + case QOP_SNE: + case QOP_SGE: + case QOP_SLT: + fixup_raddr_conflict(insts, &ni, src[0], &src[1]); + insts[ni++] = qpu_inst(qpu_a_SUB(qpu_ra(QPU_W_NOP), + src[0], src[1]), + qpu_m_NOP()); + insts[ni - 1] |= QPU_SF; + + insts[ni++] = qpu_load_imm_f(dst, 0.0); + insts[ni++] = qpu_load_imm_f(dst, 1.0); + insts[ni - 1] = ((insts[ni - 1] & ~QPU_COND_ADD_MASK) + | QPU_SET_FIELD(compareflags[qinst->op - QOP_SEQ], + QPU_COND_ADD)); + + break; + case QOP_VPM_WRITE: insts[ni++] = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]), @@ -274,13 +321,7 @@ vc4_generate_code(struct qcompile *c) if (qir_get_op_nsrc(qinst->op) == 1) src[1] = src[0]; - if ((src[0].mux == QPU_MUX_A || src[0].mux == QPU_MUX_B) && - (src[1].mux == QPU_MUX_A || src[1].mux == QPU_MUX_B) && - src[0].addr != src[1].addr) { - insts[ni++] = qpu_inst(qpu_a_MOV(qpu_r5(), src[1]), - qpu_m_NOP()); - src[1] = qpu_r5(); - } + fixup_raddr_conflict(insts, &ni, src[0], &src[1]); if (translate[qinst->op].is_mul) { insts[ni++] = qpu_inst(qpu_a_NOP(), -- 2.11.0