Ejemplo n.º 1
0
/**
 * Common-subexpression elimination over the instruction list of \p c.
 *
 * Each instruction that has no side effects, no side-effect reads and is not
 * QOP_TLB_COLOR_READ is considered.  Instructions with the sf flag set only
 * bump a running counter; every other candidate is handed to vc4_find_cse()
 * together with the running sf/r4-write counters (presumably so that matches
 * are not made across such instructions -- confirm in vc4_find_cse()).  When
 * vc4_find_cse() returns an instruction, the current one is rewritten into a
 * QOP_MOV of that instruction's destination, with any remaining sources
 * replaced by c->undef.
 *
 * Returns true if at least one instruction was rewritten, false otherwise
 * (including when the hash table could not be created).
 */
bool
qir_opt_cse(struct vc4_compile *c)
{
        struct hash_table *seen = _mesa_hash_table_create(NULL, NULL,
                                                          inst_key_equals);
        if (seen == NULL)
                return false;

        bool changed = false;
        uint32_t num_sf = 0;
        uint32_t num_r4 = 0;

        list_for_each_entry(struct qinst, cur, &c->instructions, link) {
                /* Instructions matching any of these are left alone and do
                 * not participate in the counters below.
                 */
                const bool skip = qir_has_side_effects(c, cur) ||
                                  qir_has_side_effect_reads(c, cur) ||
                                  cur->op == QOP_TLB_COLOR_READ;
                if (skip)
                        continue;

                struct qinst *prev = NULL;
                if (cur->sf)
                        num_sf++;
                else
                        prev = vc4_find_cse(c, seen, cur, num_sf, num_r4);

                if (prev != NULL) {
                        /* Reuse the earlier result: MOV from its dst and
                         * drop every other source.
                         */
                        cur->src[0] = prev->dst;

                        int s = 1;
                        while (s < qir_get_op_nsrc(cur->op)) {
                                cur->src[s] = c->undef;
                                s++;
                        }

                        cur->op = QOP_MOV;
                        changed = true;

                        if (debug) {
                                fprintf(stderr, "  Turned into:   ");
                                qir_dump_inst(c, cur);
                                fprintf(stderr, "\n");
                        }
                }

                if (qir_writes_r4(cur))
                        num_r4++;
        }

        /* The table was created through the ralloc-based hash table API. */
        ralloc_free(seen);

        return changed;
}
Ejemplo n.º 2
0
/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The returned array has c->num_temps entries and is allocated with
 * calloc(); the caller owns it and must release it with free().
 *
 * On failure (out of memory, or the register allocator could not find an
 * assignment) c->failed is set to true and NULL is returned; nothing is left
 * for the caller to free in that case.
 */
struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        /* calloc() may legitimately return NULL for a zero-element request,
         * so only treat NULL as an allocation failure when there is
         * something to allocate.
         */
        if (!temp_registers && c->num_temps > 0) {
                c->failed = true;
                return NULL;
        }

        /* If things aren't ever written (undefined values), just read from
         * r0.
         */
        for (uint32_t i = 0; i < c->num_temps; i++)
                temp_registers[i] = qpu_rn(0);

        vc4_alloc_reg_set(vc4);

        struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                         c->num_temps);

        /* Compute the live ranges so we can figure out interference. */
        qir_calculate_live_intervals(c);

        /* Sort the temps with node_to_temp_priority (keyed on live-range
         * length) and record, for each temp, the graph node index it ended
         * up at.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers.  We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits,
               CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
               sizeof(class_bits));

        int ip = 0;
        qir_for_each_inst_inorder(inst, c) {
                if (qir_writes_r4(inst)) {
                        /* This instruction writes r4 (and optionally moves
                         * its result to a temp), so nothing else can be
                         * stored in r4 across it.
                         */
                        for (uint32_t i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~CLASS_BIT_R4;
                        }
                } else {
                        /* R4 can't be written as a general purpose
                         * register. (it's TMU_NOSWAP as a write address).
                         */
                        if (inst->dst.file == QFILE_TEMP)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                }

                switch (inst->op) {
                case QOP_FRAG_Z:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
                        break;

                case QOP_FRAG_W:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                        break;

                case QOP_ROT_MUL:
                        assert(inst->src[0].file == QFILE_TEMP);
                        /* NOTE(review): this *clears* CLASS_BIT_R0_R3, but
                         * that bit is never part of the initial mask set by
                         * the memset above, and the class switch below has a
                         * dedicated CLASS_BIT_R0_R3 case.  Confirm whether
                         * restricting the source *to* r0-r3 was intended
                         * (i.e. "&= CLASS_BIT_R0_R3" with the bit included
                         * in the initial mask).  Left unchanged here.
                         */
                        class_bits[inst->src[0].index] &= ~CLASS_BIT_R0_R3;
                        break;

                default:
                        break;
                }

                if (inst->dst.pack && !qir_is_mul(inst)) {
                        /* The non-MUL pack flags require an A-file dst
                         * register.
                         */
                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                }

                /* Apply restrictions for src unpacks.  The integer unpacks
                 * can only be done from regfile A, while float unpacks can be
                 * either A or R4.
                 */
                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                        if (inst->src[i].file == QFILE_TEMP &&
                            inst->src[i].pack) {
                                if (qir_is_float_input(inst)) {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A | CLASS_BIT_R4;
                                } else {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A;
                                }
                        }
                }

                ip++;
        }

        /* Translate each temp's surviving class bits into a register class
         * on its graph node.  Any combination not listed is treated as a
         * fatal internal error.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                switch (class_bits[i]) {
                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
                        ra_set_node_class(g, node, vc4->reg_class_any);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
                        ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
                        break;
                case CLASS_BIT_A | CLASS_BIT_R4:
                        ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
                        break;
                case CLASS_BIT_A:
                        ra_set_node_class(g, node, vc4->reg_class_a);
                        break;
                case CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node, vc4->reg_class_r0_r3);
                        break;
                default:
                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
                                i, class_bits[i]);
                        abort();
                        break;
                }
        }

        /* Two temps interfere when their [temp_start, temp_end) ranges
         * overlap, i.e. neither one starts at or after the other's end.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                fprintf(stderr, "Failed to register allocate:\n");
                qir_dump(c);
                c->failed = true;

                /* Release everything this function allocated: the caller
                 * only gets NULL back and cannot free these itself.
                 */
                free(temp_registers);
                ralloc_free(g);
                return NULL;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i])
                        temp_registers[i] = qpu_ra(QPU_W_NOP);
        }

        ralloc_free(g);

        return temp_registers;
}