Beispiel #1
0
/*
 * Look up the index within so->outputs[] of the output whose semantic
 * equals `semantic`.
 *
 * Two passes are made over the outputs.  The first pass searches for the
 * semantic exactly as requested.  If that fails and the semantic is a
 * COLOR[n] / BCOLOR[n], the second pass searches for its front/back
 * counterpart: the vertex shader is allowed to write only one of the two,
 * while the fragment shader always declares both IN.COLOR[n] and
 * IN.BCOLOR[n], so at link time the missing one is mapped onto the one
 * that exists.  For any other semantic the second pass simply repeats the
 * first search.
 *
 * If nothing matches, debug_assert(0) is hit and 0 is returned.
 */
static int
find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
{
	int attempt, n;

	for (attempt = 0; attempt < 2; attempt++) {
		for (n = 0; n < so->outputs_count; n++) {
			if (so->outputs[n].semantic == semantic)
				return n;
		}

		/* nothing left to try after the second pass */
		if (attempt != 0)
			break;

		/* swap front <-> back color (same index) for the retry: */
		if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) {
			unsigned color_idx = sem2idx(semantic);
			semantic = ir3_semantic_name(TGSI_SEMANTIC_COLOR, color_idx);
		} else if (sem2name(semantic) == TGSI_SEMANTIC_COLOR) {
			unsigned color_idx = sem2idx(semantic);
			semantic = ir3_semantic_name(TGSI_SEMANTIC_BCOLOR, color_idx);
		}
	}

	/* no output carries the requested (or the counterpart) semantic */
	debug_assert(0);

	return 0;
}
Beispiel #2
0
/*
 * Emit the a4xx shader-program register state for `emit` into `ring`.
 *
 * Per-stage data is collected into s[] by setup_stages().  The function
 * then writes, in this order:
 *   - HLSQ update/control registers and the per-stage HLSQ control words
 *   - SP control and instruction-cache control
 *   - VS length/control/param registers
 *   - the VS->FS varying linkage (SP_VS_OUT_REG, SP_VS_VPC_DST_REG)
 *   - VS object offset + shader bo reloc
 *   - FS length/control/object-offset registers (bo reloc skipped in the
 *     binning pass)
 *   - HS/DS/GS object-offset registers
 *   - RB/SP fragment-output and MRT registers
 *   - VPC attribute, varying-interp and point-sprite-replace registers
 * and finally calls emit_shader() for the VS (if it has instructions) and,
 * outside the binning pass, for the FS.
 *
 * ring: ringbuffer the packets are appended to
 * emit: emit state; this function reads emit->key (binning_pass,
 *       half_precision), emit->pformat and emit->format
 */
void
fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
{
	struct stage s[MAX_STAGES];
	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
	uint32_t face_regid, coord_regid, zwcoord_regid;
	int constmode;
	int i, j, k;

	setup_stages(emit, s);

	/* blob seems to always use constmode currently: */
	constmode = 1;

	/* Register ids of the outputs the fixed-function hardware consumes:
	 * position and point-size come from the VS, depth (POSITION) and
	 * COLOR[0] from the FS.
	 */
	pos_regid = ir3_find_output_regid(s[VS].v,
		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
	posz_regid = ir3_find_output_regid(s[FS].v,
		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
	psize_regid = ir3_find_output_regid(s[VS].v,
		ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
	color_regid = ir3_find_output_regid(s[FS].v,
		ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));

	/* Alpha-only render format: advance the color regid by 3.
	 * NOTE(review): presumably this selects the .w component of the
	 * color output -- confirm against the regid() encoding.
	 */
	if (util_format_is_alpha(emit->pformat))
		color_regid += 3;

	/* TODO get these dynamically: */
	/* regid(63,0) is used below whenever the FS does not use the
	 * corresponding input (presumably an "unused" marker -- confirm).
	 */
	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1);
	OUT_RING(ring, 0x00000003);

	/* HLSQ_CONTROL_0..4 (5 consecutive dwords): */
	OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
	OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
			A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
			A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
			 * flush some caches? I think we only need to set those
			 * bits if we have updated const or shader..
			 */
			A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
			A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
	OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
			A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
			A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) |
			A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid));
	OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) |
			0x3f3f000 |           /* XXX */
			A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid));
	OUT_RING(ring, A4XX_HLSQ_CONTROL_3_REG_REGID(s[FS].v->pos_regid) |
			0xfcfcfc00);
	OUT_RING(ring, 0x00fcfcfc);   /* XXX HLSQ_CONTROL_4 */

	/* One HLSQ control word per stage, in the order VS, FS, HS, DS, GS;
	 * each carries that stage's const/instr length and offsets from s[].
	 */
	OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5);
	OUT_RING(ring, A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) |
			A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) |
			A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff));
	OUT_RING(ring, A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) |
			A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) |
			A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff));
	OUT_RING(ring, A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) |
			A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) |
			A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff));
	OUT_RING(ring, A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) |
			A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) |
			A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff));
	OUT_RING(ring, A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) |
			A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) |
			A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1);
	OUT_RING(ring, 0x140010 | /* XXX */
			COND(emit->key.binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS));

	/* VS_BUFFER / FS_BUFFER are set when the respective stage has
	 * instructions; INSTR_BUFFER only when both do.
	 */
	OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1);
	OUT_RING(ring, 0x7f | /* XXX */
			COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) |
			COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) |
			COND(s[VS].instrlen && s[FS].instrlen,
					A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER));

	OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1);
	OUT_RING(ring, s[VS].v->instrlen);      /* SP_VS_LENGTH_REG */

	/* SP_VS_CTRL_REG0, SP_VS_CTRL_REG1, SP_VS_PARAM_REG: */
	OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3);
	OUT_RING(ring, A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
			A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
			A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
			A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
			A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
			COND(s[VS].v->has_samp, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE));
	OUT_RING(ring, A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) |
			A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
	OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
			A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
			A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(s[FS].v->total_in, 4) / 4));

	/* Link VS outputs to FS varying inputs.  Each SP_VS_OUT_REG packs two
	 * varyings (slots A and B): j walks the FS inputs via
	 * ir3_next_varying(), k is the index of the matching VS output.  At
	 * most 16 registers are written; the loop stops once j has run past
	 * the last FS input.
	 * NOTE(review): ir3_next_varying() presumably returns a value >=
	 * inputs_count when no varying remains -- the bounds checks below
	 * rely on that.
	 */
	for (i = 0, j = -1; (i < 16) && (j < (int)s[FS].v->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1);

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count) {
			k = ir3_find_output(s[VS].v, s[FS].v->inputs[j].semantic);
			reg |= A4XX_SP_VS_OUT_REG_A_REGID(s[VS].v->outputs[k].regid);
			reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(s[FS].v->inputs[j].compmask);
		}

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count) {
			k = ir3_find_output(s[VS].v, s[FS].v->inputs[j].semantic);
			reg |= A4XX_SP_VS_OUT_REG_B_REGID(s[VS].v->outputs[k].regid);
			reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(s[FS].v->inputs[j].compmask);
		}

		OUT_RING(ring, reg);
	}

	/* Second walk over the same FS varyings: each SP_VS_VPC_DST_REG packs
	 * the inloc of up to four varyings (OUTLOC0..OUTLOC3); at most 8
	 * registers are written.
	 */
	for (i = 0, j = -1; (i < 8) && (j < (int)s[FS].v->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(s[FS].v->inputs[j].inloc);

		OUT_RING(ring, reg);
	}

	OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff));
	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

	OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
	OUT_RING(ring, s[FS].v->instrlen);  /* SP_FS_LENGTH_REG */

	/* SP_FS_CTRL_REG0, SP_FS_CTRL_REG1: */
	OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
	OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
			COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
			A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
			A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
			A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
			A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
			A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
			COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
	OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
			0x80000000 |      /* XXX */
			COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) |
			COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) |
			COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD));

	OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
	/* In the binning pass the FS start address is written as zero instead
	 * of a reloc to the FS bo.
	 */
	if (emit->key.binning_pass)
		OUT_RING(ring, 0x00000000);
	else
		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */

	/* HS/DS/GS: only the object-offset register is written for these
	 * stages in this function.
	 */
	OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff));

	/* Note: with frag_coord set, X/Y/W coord bits are enabled but ZCOORD
	 * is deliberately left out (see the TODO inside the expression).
	 */
	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1);
	OUT_RING(ring, A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) |
			COND(s[FS].v->total_in > 0, A4XX_RB_RENDER_CONTROL2_VARYING) |
			COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
			COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD |
					A4XX_RB_RENDER_CONTROL2_YCOORD |
// TODO enabling gl_FragCoord.z is causing lockups on 0ad (but seems
// to work everywhere else).
//					A4XX_RB_RENDER_CONTROL2_ZCOORD |
					A4XX_RB_RENDER_CONTROL2_WCOORD));

	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(1) |
			COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));

	/* When the FS writes depth, also enable depth output and point the
	 * hardware at the register holding it (posz_regid).
	 */
	OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
	if (s[FS].v->writes_pos) {
		OUT_RING(ring, 0x00000001 |
				A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE |
				A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
	} else {
		OUT_RING(ring, 0x00000001);
	}

	/* Eight MRT registers are written; only MRT 0 receives the color
	 * regid, format and half-precision flag, the remaining seven are
	 * written with REGID(0).
	 */
	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid) |
			A4XX_SP_FS_MRT_REG_MRTFORMAT(emit->format) |
			COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));

	if (emit->key.binning_pass) {
		/* binning pass: VPC_ATTR without any attribute count/enable,
		 * and VPC_PACK written as zero.
		 */
		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
		OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) |
				0x40000000 |      /* XXX */
				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, 0x00000000);
	} else {
		/* vinterp[] is emitted below; since the code that would fill it
		 * is compiled out (#if 0), it is emitted as all zeros.
		 * flatshade[] is only written by that disabled code and is not
		 * emitted anywhere in this function.
		 */
		uint32_t vinterp[8], flatshade[2];

		memset(vinterp, 0, sizeof(vinterp));
		memset(flatshade, 0, sizeof(flatshade));

		/* looks like we need to do int varyings in the frag
		 * shader on a4xx (no flatshad reg?  or a420.0 bug?):
		 *
		 *    (sy)(ss)nop
		 *    (sy)ldlv.u32 r0.x,l[r0.x], 1
		 *    ldlv.u32 r0.y,l[r0.x+1], 1
		 *    (ss)bary.f (ei)r63.x, 0, r0.x
		 *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
		 *    (rpt5)nop
		 *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
		 *
		 * Possibly on later a4xx variants we'll be able to use
		 * something like the code below instead of workaround
		 * in the shader:
		 */
#if 0
		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
			uint32_t interp = s[FS].v->inputs[j].interpolate;
			if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
					((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
				/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
				 * instead.. rather than -8 everywhere else..
				 */
				uint32_t loc = s[FS].v->inputs[j].inloc - 8;

				/* currently assuming varyings aligned to 4 (not
				 * packed):
				 */
				debug_assert((loc % 4) == 0);

				for (i = 0; i < 4; i++, loc++) {
					vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
					flatshade[loc / 32] |= 1 << (loc % 32);
				}
			}
		}
#endif

		/* VPC_ATTR + VPC_PACK: the FS total_in is used for both the FS
		 * and the VS varying counts.
		 */
		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
		OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
				A4XX_VPC_ATTR_THRDASSIGN(1) |
				COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) |
				0x40000000 |      /* XXX */
				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) |
				A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in));

		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8);
		for (i = 0; i < 8; i++)
			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */

		/* point-sprite replace modes are taken from the shader object
		 * (vpsrepl[]) rather than computed here:
		 */
		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
		for (i = 0; i < 8; i++)
			OUT_RING(ring, s[FS].v->shader->vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
	}

	/* Finally emit the shader instructions themselves; the FS is skipped
	 * entirely in the binning pass.
	 */
	if (s[VS].instrlen)
		emit_shader(ring, s[VS].v);

	if (!emit->key.binning_pass)
		if (s[FS].instrlen)
			emit_shader(ring, s[FS].v);
}
Beispiel #3
0
/*
 * Emit the a3xx shader-program register state into `ring`.
 *
 * The vertex variant comes from fd3_emit_get_vp(); the fragment variant
 * comes from fd3_emit_get_fp(), except in the binning pass where a
 * zero-initialized dummy variant is substituted.  The function decides
 * BUFFER vs CACHE instruction mode for each stage, then writes HLSQ/SP
 * control registers, the VS->FS varying linkage, FS output/MRT registers
 * and the VPC varying-interpolation / point-sprite / flat-shade registers,
 * and calls emit_shader() for stages that are in BUFFER mode.
 *
 * ring: ringbuffer the packets are appended to
 * emit: emit state; this function reads emit->key.binning_pass,
 *       emit->rasterflat, emit->sprite_coord_enable and
 *       emit->sprite_coord_mode (plus whatever fd3_emit_get_vp/fp use)
 * nr:   number of entries in bufs (asserted to be <= 4, the size of
 *       color_regid[])
 * bufs: color surfaces; their formats are consulted for the alpha regid
 *       adjustment and the UINT/SINT MRT flags
 */
void
fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
                 int nr, struct pipe_surface **bufs)
{
    const struct ir3_shader_variant *vp, *fp;
    const struct ir3_info *vsi, *fsi;
    enum a3xx_instrbuffermode fpbuffer, vpbuffer;
    uint32_t fpbuffersz, vpbuffersz, fsoff;
    uint32_t pos_regid, posz_regid, psize_regid, color_regid[4] = {0};
    int constmode;
    int i, j, k;

    debug_assert(nr <= ARRAY_SIZE(color_regid));

    vp = fd3_emit_get_vp(emit);

    if (emit->key.binning_pass) {
        /* use dummy stateobj to simplify binning vs non-binning: */
        static const struct ir3_shader_variant binning_fp = {};
        fp = &binning_fp;
    } else {
        fp = fd3_emit_get_fp(emit);
    }

    vsi = &vp->info;
    fsi = &fp->info;

    /* start out with both stages in BUFFER mode, sized by their actual
     * instruction length; adjusted below if the combined size is too big
     */
    fpbuffer = BUFFER;
    vpbuffer = BUFFER;
    fpbuffersz = fp->instrlen;
    vpbuffersz = vp->instrlen;

    /*
     * Decide whether to use BUFFER or CACHE mode for VS and FS.  It
     * appears like 256 is the hard limit, but when the combined size
     * exceeds 128 then blob will try to keep FS in BUFFER mode and
     * switch to CACHE for VS until VS is too large.  The blob seems
     * to switch FS out of BUFFER mode at slightly under 128.  But
     * a bit fuzzy on the decision tree, so use slightly conservative
     * limits.
     *
     * TODO check if these thresholds for BUFFER vs CACHE mode are the
     *      same for all a3xx or whether we need to consider the gpuid
     */

    if ((fpbuffersz + vpbuffersz) > 128) {
        if (fpbuffersz < 112) {
            /* FP:BUFFER   VP:CACHE  */
            vpbuffer = CACHE;
            vpbuffersz = 256 - fpbuffersz;
        } else if (vpbuffersz < 112) {
            /* FP:CACHE    VP:BUFFER */
            fpbuffer = CACHE;
            fpbuffersz = 256 - vpbuffersz;
        } else {
            /* FP:CACHE    VP:CACHE  */
            vpbuffer = fpbuffer = CACHE;
            vpbuffersz = fpbuffersz = 192;
        }
    }

    /* FS shader object offset: chosen so that offset + size equals 128 in
     * BUFFER mode and 256 in CACHE mode.
     * NOTE(review): presumably this places the FS at the end of the
     * instruction store -- confirm against hardware docs.
     */
    if (fpbuffer == BUFFER) {
        fsoff = 128 - fpbuffersz;
    } else {
        fsoff = 256 - fpbuffersz;
    }

    /* seems like vs->constlen + fs->constlen > 256, then CONSTMODE=1 */
    constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0;

    /* Register ids of the outputs the fixed-function hardware consumes:
     * position and point-size from the VS, depth (POSITION) from the FS.
     */
    pos_regid = ir3_find_output_regid(vp,
                                      ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
    posz_regid = ir3_find_output_regid(fp,
                                       ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
    psize_regid = ir3_find_output_regid(vp,
                                        ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
    /* Color outputs: with color0_mrt set, all four MRT slots use the
     * register of COLOR[0]; otherwise each COLOR[idx] output found in the
     * FS fills its own slot (slots without an output stay 0).
     */
    if (fp->color0_mrt) {
        color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
                                              ir3_find_output_regid(fp, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
    } else {
        for (i = 0; i < fp->outputs_count; i++) {
            ir3_semantic sem = fp->outputs[i].semantic;
            unsigned idx = sem2idx(sem);
            if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
                continue;
            debug_assert(idx < ARRAY_SIZE(color_regid));
            color_regid[idx] = fp->outputs[i].regid;
        }
    }

    /* adjust regids for alpha output formats. there is no alpha render
     * format, so it's just treated like red
     */
    for (i = 0; i < nr; i++)
        if (util_format_is_alpha(pipe_surface_format(bufs[i])))
            color_regid[i] += 3;

    /* we could probably divide this up into things that need to be
     * emitted if frag-prog is dirty vs if vert-prog is dirty..
     */

    /* 6 consecutive dwords: HLSQ_CONTROL_0..3, then the VS and FS HLSQ
     * control words (which carry the buffer sizes decided above)
     */
    OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
    OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
             A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
             /* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
              * flush some caches? I think we only need to set those
              * bits if we have updated const or shader..
              */
             A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
             A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
    OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
             A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
             COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD));
    OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
    OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
    OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
             A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
             A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz));
    OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) |
             A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
             A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz));

    OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
    OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) |
             COND(emit->key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) |
             A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
             A3XX_SP_SP_CTRL_REG_L0MODE(0));

    /* note: the length register gets the real instruction count, while
     * CTRL_REG0.LENGTH below gets the (possibly adjusted) buffer size
     */
    OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
    OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen));

    /* SP_VS_CTRL_REG0, SP_VS_CTRL_REG1, SP_VS_PARAM_REG: */
    OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
    OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
             A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) |
             COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
             A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
             A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
             A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
             A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
             A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
             COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
             A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
    OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
             A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
             A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
    OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
             A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
             A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4));

    /* Link VS outputs to FS varying inputs.  Each SP_VS_OUT_REG packs two
     * varyings (slots A and B): j walks the FS inputs via
     * ir3_next_varying(), k is the index of the matching VS output.  At
     * most 8 registers are written; the loop stops once j has run past
     * the last FS input.
     * NOTE(review): ir3_next_varying() presumably returns a value >=
     * inputs_count when no varying remains -- the bounds checks below
     * rely on that.
     */
    for (i = 0, j = -1; (i < 8) && (j < (int)fp->inputs_count); i++) {
        uint32_t reg = 0;

        OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1);

        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count) {
            k = ir3_find_output(vp, fp->inputs[j].semantic);
            reg |= A3XX_SP_VS_OUT_REG_A_REGID(vp->outputs[k].regid);
            reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp->inputs[j].compmask);
        }

        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count) {
            k = ir3_find_output(vp, fp->inputs[j].semantic);
            reg |= A3XX_SP_VS_OUT_REG_B_REGID(vp->outputs[k].regid);
            reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp->inputs[j].compmask);
        }

        OUT_RING(ring, reg);
    }

    /* Second walk over the same FS varyings: each SP_VS_VPC_DST_REG packs
     * the inloc of up to four varyings (OUTLOC0..OUTLOC3); at most 4
     * registers are written.
     */
    for (i = 0, j = -1; (i < 4) && (j < (int)fp->inputs_count); i++) {
        uint32_t reg = 0;

        OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1);

        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count)
            reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp->inputs[j].inloc);
        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count)
            reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp->inputs[j].inloc);
        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count)
            reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp->inputs[j].inloc);
        j = ir3_next_varying(fp, j);
        if (j < fp->inputs_count)
            reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp->inputs[j].inloc);

        OUT_RING(ring, reg);
    }

    OUT_PKT0(ring, REG_A3XX_SP_VS_OBJ_OFFSET_REG, 2);
    OUT_RING(ring, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) |
             A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
    OUT_RELOC(ring, vp->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

    if (emit->key.binning_pass) {
        /* binning pass: FS length and CTRL_REG1 are zero, CTRL_REG0 only
         * carries thread/buffer mode, and no FS bo reloc is emitted (the
         * OBJ_OFFSET packet is one dword instead of two)
         */
        OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
        OUT_RING(ring, 0x00000000);

        OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
        OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
                 A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
        OUT_RING(ring, 0x00000000);

        OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1);
        OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
                 A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
    } else {
        OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
        OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));

        /* SP_FS_CTRL_REG0, SP_FS_CTRL_REG1: */
        OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
        OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
                 A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) |
                 COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
                 A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
                 A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
                 A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
                 A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
                 A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
                 COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
                 A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz));
        OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
                 A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
                 A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) |
                 A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));

        /* FS consts start at offset 128, or after the VS consts if the
         * VS uses more than that; FS instructions start at fsoff
         */
        OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
        OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(
                     MAX2(128, vp->constlen)) |
                 A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff));
        OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
    }

    /* MRT field is (number of render targets - 1), with nr clamped to at
     * least 1
     */
    OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1);
    OUT_RING(ring,
             COND(fp->writes_pos, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
             A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid) |
             A3XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr) - 1));

    /* All four MRT registers are always written; the UINT/SINT flags are
     * only derived from the surface format for slots that have a buffer
     * (i < nr).
     */
    OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4);
    for (i = 0; i < 4; i++) {
        uint32_t mrt_reg = A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
                           COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION);

        if (i < nr) {
            enum pipe_format fmt = pipe_surface_format(bufs[i]);
            mrt_reg |= COND(util_format_is_pure_uint(fmt), A3XX_SP_FS_MRT_REG_UINT) |
                       COND(util_format_is_pure_sint(fmt), A3XX_SP_FS_MRT_REG_SINT);
        }
        OUT_RING(ring, mrt_reg);
    }

    if (emit->key.binning_pass) {
        /* binning pass: no attribute count, and VPC_PACK written as zero */
        OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
        OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
                 A3XX_VPC_ATTR_LMSIZE(1) |
                 COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
        OUT_RING(ring, 0x00000000);
    } else {
        /* Per-component bitfields indexed by varying location:
         *   vinterp   - 2 bits per component (16 components per dword)
         *   vpsrepl   - 2 bits per component (16 components per dword)
         *   flatshade - 1 bit per component (32 components per dword)
         */
        uint32_t vinterp[4], flatshade[2], vpsrepl[4];

        memset(vinterp, 0, sizeof(vinterp));
        memset(flatshade, 0, sizeof(flatshade));
        memset(vpsrepl, 0, sizeof(vpsrepl));

        /* figure out VARYING_INTERP / FLAT_SHAD register values: */
        for (j = -1; (j = ir3_next_varying(fp, j)) < (int)fp->inputs_count; ) {
            uint32_t interp = fp->inputs[j].interpolate;

            /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
             * instead.. rather than -8 everywhere else..
             */
            uint32_t inloc = fp->inputs[j].inloc - 8;

            /* currently assuming varyings aligned to 4 (not
             * packed):
             */
            debug_assert((inloc % 4) == 0);

            /* constant-interpolated inputs, and color inputs when flat
             * shading is enabled, get all four components marked FLAT
             */
            if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
                    ((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
                uint32_t loc = inloc;
                for (i = 0; i < 4; i++, loc++) {
                    vinterp[loc / 16] |= FLAT << ((loc % 16) * 2);
                    flatshade[loc / 32] |= 1 << (loc % 32);
                }
            }

            /* Replace the .xy coordinates with S/T from the point sprite. Set
             * interpolation bits for .zw such that they become .01
             */
            /* NOTE(review): the enable mask is tested against the semantic
             * index only; the semantic name is not checked here.
             */
            if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic))) {
                vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
                                       << ((inloc % 16) * 2);
                vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
                vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
            }
        }

        /* VPC_ATTR + VPC_PACK: the FS total_in is used for both the FS
         * and the VS varying counts
         */
        OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
        OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
                 A3XX_VPC_ATTR_THRDASSIGN(1) |
                 A3XX_VPC_ATTR_LMSIZE(1) |
                 COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
        OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
                 A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));

        OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
        OUT_RING(ring, vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
        OUT_RING(ring, vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
        OUT_RING(ring, vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
        OUT_RING(ring, vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */

        OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
        OUT_RING(ring, vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
        OUT_RING(ring, vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
        OUT_RING(ring, vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
        OUT_RING(ring, vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */

        OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
        OUT_RING(ring, flatshade[0]);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
        OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
    }

    /* Shader instructions are only emitted via emit_shader() for stages in
     * BUFFER mode.
     */
    if (vpbuffer == BUFFER)
        emit_shader(ring, vp);

    /* NOTE(review): the purpose of this zero write to
     * VFD_PERFCOUNTER0_SELECT after each shader upload is not evident
     * from this function -- confirm before changing or removing it.
     */
    OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
    OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */

    if (!emit->key.binning_pass) {
        if (fpbuffer == BUFFER)
            emit_shader(ring, fp);

        OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
        OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
    }
}