Example #1
void
I810SubsequentScreenToScreenCopy(ScrnInfoPtr pScrn, int x1, int y1,
				 int x2, int y2, int w, int h)
{
    I810Ptr pI810 = I810PTR(pScrn);
    int src, dst;
    int w_back = w;

    if (I810_DEBUG & DEBUG_VERBOSE_ACCEL)
	ErrorF( "I810SubsequentScreenToScreenCopy %d,%d - %d,%d %dx%d\n",
		x1,y1,x2,y2,w,h);
    /* 
     * This works around a bug in the i810 drawing engine.
     * This was developed empirically so it may not catch all
     * cases.
     */
#define I810_MWIDTH 8

    if ( !(pI810->BR[13] & BR13_RIGHT_TO_LEFT) && (y2 - y1) < 3 
	 && (y2 - y1) >= 0 && (x2 - x1) <= (w + I810_MWIDTH)
	 && (w > I810_MWIDTH))
	w = I810_MWIDTH;
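    /* Emit the copy in one or more passes: each pass blits w pixels, and
     * w is clamped to at most I810_MWIDTH for the remaining chunks until
     * w_back is exhausted. */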
    do {

	if (pI810->BR[13] & BR13_PITCH_SIGN_BIT) {
	    src = (y1 + h - 1) * pScrn->displayWidth * pI810->cpp;
	    dst = (y2 + h - 1) * pScrn->displayWidth * pI810->cpp;
	} else {
	    src = y1 * pScrn->displayWidth * pI810->cpp;
	    dst = y2 * pScrn->displayWidth * pI810->cpp;
	}

	if (pI810->BR[13] & BR13_RIGHT_TO_LEFT) {
	    src += (x1 + w - 1) * pI810->cpp + pI810->cpp - 1;
	    dst += (x2 + w - 1) * pI810->cpp + pI810->cpp - 1;
	} else {
	    src += x1 * pI810->cpp;
	    dst += x2 * pI810->cpp;
	}


	/* SRC_COPY_BLT, p169 */
	{
	    BEGIN_LP_RING(6);
	    OUT_RING( BR00_BITBLT_CLIENT | BR00_OP_SRC_COPY_BLT | 0x4 );
	    OUT_RING( pI810->BR[13]);

	    OUT_RING( (h << 16) | (w * pI810->cpp));
	    OUT_RING( pI810->bufferOffset + dst);

	    OUT_RING( pI810->BR[13] & 0xFFFF);
	    OUT_RING( pI810->bufferOffset + src);
	    ADVANCE_LP_RING();
	}
	w_back -= w;
	if (w_back <= 0)
	    break;
	x2 += w;
	x1 += w;
	if (w_back > I810_MWIDTH)
	    w = I810_MWIDTH;
	else
	    w = w_back;
    }  while (1);
}
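A quick standalone sketch of the chunk loop above (hypothetical code, not part
of the driver): once the workaround clamps w to I810_MWIDTH, a 20-pixel-wide
copy goes out as three passes of 8, 8 and 4 pixels.

#include <stdio.h>

#define I810_MWIDTH 8

int main(void)
{
    int w_back = 20;          /* total width left to copy */
    int w = I810_MWIDTH;      /* first pass already clamped by the workaround */
    int x = 0;

    do {
        printf("blit %d pixels at x offset %d\n", w, x);
        w_back -= w;
        if (w_back <= 0)
            break;
        x += w;
        w = (w_back > I810_MWIDTH) ? I810_MWIDTH : w_back;
    } while (1);
    return 0;
}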
Example #2
static inline void
out_dstpix(struct fd_ringbuffer *ring, PixmapPtr pix)
{
	struct fd_bo *bo = pix->bo;
	uint32_t w, h, p;

	w = pix->width;
	h = pix->height;

	/* pitch specified in units of 32 bytes, it appears.. not quite sure
	 * max size yet, but I think 11 or 12 bits..
	 */
	p = (pix->pitch / 32) & 0xfff;
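	/* e.g. a 1024-pixel-wide 32bpp pixmap has a 4096-byte pitch, which
	 * encodes here as 4096 / 32 = 128 */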

	/* not quite sure if these first three dwords belong here, but all
	 * blits seem to start with these immediately before the dst surf
	 * parameters, so I'm putting them here for now
	 *
	 * Note that there are some similar dwords preceding src surf state,
	 * although it varies slightly for composite (some extra bits set
	 * for the src surface, and no 0x11000000-like dword for the mask
	 * surface), so this may need some shuffling around when I start
	 * emitting dst/src/mask surf state in the corresponding Prepare
	 * fxns rather than for every blit.
	 */
	OUT_RING (ring, REG(G2D_ALPHABLEND) | 0x0);
	OUT_RING (ring, REG(G2D_BLENDERCFG) | 0x0);
	OUT_RING (ring, REG(G2D_GRADIENT) | 0x030000);
	OUT_RING (ring, REG(GRADW_TEXSIZE) | ((h & 0xfff) << 13) | (w & 0xfff));
	OUT_RING (ring, REG(G2D_CFG0) | p |
			((pix->depth == 8) ? 0xe000 : 0x7000));
	OUT_RING (ring, REGM(G2D_BASE0, 1));
	OUT_RELOC(ring, bo);
	OUT_RING (ring, REGM(GRADW_TEXBASE, 1));
	OUT_RELOC(ring, bo);
	OUT_RING (ring, REGM(GRADW_TEXCFG, 1));
	OUT_RING (ring, 0x40000000 | p |
			((pix->depth == 8) ? 0xe000 : 0x7000));
	OUT_RING (ring, 0xd5000000);
	OUT_RING (ring, REG(G2D_ALPHABLEND) | 0x0);
	OUT_RING (ring, REG(G2D_SCISSORX) | (w & 0xfff) << 12);
	OUT_RING (ring, REG(G2D_SCISSORY) | (h & 0xfff) << 12);
}
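The copy() entry point in Example #3 below shows this helper in context: it is
called first, before the source surface state and the blit coordinates are
emitted.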
Example #3
static void copy(PixmapPtr dest, PixmapPtr src, int srcX, int srcY,
		int dstX, int dstY, int width, int height)
{
	BEGIN_RING(45);
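	/* reserve ring space for the whole blit command stream up front */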
	out_dstpix(ring, dest);
	OUT_RING  (ring, REGM(G2D_FOREGROUND, 2));
	OUT_RING  (ring, 0xff000000);      /* G2D_FOREGROUND */
	OUT_RING  (ring, 0xff000000);      /* G2D_BACKGROUND */
	OUT_RING  (ring, REG(G2D_BLENDERCFG) | 0x0);
	OUT_RING  (ring, 0xd0000000);
	out_srcpix(ring, src);
	OUT_RING  (ring, 0xd5000000);
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, REG(G2D_INPUT) | iena(G2D_INPUT_SCOORD1));
	OUT_RING  (ring, REG(G2D_INPUT) | iena(0));
	OUT_RING  (ring, REG(G2D_INPUT) | idis(G2D_INPUT_COLOR));
	OUT_RING  (ring, REG(G2D_INPUT) | iena(G2D_INPUT_COPYCOORD));
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, REG(G2D_INPUT) | iena(0));
	OUT_RING  (ring, REG(G2D_INPUT) | iena(0));
	OUT_RING  (ring, REG(G2D_INPUT) | iena(0));
	OUT_RING  (ring, REG(G2D_CONFIG) | G2D_CONFIG_SRC1); /* we don't read from dst */
	OUT_RING  (ring, REGM(G2D_XY, 3));
	OUT_RING  (ring, (dstX & 0xffff) << 16 | (dstY & 0xffff));    /* G2D_XY */
	OUT_RING  (ring, (width & 0xfff) << 16 | (height & 0xffff));  /* G2D_WIDTHHEIGHT */
	OUT_RING  (ring, (srcX & 0xffff) << 16 | (srcY & 0xffff));    /* G2D_SXY */
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, 0xd0000000);
	OUT_RING  (ring, 0xd0000000);
	END_RING  ();
}
Example #4
static void
nouveau_accel_init(struct nouveau_drm *drm)
{
	struct nouveau_device *device = nv_device(drm->device);
	struct nouveau_object *object;
	u32 arg0, arg1;
	int ret;

	if (nouveau_noaccel || !nouveau_fifo(device) /*XXX*/)
		return;

	/* initialise synchronisation routines */
	if      (device->card_type < NV_10) ret = nv04_fence_create(drm);
	else if (device->card_type < NV_11 ||
		 device->chipset   <  0x17) ret = nv10_fence_create(drm);
	else if (device->card_type < NV_50) ret = nv17_fence_create(drm);
	else if (device->chipset   <  0x84) ret = nv50_fence_create(drm);
	else if (device->card_type < NV_C0) ret = nv84_fence_create(drm);
	else                                ret = nvc0_fence_create(drm);
	if (ret) {
		NV_ERROR(drm, "failed to initialise sync subsystem, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	if (device->card_type >= NV_E0) {
		ret = nouveau_channel_new(drm, &drm->client, NVDRM_DEVICE,
					  NVDRM_CHAN + 1,
					  NVE0_CHANNEL_IND_ENGINE_CE0 |
					  NVE0_CHANNEL_IND_ENGINE_CE1, 0,
					  &drm->cechan);
		if (ret)
			NV_ERROR(drm, "failed to create ce channel, %d\n", ret);

		arg0 = NVE0_CHANNEL_IND_ENGINE_GR;
		arg1 = 1;
	} else
	if (device->chipset >= 0xa3 &&
	    device->chipset != 0xaa &&
	    device->chipset != 0xac) {
		ret = nouveau_channel_new(drm, &drm->client, NVDRM_DEVICE,
					  NVDRM_CHAN + 1, NvDmaFB, NvDmaTT,
					  &drm->cechan);
		if (ret)
			NV_ERROR(drm, "failed to create ce channel, %d\n", ret);

		arg0 = NvDmaFB;
		arg1 = NvDmaTT;
	} else {
		arg0 = NvDmaFB;
		arg1 = NvDmaTT;
	}

	ret = nouveau_channel_new(drm, &drm->client, NVDRM_DEVICE, NVDRM_CHAN,
				  arg0, arg1, &drm->channel);
	if (ret) {
		NV_ERROR(drm, "failed to create kernel channel, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	ret = nouveau_object_new(nv_object(drm), NVDRM_CHAN, NVDRM_NVSW,
				 nouveau_abi16_swclass(drm), NULL, 0, &object);
	if (ret == 0) {
		struct nouveau_software_chan *swch = (void *)object->parent;
		ret = RING_SPACE(drm->channel, 2);
		if (ret == 0) {
			if (device->card_type < NV_C0) {
				BEGIN_NV04(drm->channel, NvSubSw, 0, 1);
				OUT_RING  (drm->channel, NVDRM_NVSW);
			} else
			if (device->card_type < NV_E0) {
				BEGIN_NVC0(drm->channel, FermiSw, 0, 1);
				OUT_RING  (drm->channel, 0x001f0000);
			}
		}
		swch = (void *)object->parent;
		swch->flip = nouveau_flip_complete;
		swch->flip_data = drm->channel;
	}

	if (ret) {
		NV_ERROR(drm, "failed to allocate software object, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	if (device->card_type < NV_C0) {
		ret = nouveau_gpuobj_new(drm->device, NULL, 32, 0, 0,
					&drm->notify);
		if (ret) {
			NV_ERROR(drm, "failed to allocate notifier, %d\n", ret);
			nouveau_accel_fini(drm);
			return;
		}

		ret = nouveau_object_new(nv_object(drm),
					 drm->channel->handle, NvNotify0,
					 0x003d, &(struct nv_dma_class) {
						.flags = NV_DMA_TARGET_VRAM |
							 NV_DMA_ACCESS_RDWR,
						.start = drm->notify->addr,
						.limit = drm->notify->addr + 31
						}, sizeof(struct nv_dma_class),
Example #5
void fd_program_emit_state(struct fd_program *program, uint32_t first,
		struct fd_parameters *uniforms, struct fd_parameters *attr,
		struct fd_ringbuffer *ring)
{
	struct fd_shader *vs = get_shader(program, FD_SHADER_VERTEX);
	struct fd_shader *fs = get_shader(program, FD_SHADER_FRAGMENT);
	struct ir3_shader_info *vsi = &vs->info;
	struct ir3_shader_info *fsi = &fs->info;
	uint32_t vsconstlen = constlen(vs);
	uint32_t fsconstlen = constlen(fs);
	uint32_t i, outloc;

	uint32_t posregid   = getpos(vs, "gl_Position", 0);
	uint32_t psizeregid = getpos(vs, "gl_PointSize", (63 << 2));
	uint32_t colorregid = getpos(fs, "gl_FragColor", 0);

	uint32_t numvar = totalvar(fs);

	assert (vs->ir->varyings_count == fs->ir->varyings_count);

	OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
			A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
			A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
			A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
	OUT_RING(ring, 0x00000000);        /* HLSQ_CONTROL_3_REG */
	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vsconstlen) |
			A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(instrlen(vs)));
	OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fsconstlen) |
			A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(instrlen(fs)));

	OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
			A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
			// XXX "resolve" (?) bit set on gmem->mem pass..
			COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) |
			// XXX sometimes 0, sometimes 1:
			A3XX_SP_SP_CTRL_REG_LOMODE(1));

	/* emit unknown sequence of writes to 0x0ec4/0x0ec8 that the blob
	 * emits as part of the program state (it seems)..
	 */
	for (i = 0; i < 6; i++) {
		OUT_PKT0(ring, REG_A3XX_SP_PERFCOUNTER0_SELECT, 1);
		OUT_RING(ring, 0x00000000);    /* SP_PERFCOUNTER0_SELECT */

		OUT_PKT0(ring, REG_A3XX_SP_PERFCOUNTER3_SELECT, 1);
		OUT_RING(ring, 0x00000000);    /* SP_PERFCOUNTER3_SELECT */
	}

	OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
	OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(instrlen(vs)));

	OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
	OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
			A3XX_SP_VS_CTRL_REG0_LENGTH(instrlen(vs)));

	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vsconstlen) |
			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(totalattr(vs)) |
			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(max(vsi->max_const, 0)));
	OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(posregid) |
			A3XX_SP_VS_PARAM_REG_PSIZEREGID(psizeregid) |
			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fs->ir->varyings_count));
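	/* each SP_VS_OUT_REG packs two varyings (A and B halves), so the
	 * loop below consumes the varyings array in pairs */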

	for (i = 0; i < vs->ir->varyings_count; ) {
		struct ir3_varying *v;
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i/2), 1);

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_OUT_REG_A_REGID(v->rstart->num);
			reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(regmask(v->num));
		}

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_OUT_REG_B_REGID(v->rstart->num);
			reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(regmask(v->num));
		}

		OUT_RING(ring, reg);
	}

	outloc = 8;    /* I assume 0 and 4 are gl_Position/gl_PointSize? */
	for (i = 0; i < vs->ir->varyings_count; ) {
		struct ir3_varying *v;
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i/4), 1);

		/* note: if we supported anything other than vec4 varyings, we'd
		 * actually be incrementing outloc by the actual varying size in
		 * units of scalar registers (ie. vec3 -> 3)
		 */

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(outloc);
			outloc += v->num;
		}

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(outloc);
			outloc += v->num;
		}

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(outloc);
			outloc += v->num;
		}

		v = vs->ir->varyings[i++];
		if (v) {
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(outloc);
			outloc += v->num;
		}

		OUT_RING(ring, reg);
	}

	// TODO SP_VS_OBJ_OFFSET_REG / SP_VS_OBJ_START_REG

	OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
	OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(instrlen(fs)));

	OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
	OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
			A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
			A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
			A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
			A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
			A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
			A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
			COND(fs->ir->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
			A3XX_SP_FS_CTRL_REG0_LENGTH(instrlen(fs)));
	OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fsconstlen) |
			A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(0) |
			A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(max(fsi->max_const, 0)) |
			A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));

	// TODO SP_FS_OBJ_OFFSET_REG / SP_FS_OBJ_START_REG

	OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
	OUT_RING(ring, 0x00000000);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
	OUT_RING(ring, 0x00000000);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */

	OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1);
	OUT_RING(ring, 0x00000000);        /* SP_FS_OUTPUT_REG */

	OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4);
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(colorregid) |  /* SP_FS_MRT[0].REG */
			A3XX_SP_FS_MRT_REG_HALF_PRECISION);
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));           /* SP_FS_MRT[1].REG */
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));           /* SP_FS_MRT[2].REG */
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));           /* SP_FS_MRT[3].REG */

	OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
	OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(numvar) |
			A3XX_VPC_ATTR_THRDASSIGN(1) |
			A3XX_VPC_ATTR_LMSIZE(1));
	OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(numvar) |
			A3XX_VPC_PACK_NUMNONPOSVSVAR(numvar));

	OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_INTERP[0].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_INTERP[1].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_INTERP[2].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_INTERP[3].MODE */

	OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_PS_REPL[0].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_PS_REPL[1].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_PS_REPL[2].MODE */
	OUT_RING(ring, 0x00000000);        /* VPC_VARYING_PS_REPL[3].MODE */

	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));

	emit_shader(ring, vs, SB_VERT_SHADER);

	OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
	OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */

	emit_shader(ring, fs, SB_FRAG_SHADER);

	OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
	OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */

	OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
	OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(totalattr(vs)) |
			A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
			A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vs->ir->attributes_count) |
			A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vs->ir->attributes_count));
	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
			A3XX_VFD_CONTROL_1_REGID4VTX(63 << 2) |
			A3XX_VFD_CONTROL_1_REGID4INST(63 << 2));

	emit_vtx_fetch(ring, vs, attr, first);

	/* we have this sometimes, not others.. perhaps we could be clever
	 * and figure out actually when we need to invalidate cache:
	 */
	OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
			A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
			A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);

	/* for RB_RESOLVE_PASS, I think the consts are not needed: */
	if (uniforms) {
		emit_uniconst(ring, vs, uniforms, SB_VERT_SHADER);
		emit_uniconst(ring, fs, uniforms, SB_FRAG_SHADER);
	}
}
Example #6
static boolean
nv30_vertprog_validate(struct nv30_context *nv30)
{ 
	struct pipe_screen *pscreen = nv30->pipe.screen;
	struct nouveau_grobj *rankine = nv30->screen->rankine;
	struct nv30_vertex_program *vp;
	struct pipe_buffer *constbuf;
	boolean upload_code = FALSE, upload_data = FALSE;
	int i;

	vp = nv30->vertprog;
	constbuf = nv30->constbuf[PIPE_SHADER_VERTEX];

	/* Translate TGSI shader into hw bytecode */
	if (!vp->translated) {
		nv30_vertprog_translate(nv30, vp);
		if (!vp->translated)
			return FALSE;
	}

	/* Allocate hw vtxprog exec slots */
	if (!vp->exec) {
		struct nouveau_resource *heap = nv30->screen->vp_exec_heap;
		struct nouveau_stateobj *so;
		uint vplen = vp->nr_insns;

		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
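			/* exec heap is full: evict other vertex programs'
			 * slots until the allocation fits, then retry below */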
			while (heap->next && heap->size < vplen) {
				struct nv30_vertex_program *evict;
				
				evict = heap->next->priv;
				nouveau_resource_free(&evict->exec);
			}

			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
				assert(0);
		}

		so = so_new(2, 0);
		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
		so_data  (so, vp->exec->start);
		so_ref(so, &vp->so);
		so_ref(NULL, &so);

		upload_code = TRUE;
	}

	/* Allocate hw vtxprog const slots */
	if (vp->nr_consts && !vp->data) {
		struct nouveau_resource *heap = nv30->screen->vp_data_heap;

		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
			while (heap->next && heap->size < vp->nr_consts) {
				struct nv30_vertex_program *evict;
				
				evict = heap->next->priv;
				nouveau_resource_free(&evict->data);
			}

			if (nouveau_resource_alloc(heap, vp->nr_consts, vp,
						   &vp->data))
				assert(0);
		}

		/*XXX: handle this some day */
		assert(vp->data->start >= vp->data_start_min);

		upload_data = TRUE;
		if (vp->data_start != vp->data->start)
			upload_code = TRUE;
	}

	/* If exec or data segments moved we need to patch the program to
	 * fixup offsets and register IDs.
	 */
	if (vp->exec_start != vp->exec->start) {
		for (i = 0; i < vp->nr_insns; i++) {
			struct nv30_vertex_program_exec *vpi = &vp->insns[i];

			if (vpi->has_branch_offset) {
				assert(0);
			}
		}

		vp->exec_start = vp->exec->start;
	}

	if (vp->nr_consts && vp->data_start != vp->data->start) {
		for (i = 0; i < vp->nr_insns; i++) {
			struct nv30_vertex_program_exec *vpi = &vp->insns[i];

			if (vpi->const_index >= 0) {
				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK;
				vpi->data[1] |=
					(vpi->const_index + vp->data->start) <<
					NV30_VP_INST_CONST_SRC_SHIFT;

			}
		}

		vp->data_start = vp->data->start;
	}

	/* Update + Upload constant values */
	if (vp->nr_consts) {
		float *map = NULL;

		if (constbuf) {
			map = pipe_buffer_map(pscreen, constbuf,
					      PIPE_BUFFER_USAGE_CPU_READ);
		}

		for (i = 0; i < vp->nr_consts; i++) {
			struct nv30_vertex_program_data *vpd = &vp->consts[i];

			if (vpd->index >= 0) {
				if (!upload_data &&
				    !memcmp(vpd->value, &map[vpd->index * 4],
					    4 * sizeof(float)))
					continue;
				memcpy(vpd->value, &map[vpd->index * 4],
				       4 * sizeof(float));
			}

			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
			OUT_RING  (i + vp->data->start);
			OUT_RINGp ((uint32_t *)vpd->value, 4);
		}

		if (constbuf)
			pipe_buffer_unmap(pscreen, constbuf);
	}

	/* Upload vtxprog */
	if (upload_code) {
#if 0
		for (i = 0; i < vp->nr_insns; i++) {
			NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n",
				i, vp->insns[i].data[0], vp->insns[i].data[1],
				vp->insns[i].data[2], vp->insns[i].data[3]);
		}
#endif
		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
		OUT_RING  (vp->exec->start);
		for (i = 0; i < vp->nr_insns; i++) {
			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
			OUT_RINGp (vp->insns[i].data, 4);
		}
	}

	if (vp->so != nv30->state.hw[NV30_STATE_VERTPROG]) {
		so_ref(vp->so, &nv30->state.hw[NV30_STATE_VERTPROG]);
		return TRUE;
	}

	return FALSE;
}
Example #7
void
nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
	struct nvfx_context *nvfx = nvfx_context(pipe);
	struct nouveau_channel *chan = nvfx->screen->base.channel;
	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
	struct push_context ctx;
	struct util_split_prim s;
	unsigned instances_left = info->instance_count;
	int vtx_value;
	unsigned hw_mode = nvgl_primitive(info->mode);
	int i;
	struct
	{
		uint8_t* map;
		unsigned step;
	} per_instance[16];
	unsigned p_overhead = 64 /* magic fix */
			+ 4 /* begin/end */
			+ 4; /* potential edgeflag enable/disable */

	ctx.chan = nvfx->screen->base.channel;
	ctx.eng3d = nvfx->screen->eng3d;
	ctx.translate = nvfx->vtxelt->translate;
	ctx.idxbuf = NULL;
	ctx.vertex_length = nvfx->vtxelt->vertex_length;
	ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet;
	ctx.edgeflag = 0.5f;
	// TODO: figure out if we really want to handle this, and do so in that case
	ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in;

	if(!nvfx->use_vertex_buffers)
	{
		for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
		{
			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
			uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset;
			if(info->indexed)
				data += info->index_bias * vb->stride;
			ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
		}

		if(ctx.edgeflag_attr < 16)
			vtx_value = -(ctx.vertex_length + 3);  /* vertex data and edgeflag header and value */
		else
		{
			p_overhead += 1; /* initial vertex_data header */
			vtx_value = -ctx.vertex_length;  /* vertex data only; single header counted above */
		}

		if (info->indexed) {
			// XXX: this case is broken and probably needs a new VTX_ATTR push path
			if (nvfx->idxbuf.index_size == 1)
				s.emit = emit_vertices_lookup8;
			else if (nvfx->idxbuf.index_size == 2)
				s.emit = emit_vertices_lookup16;
			else
				s.emit = emit_vertices_lookup32;
		} else
			s.emit = emit_vertices;
	}
	else
	{
		if(!info->indexed || nvfx->use_index_buffer)
		{
			s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges;
			p_overhead += 3;
			vtx_value = 0;
		}
		else if (nvfx->idxbuf.index_size == 4)
		{
			s.emit = emit_elt32;
			p_overhead += 1;
			vtx_value = 8;
		}
		else
		{
			s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8;
			p_overhead += 3;
			vtx_value = 7;
		}
	}

	ctx.idxbias = info->index_bias;
	if(nvfx->use_vertex_buffers)
		ctx.idxbias -= nvfx->base_vertex;

	/* map index buffer, if present */
	if (info->indexed && !nvfx->use_index_buffer)
		ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset;

	s.priv = &ctx;
	s.edge = emit_edgeflag;

	for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
	{
		struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
		float v[4];
		per_instance[i].step = info->start_instance % ve->instance_divisor;
		per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset;

		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);

		nvfx_emit_vtx_attr(chan, eng3d,
				   nvfx->vtxelt->per_instance[i].base.idx, v,
				   nvfx->vtxelt->per_instance[i].base.ncomp);
	}

	/* per-instance loop */
	while (instances_left--) {
		int max_verts;
		boolean done;

		util_split_prim_init(&s, info->mode, info->start, info->count);
		nvfx_state_emit(nvfx);
		for(;;) {
			max_verts  = AVAIL_RING(chan);
			max_verts -= p_overhead;

			/* if vtx_value < 0, each vertex is -vtx_value words long
			 * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation)
			 */
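			/* e.g. vtx_value == -8 with 4096 words available gives
			 * 4096 / 8 = 512 vertices, less 512 >> 10 == 0 headers */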
			if(vtx_value < 0)
			{
				max_verts /= -vtx_value;
				max_verts -= (max_verts >> 10); /* vertex data headers */
			}
			else
			{
				if(max_verts >= (1 << 23)) /* avoid overflow here */
					max_verts = (1 << 23);
				max_verts = (max_verts * 255) >> vtx_value;
			}

			//printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts);

			if(max_verts >= 16)
			{
				/* XXX: emitting any command many times seems to (mostly) fix corruption that would otherwise happen */
				/* this seems to cause issues on nv3x, and also to be unneeded there */
				if(nvfx->is_nv4x)
				{
					int i;
					for(i = 0; i < 32; ++i)
					{
						BEGIN_RING(chan, eng3d,
							   0x1dac, 1);
						OUT_RING(chan, 0);
					}
				}

				BEGIN_RING(chan, eng3d,
					   NV30_3D_VERTEX_BEGIN_END, 1);
				OUT_RING(chan, hw_mode);
				done = util_split_prim_next(&s, max_verts);
				BEGIN_RING(chan, eng3d,
					   NV30_3D_VERTEX_BEGIN_END, 1);
				OUT_RING(chan, 0);

				if(done)
					break;
			}

			FIRE_RING(chan);
			nvfx_state_emit(nvfx);
		}
Example #8
void
NV30EXAComposite(PixmapPtr pdPix, int srcX, int srcY,
				  int maskX, int maskY,
				  int dstX, int dstY,
				  int width, int height)
{
	ScrnInfoPtr pScrn = xf86Screens[pdPix->drawable.pScreen->myNum];
	NVPtr pNv = NVPTR(pScrn);
	struct nouveau_channel *chan = pNv->chan;
	struct nouveau_grobj *rankine = pNv->Nv3D;
	float sX0, sX1, sX2, sY0, sY1, sY2;
	float mX0, mX1, mX2, mY0, mY1, mY2;
	NV30EXA_STATE;

	WAIT_RING(chan, 64);

	/* We draw a single triangle and scissor it down to the quad: the
	 * scissor matters because we only want part of the image, not all
	 * of it, and the cliprects are already handled for us. */
	BEGIN_RING(chan, rankine, NV34TCL_SCISSOR_HORIZ, 2);
	OUT_RING  (chan, (width << 16) | dstX);
	OUT_RING  (chan, (height << 16) | dstY);
	BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
	OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_TRIANGLES);

#if 0
	ErrorF("Composite [%dx%d] (%d,%d)IN(%d,%d)OP(%d,%d)\n",width,height,srcX,srcY,maskX,maskY,dstX,dstY);
#endif
	NV30EXATransformCoord(state->unit[0].transform, 
				srcX, srcY - height,
				state->unit[0].width,
				state->unit[0].height, &sX0, &sY0);
	NV30EXATransformCoord(state->unit[0].transform,
				srcX, srcY + height,
				state->unit[0].width,
				state->unit[0].height, &sX1, &sY1);
	NV30EXATransformCoord(state->unit[0].transform,
				srcX + 2*width, srcY + height,
				state->unit[0].width,
				state->unit[0].height, &sX2, &sY2);

	if (state->have_mask) {
		NV30EXATransformCoord(state->unit[1].transform, 
					maskX, maskY - height,
					state->unit[1].width,
					state->unit[1].height, &mX0, &mY0);
		NV30EXATransformCoord(state->unit[1].transform,
					maskX, maskY + height,
					state->unit[1].width,
					state->unit[1].height, &mX1, &mY1);
		NV30EXATransformCoord(state->unit[1].transform,
					maskX + 2*width, maskY + height,
					state->unit[1].width,
					state->unit[1].height, &mX2, &mY2);

		CV_OUTm(sX0, sY0, mX0, mY0, dstX,             dstY - height);
		CV_OUTm(sX1, sY1, mX1, mY1, dstX,             dstY + height);
		CV_OUTm(sX2, sY2, mX2, mY2, dstX + 2*width,   dstY + height);
	} else {
		CV_OUT(sX0, sY0, dstX,             dstY - height);
		CV_OUT(sX1, sY1, dstX,             dstY + height);
		CV_OUT(sX2, sY2, dstX + 2*width,   dstY + height);
	}

	BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
	OUT_RING  (chan, 0);
}
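Why the oversized triangle above covers the whole quad: its vertices span twice
the rectangle in each direction, and the scissor clips the result back to the
width x height region. A standalone sanity check of that geometry (hypothetical
code, not driver code; the vertex positions are the ones emitted by the CV_OUT
calls):

#include <stdio.h>

/* sign of the cross product (b - a) x (p - a) */
static int side(float ax, float ay, float bx, float by, float px, float py)
{
	float c = (bx - ax) * (py - ay) - (by - ay) * (px - ax);
	return (c > 0) - (c < 0);
}

/* point-in-triangle test, counting the boundary as inside */
static int in_tri(float x0, float y0, float x1, float y1,
		  float x2, float y2, float px, float py)
{
	int s0 = side(x0, y0, x1, y1, px, py);
	int s1 = side(x1, y1, x2, y2, px, py);
	int s2 = side(x2, y2, x0, y0, px, py);
	return (s0 >= 0 && s1 >= 0 && s2 >= 0) ||
	       (s0 <= 0 && s1 <= 0 && s2 <= 0);
}

int main(void)
{
	float X = 10, Y = 20, w = 100, h = 50;
	/* triangle vertices as emitted in NV30EXAComposite */
	float x0 = X, y0 = Y - h;
	float x1 = X, y1 = Y + h;
	float x2 = X + 2 * w, y2 = Y + h;

	/* all four corners of the scissored quad report 1 (covered) */
	printf("%d %d %d %d\n",
	       in_tri(x0, y0, x1, y1, x2, y2, X,     Y),
	       in_tri(x0, y0, x1, y1, x2, y2, X + w, Y),
	       in_tri(x0, y0, x1, y1, x2, y2, X,     Y + h),
	       in_tri(x0, y0, x1, y1, x2, y2, X + w, Y + h));
	return 0;
}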
Example #9
void
nvc0_tfb_validate(struct nvc0_context *nvc0)
{
   struct nouveau_channel *chan = nvc0->screen->base.channel;
   struct nvc0_transform_feedback_state *tfb;
   unsigned b, n, i;

   if (nvc0->gmtyprog) tfb = nvc0->gmtyprog->tfb;
   else
   if (nvc0->tevlprog) tfb = nvc0->tevlprog->tfb;
   else
      tfb = nvc0->vertprog->tfb;

   IMMED_RING(chan, RING_3D(TFB_ENABLE), (tfb && nvc0->num_tfbbufs) ? 1 : 0);

   if (tfb && tfb != nvc0->state.tfb) {
      uint8_t var[128];
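      /* varying indices are packed four per 32-bit TFB_VARYING_LOCS word;
       * the padding loop below zeroes the unused bytes of the last word */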

      for (n = 0, b = 0; b < 4; n += tfb->varying_count[b++]) {
         if (tfb->varying_count[b]) {
            BEGIN_RING(chan, RING_3D(TFB_STREAM(b)), 3);
            OUT_RING  (chan, 0);
            OUT_RING  (chan, tfb->varying_count[b]);
            OUT_RING  (chan, tfb->stride[b]);

            for (i = 0; i < tfb->varying_count[b]; ++i)
               var[i] = tfb->varying_index[n + i];
            for (; i & 3; ++i)
               var[i] = 0; /* zero rest of method word bits */

            BEGIN_RING(chan, RING_3D(TFB_VARYING_LOCS(b, 0)), i / 4);
            OUT_RINGp (chan, var, i / 4);

            if (nvc0->tfbbuf[b])
               nvc0_so_target(nvc0->tfbbuf[b])->stride = tfb->stride[b];
         } else {
            IMMED_RING(chan, RING_3D(TFB_VARYING_COUNT(b)), 0);
         }
      }
   }
   nvc0->state.tfb = tfb;

   if (!(nvc0->dirty & NVC0_NEW_TFB_TARGETS))
      return;
   nvc0_bufctx_reset(nvc0, NVC0_BUFCTX_TFB);

   for (b = 0; b < nvc0->num_tfbbufs; ++b) {
      struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]);
      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);

      if (tfb)
         targ->stride = tfb->stride[b];

      if (!(nvc0->tfbbuf_dirty & (1 << b)))
         continue;

      if (!targ->clean)
         nvc0_query_fifo_wait(chan, targ->pq);
      BEGIN_RING(chan, RING_3D(TFB_BUFFER_ENABLE(b)), 5);
      OUT_RING  (chan, 1);
      OUT_RESRCh(chan, buf, targ->pipe.buffer_offset, NOUVEAU_BO_WR);
      OUT_RESRCl(chan, buf, targ->pipe.buffer_offset, NOUVEAU_BO_WR);
      OUT_RING  (chan, targ->pipe.buffer_size);
      if (!targ->clean) {
         nvc0_query_pushbuf_submit(chan, targ->pq, 0x4);
      } else {
         OUT_RING(chan, 0); /* TFB_BUFFER_OFFSET */
         targ->clean = FALSE;
      }
      nvc0_bufctx_add_resident(nvc0, NVC0_BUFCTX_TFB, buf, NOUVEAU_BO_WR);
   }
   for (; b < 4; ++b)
      IMMED_RING(chan, RING_3D(TFB_BUFFER_ENABLE(b)), 0);
}
Example #10
void
nvc0_m2mf_transfer_rect(struct pipe_screen *pscreen,
                        const struct nv50_m2mf_rect *dst,
                        const struct nv50_m2mf_rect *src,
                        uint32_t nblocksx, uint32_t nblocksy)
{
   struct nouveau_channel *chan = nouveau_screen(pscreen)->channel;
   const int cpp = dst->cpp;
   uint32_t src_ofst = src->base;
   uint32_t dst_ofst = dst->base;
   uint32_t height = nblocksy;
   uint32_t sy = src->y;
   uint32_t dy = dst->y;
   uint32_t exec = (1 << 20);

   assert(dst->cpp == src->cpp);

   if (nouveau_bo_tile_layout(src->bo)) {
      BEGIN_RING(chan, RING_MF(TILING_MODE_IN), 5);
      OUT_RING  (chan, src->tile_mode);
      OUT_RING  (chan, src->width * cpp);
      OUT_RING  (chan, src->height);
      OUT_RING  (chan, src->depth);
      OUT_RING  (chan, src->z);
   } else {
      src_ofst += src->y * src->pitch + src->x * cpp;

      BEGIN_RING(chan, RING_MF(PITCH_IN), 1);
      OUT_RING  (chan, src->width * cpp);

      exec |= NVC0_M2MF_EXEC_LINEAR_IN;
   }

   if (nouveau_bo_tile_layout(dst->bo)) {
      BEGIN_RING(chan, RING_MF(TILING_MODE_OUT), 5);
      OUT_RING  (chan, dst->tile_mode);
      OUT_RING  (chan, dst->width * cpp);
      OUT_RING  (chan, dst->height);
      OUT_RING  (chan, dst->depth);
      OUT_RING  (chan, dst->z);
   } else {
      dst_ofst += dst->y * dst->pitch + dst->x * cpp;

      BEGIN_RING(chan, RING_MF(PITCH_OUT), 1);
      OUT_RING  (chan, dst->width * cpp);

      exec |= NVC0_M2MF_EXEC_LINEAR_OUT;
   }

   while (height) {
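      /* the engine takes at most 2047 lines per EXEC, so tall copies are
       * split into multiple passes */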
      int line_count = height > 2047 ? 2047 : height;

      MARK_RING (chan, 17, 4);

      BEGIN_RING(chan, RING_MF(OFFSET_IN_HIGH), 2);
      OUT_RELOCh(chan, src->bo, src_ofst, src->domain | NOUVEAU_BO_RD);
      OUT_RELOCl(chan, src->bo, src_ofst, src->domain | NOUVEAU_BO_RD);

      BEGIN_RING(chan, RING_MF(OFFSET_OUT_HIGH), 2);
      OUT_RELOCh(chan, dst->bo, dst_ofst, dst->domain | NOUVEAU_BO_WR);
      OUT_RELOCl(chan, dst->bo, dst_ofst, dst->domain | NOUVEAU_BO_WR);

      if (!(exec & NVC0_M2MF_EXEC_LINEAR_IN)) {
         BEGIN_RING(chan, RING_MF(TILING_POSITION_IN_X), 2);
         OUT_RING  (chan, src->x * cpp);
         OUT_RING  (chan, sy);
      } else {
         src_ofst += line_count * src->pitch;
      }
      if (!(exec & NVC0_M2MF_EXEC_LINEAR_OUT)) {
         BEGIN_RING(chan, RING_MF(TILING_POSITION_OUT_X), 2);
         OUT_RING  (chan, dst->x * cpp);
         OUT_RING  (chan, dy);
      } else {
         dst_ofst += line_count * dst->pitch;
      }

      BEGIN_RING(chan, RING_MF(LINE_LENGTH_IN), 2);
      OUT_RING  (chan, nblocksx * cpp);
      OUT_RING  (chan, line_count);
      BEGIN_RING(chan, RING_MF(EXEC), 1);
      OUT_RING  (chan, exec);

      height -= line_count;
      sy += line_count;
      dy += line_count;
   }
}
Example #11
Bool
NV30EXAPrepareComposite(int op, PicturePtr psPict,
		PicturePtr pmPict,
		PicturePtr pdPict,
		PixmapPtr  psPix,
		PixmapPtr  pmPix,
		PixmapPtr  pdPix)
{
	ScrnInfoPtr pScrn = xf86Screens[psPix->drawable.pScreen->myNum];
	NVPtr pNv = NVPTR(pScrn);
	struct nouveau_channel *chan = pNv->chan;
	struct nouveau_grobj *rankine = pNv->Nv3D;
	nv_pict_op_t *blend;
	int fpid = NV30EXA_FPID_PASS_COL0;
	NV30EXA_STATE;

	if (MARK_RING(chan, 128, 1 + 1 + 4))
		return FALSE;

	blend = NV30_GetPictOpRec(op);

	NV30_SetupBlend(pScrn, blend, pdPict->format,
			(pmPict && pmPict->componentAlpha &&
			 PICT_FORMAT_RGB(pmPict->format)));

	if (!NV30_SetupSurface(pScrn, pdPix, pdPict) ||
	    !NV30EXATexture(pScrn, psPix, psPict, 0)) {
		MARK_UNDO(chan);
		return FALSE;
	}

#if 0
#define printformat(f) ErrorF("(%xh %s %dbpp A%dR%dG%dB%d)",f,((f>>16)&0xf)==2?"ARGB":"ABGR",(f>>24),(f&0xf000)>>12,(f&0xf00)>>8,(f&0xf0)>>4,f&0xf)
	ErrorF("Preparecomposite src(%dx%d)",psPict->pDrawable->width,psPict->pDrawable->height);
	printformat((psPict->format));
	ErrorF(" dst(%dx%d)",pdPict->pDrawable->width,pdPict->pDrawable->height);
	printformat((pdPict->format));
	if (pmPict)
	{
		ErrorF(" mask(%dx%d)",pmPict->pDrawable->width,pmPict->pDrawable->height);
		printformat((pmPict->format));
	}
	ErrorF("\n");
#endif

	if (pmPict) {
		if (!NV30EXATexture(pScrn, pmPix, pmPict, 1)) {
			MARK_UNDO(chan);
			return FALSE;
		}

		if (pmPict->componentAlpha && PICT_FORMAT_RGB(pmPict->format)) {
			if (blend->src_alpha)
				fpid = NV30EXA_FPID_COMPOSITE_MASK_SA_CA;
			else
				fpid = NV30EXA_FPID_COMPOSITE_MASK_CA;
		} else {
			fpid = NV30EXA_FPID_COMPOSITE_MASK;
		}

		state->have_mask = TRUE;
	} else {
		fpid = NV30EXA_FPID_PASS_TEX0;

		state->have_mask = FALSE;
	}

	if (!NV30_LoadFragProg(pScrn, (pdPict->format == PICT_a8) ?
			       nv40_fp_map_a8[fpid] : nv40_fp_map[fpid])) {
		MARK_UNDO(chan);
		return FALSE;
	}

	BEGIN_RING(chan, rankine, 0x23c, 1);
	OUT_RING  (chan, pmPict?3:1);

	pNv->alu = op;
	pNv->pspict = psPict;
	pNv->pmpict = pmPict;
	pNv->pdpict = pdPict;
	pNv->pspix = psPix;
	pNv->pmpix = pmPix;
	pNv->pdpix = pdPix;
	chan->flush_notify = NV30EXAStateCompositeReemit;
	return TRUE;
}
Example #12
void
fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit,
		int nr, struct pipe_surface **bufs)
{
	struct stage s[MAX_STAGES];
	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
	uint32_t face_regid, coord_regid, zwcoord_regid;
	uint32_t vcoord_regid, vertex_regid, instance_regid;
	int i, j;

	debug_assert(nr <= ARRAY_SIZE(color_regid));

	if (emit->key.binning_pass)
		nr = 0;

	setup_stages(emit, s);

	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
	posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
	vertex_regid = ir3_find_output_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
	instance_regid = ir3_find_output_regid(s[VS].v, SYSTEM_VALUE_INSTANCE_ID);

	if (s[FS].v->color0_mrt) {
		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
			ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
	} else {
		color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
		color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
		color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
		color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
		color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
		color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
		color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
		color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
	}

	/* TODO get these dynamically: */
	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);
	vcoord_regid = (s[FS].v->total_in > 0) ? regid(0,0) : regid(63,0);
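	/* regid(63,0) serves as the "not present" sentinel: it is assigned
	 * when a feature is absent and tested against further below */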

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONTROL_REG, 5);
	OUT_RING(ring, A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff) |
			COND(s[VS].v, A5XX_HLSQ_VS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff) |
			COND(s[FS].v, A5XX_HLSQ_FS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff) |
			COND(s[HS].v, A5XX_HLSQ_HS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff) |
			COND(s[DS].v, A5XX_HLSQ_DS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff) |
			COND(s[GS].v, A5XX_HLSQ_GS_CONTROL_REG_ENABLED));

	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1);
	OUT_RING(ring, 0x00000000);

	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CNTL, 5);
	OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen));
	OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen));
	OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen));
	OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen));
	OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen));

	OUT_PKT4(ring, REG_A5XX_SP_VS_CONTROL_REG, 5);
	OUT_RING(ring, A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff) |
			COND(s[VS].v, A5XX_SP_VS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff) |
			COND(s[FS].v, A5XX_SP_FS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff) |
			COND(s[HS].v, A5XX_SP_HS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff) |
			COND(s[DS].v, A5XX_SP_DS_CONTROL_REG_ENABLED));
	OUT_RING(ring, A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff) |
			COND(s[GS].v, A5XX_SP_GS_CONTROL_REG_ENABLED));

	OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1);
	OUT_RING(ring, 0x00000000);

	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONSTLEN, 2);
	OUT_RING(ring, s[VS].constlen);    /* HLSQ_VS_CONSTLEN */
	OUT_RING(ring, s[VS].instrlen);    /* HLSQ_VS_INSTRLEN */

	OUT_PKT4(ring, REG_A5XX_HLSQ_FS_CONSTLEN, 2);
	OUT_RING(ring, s[FS].constlen);    /* HLSQ_FS_CONSTLEN */
	OUT_RING(ring, s[FS].instrlen);    /* HLSQ_FS_INSTRLEN */

	OUT_PKT4(ring, REG_A5XX_HLSQ_HS_CONSTLEN, 2);
	OUT_RING(ring, s[HS].constlen);    /* HLSQ_HS_CONSTLEN */
	OUT_RING(ring, s[HS].instrlen);    /* HLSQ_HS_INSTRLEN */

	OUT_PKT4(ring, REG_A5XX_HLSQ_DS_CONSTLEN, 2);
	OUT_RING(ring, s[DS].constlen);    /* HLSQ_DS_CONSTLEN */
	OUT_RING(ring, s[DS].instrlen);    /* HLSQ_DS_INSTRLEN */

	OUT_PKT4(ring, REG_A5XX_HLSQ_GS_CONSTLEN, 2);
	OUT_RING(ring, s[GS].constlen);    /* HLSQ_GS_CONSTLEN */
	OUT_RING(ring, s[GS].instrlen);    /* HLSQ_GS_INSTRLEN */

	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTEXT_SWITCH_CS_SW_3, 2);
	OUT_RING(ring, 0x00000000);   /* HLSQ_CONTEXT_SWITCH_CS_SW_3 */
	OUT_RING(ring, 0x00000000);   /* HLSQ_CONTEXT_SWITCH_CS_SW_4 */

	OUT_PKT4(ring, REG_A5XX_SP_VS_CTRL_REG0, 1);
	OUT_RING(ring, A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
			A5XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
			0x6 | /* XXX seems to be always set? */
			A5XX_SP_VS_CTRL_REG0_BRANCHSTACK(0x3) |  // XXX need to figure this out somehow..
			COND(s[VS].v->has_samp, A5XX_SP_VS_CTRL_REG0_PIXLODENABLE));

	struct ir3_shader_linkage l = {0};
	ir3_link_shaders(&l, s[VS].v, s[FS].v);

	BITSET_DECLARE(varbs, 128) = {0};
	uint32_t *varmask = (uint32_t *)varbs;

	for (i = 0; i < l.cnt; i++)
		for (j = 0; j < util_last_bit(l.var[i].compmask); j++)
			BITSET_SET(varbs, l.var[i].loc + j);

	OUT_PKT4(ring, REG_A5XX_VPC_VAR_DISABLE(0), 4);
	OUT_RING(ring, ~varmask[0]);  /* VPC_VAR[0].DISABLE */
	OUT_RING(ring, ~varmask[1]);  /* VPC_VAR[1].DISABLE */
	OUT_RING(ring, ~varmask[2]);  /* VPC_VAR[2].DISABLE */
	OUT_RING(ring, ~varmask[3]);  /* VPC_VAR[3].DISABLE */

	/* a5xx appends pos/psize to end of the linkage map: */
	if (pos_regid != regid(63,0))
		ir3_link_add(&l, pos_regid, 0xf, l.max_loc);

	if (psize_regid != regid(63,0))
		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);

	for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT4(ring, REG_A5XX_SP_VS_OUT_REG(i), 1);

		reg |= A5XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
		reg |= A5XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
		j++;

		reg |= A5XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
		reg |= A5XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
		j++;

		OUT_RING(ring, reg);
	}

	for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT4(ring, REG_A5XX_SP_VS_VPC_DST_REG(i), 1);

		reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc);
		reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc);
		reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc);
		reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc);

		OUT_RING(ring, reg);
	}

	OUT_PKT4(ring, REG_A5XX_SP_VS_OBJ_START_LO, 2);
	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_LO/HI */

	if (s[VS].instrlen)
		emit_shader(ring, s[VS].v);

	// TODO depending on other bits in this reg (if any) set somewhere else?
	OUT_PKT4(ring, REG_A5XX_PC_PRIM_VTX_CNTL, 1);
	OUT_RING(ring, COND(s[VS].v->writes_psize, A5XX_PC_PRIM_VTX_CNTL_PSIZE));

	if (emit->key.binning_pass) {
		OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2);
		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_LO */
		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_HI */
	} else {
		uint32_t stride_in_vpc = align(s[FS].v->total_in, 4) + 4;

		if (s[VS].v->writes_psize)
			stride_in_vpc++;

		// TODO if some of these other bits depend on something other than
		// program state we should probably move these next three regs:

		OUT_PKT4(ring, REG_A5XX_SP_PRIMITIVE_CNTL, 1);
		OUT_RING(ring, A5XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt));

		OUT_PKT4(ring, REG_A5XX_VPC_CNTL_0, 1);
		OUT_RING(ring, A5XX_VPC_CNTL_0_STRIDE_IN_VPC(stride_in_vpc) |
				COND(s[FS].v->total_in > 0, A5XX_VPC_CNTL_0_VARYING) |
				0x10000);    // XXX

		OUT_PKT4(ring, REG_A5XX_PC_PRIMITIVE_CNTL, 1);
		OUT_RING(ring, A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC(stride_in_vpc) |
				0x400);      // XXX

		OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2);
		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_LO/HI */
	}

	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5);
	OUT_RING(ring, 0x00000881);        /* XXX HLSQ_CONTROL_0 */
	OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63));
	OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
			0xfcfcfc00);               /* XXX */
	OUT_RING(ring, A5XX_HLSQ_CONTROL_3_REG_FRAGCOORDXYREGID(vcoord_regid) |
			0xfcfcfc00);               /* XXX */
	OUT_RING(ring, A5XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
			A5XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
			0x0000fcfc);               /* XXX */

	OUT_PKT4(ring, REG_A5XX_SP_FS_CTRL_REG0, 1);
	OUT_RING(ring, COND(s[FS].v->total_in > 0, A5XX_SP_FS_CTRL_REG0_VARYING) |
			0x4000e | /* XXX set pretty much everywhere */
			A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
			A5XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
			A5XX_SP_FS_CTRL_REG0_BRANCHSTACK(0x3) |  // XXX need to figure this out somehow..
			COND(s[FS].v->has_samp, A5XX_SP_FS_CTRL_REG0_PIXLODENABLE));

	OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1);
	OUT_RING(ring, 0x020fffff);        /* XXX */

	OUT_PKT4(ring, REG_A5XX_VPC_GS_SIV_CNTL, 1);
	OUT_RING(ring, 0x0000ffff);        /* XXX */

	OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1);
	OUT_RING(ring, 0x00000010);        /* XXX */

	OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1);
	OUT_RING(ring, COND(s[FS].v->total_in > 0, A5XX_GRAS_CNTL_VARYING) |
			COND(s[FS].v->frag_coord, A5XX_GRAS_CNTL_XCOORD |
					A5XX_GRAS_CNTL_YCOORD |
					A5XX_GRAS_CNTL_ZCOORD |
					A5XX_GRAS_CNTL_WCOORD |
					A5XX_GRAS_CNTL_UNK3) |
			COND(s[FS].v->frag_face, A5XX_GRAS_CNTL_UNK3));

	OUT_PKT4(ring, REG_A5XX_RB_RENDER_CONTROL0, 3);
	OUT_RING(ring,
			COND(s[FS].v->total_in > 0, A5XX_RB_RENDER_CONTROL0_VARYING) |
			COND(s[FS].v->frag_coord, A5XX_RB_RENDER_CONTROL0_XCOORD |
					A5XX_RB_RENDER_CONTROL0_YCOORD |
					A5XX_RB_RENDER_CONTROL0_ZCOORD |
					A5XX_RB_RENDER_CONTROL0_WCOORD |
					A5XX_RB_RENDER_CONTROL0_UNK3) |
			COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_UNK3));

	OUT_RING(ring,
			COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS));
	OUT_RING(ring, A5XX_RB_FS_OUTPUT_CNTL_MRT(nr) |
			COND(s[FS].v->writes_pos, A5XX_RB_FS_OUTPUT_CNTL_FRAG_WRITES_Z));

	OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_CNTL, 9);
	OUT_RING(ring, A5XX_SP_FS_OUTPUT_CNTL_MRT(nr) |
			A5XX_SP_FS_OUTPUT_CNTL_DEPTH_REGID(posz_regid) |
			A5XX_SP_FS_OUTPUT_CNTL_SAMPLEMASK_REGID(regid(63, 0)));
	for (i = 0; i < 8; i++) {
		OUT_RING(ring, A5XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) |
				COND(emit->key.half_precision,
					A5XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
	}

	if (emit->key.binning_pass) {
		OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
		OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(0));
	} else {
		uint32_t vinterp[8], vpsrepl[8];

		memset(vinterp, 0, sizeof(vinterp));
		memset(vpsrepl, 0, sizeof(vpsrepl));

		/* looks like we need to do int varyings in the frag
		 * shader on a5xx (no flatshad reg?  or a420.0 bug?):
		 *
		 *    (sy)(ss)nop
		 *    (sy)ldlv.u32 r0.x,l[r0.x], 1
		 *    ldlv.u32 r0.y,l[r0.x+1], 1
		 *    (ss)bary.f (ei)r63.x, 0, r0.x
		 *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
		 *    (rpt5)nop
		 *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
		 *
		 * Possibly on later a5xx variants we'll be able to use
		 * something like the code below instead of workaround
		 * in the shader:
		 */
		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
			/* NOTE: varyings are packed, so if compmask is 0xb
			 * then first, third, and fourth component occupy
			 * three consecutive varying slots:
			 */
			unsigned compmask = s[FS].v->inputs[j].compmask;

			uint32_t inloc = s[FS].v->inputs[j].inloc;

			if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) ||
					(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
				uint32_t loc = inloc;

				for (i = 0; i < 4; i++) {
					if (compmask & (1 << i)) {
						vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
						//flatshade[loc / 32] |= 1 << (loc % 32);
						loc++;
					}
				}
			}

			gl_varying_slot slot = s[FS].v->inputs[j].slot;

			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
			if (slot >= VARYING_SLOT_VAR0) {
				unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				/* Replace the .xy coordinates with S/T from the point sprite. Set
				 * interpolation bits for .zw such that they become .01
				 */
				if (emit->sprite_coord_enable & texmask) {
					/* mask is two 2-bit fields, where:
					 *   '01' -> S
					 *   '10' -> T
					 *   '11' -> 1 - T  (flip mode)
					 */
					unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001;
					uint32_t loc = inloc;
					if (compmask & 0x1) {
						vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x2) {
						vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x4) {
						/* .z <- 0.0f */
						vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x8) {
						/* .w <- 1.0f */
						vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
						loc++;
					}
				}
Example #13
void
fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit)
{
	const struct ir3_shader_variant *vp, *fp;
	const struct ir3_info *vsi, *fsi;
	enum a3xx_instrbuffermode fpbuffer, vpbuffer;
	uint32_t fpbuffersz, vpbuffersz, fsoff;
	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
	int constmode;
	int i, j, k;

	vp = fd3_emit_get_vp(emit);

	if (emit->key.binning_pass) {
		/* use dummy stateobj to simplify binning vs non-binning: */
		static const struct ir3_shader_variant binning_fp = {};
		fp = &binning_fp;
	} else {
		fp = fd3_emit_get_fp(emit);
	}

	vsi = &vp->info;
	fsi = &fp->info;

	fpbuffer = BUFFER;
	vpbuffer = BUFFER;
	fpbuffersz = fp->instrlen;
	vpbuffersz = vp->instrlen;

	/*
	 * Decide whether to use BUFFER or CACHE mode for VS and FS.  It
	 * appears like 256 is the hard limit, but when the combined size
	 * exceeds 128 the blob will try to keep FS in BUFFER mode and
	 * switch to CACHE for VS until VS is too large.  The blob seems
	 * to switch FS out of BUFFER mode at slightly under 128, but the
	 * decision tree is a bit fuzzy, so use slightly conservative
	 * limits.
	 *
	 * TODO check if these thresholds for BUFFER vs CACHE mode are the
	 *      same for all a3xx or whether we need to consider the gpuid
	 */

	if ((fpbuffersz + vpbuffersz) > 128) {
		if (fpbuffersz < 112) {
			/* FP:BUFFER   VP:CACHE  */
			vpbuffer = CACHE;
			vpbuffersz = 256 - fpbuffersz;
		} else if (vpbuffersz < 112) {
			/* FP:CACHE    VP:BUFFER */
			fpbuffer = CACHE;
			fpbuffersz = 256 - vpbuffersz;
		} else {
			/* FP:CACHE    VP:CACHE  */
			vpbuffer = fpbuffer = CACHE;
			vpbuffersz = fpbuffersz = 192;
		}
	}
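
	/* e.g. fpbuffersz 100 + vpbuffersz 60 exceeds 128: FP stays in
	 * BUFFER mode, VP switches to CACHE with 256 - 100 = 156 slots */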

	if (fpbuffer == BUFFER) {
		fsoff = 128 - fpbuffersz;
	} else {
		fsoff = 256 - fpbuffersz;
	}

	/* it seems that if vs->constlen + fs->constlen > 256, then CONSTMODE=1 */
	constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0;

	pos_regid = find_output_regid(vp,
		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
	posz_regid = find_output_regid(fp,
		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
	psize_regid = find_output_regid(vp,
		ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
	color_regid = find_output_regid(fp,
		ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
			A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
			 * flush some caches? I think we only need to set those
			 * bits if we have updated const or shader..
			 */
			A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
			A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
			A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
			COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD));
	OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
	OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
			A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz));
	OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) |
			A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz));

	OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) |
			COND(emit->key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) |
			A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
			A3XX_SP_SP_CTRL_REG_L0MODE(0));

	OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
	OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen));

	OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
	OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) |
			COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
			COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
			A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
	OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
			A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4));

	for (i = 0, j = -1; (i < 8) && (j < (int)fp->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1);

		j = next_varying(fp, j);
		if (j < fp->inputs_count) {
			k = find_output(vp, fp->inputs[j].semantic);
			reg |= A3XX_SP_VS_OUT_REG_A_REGID(vp->outputs[k].regid);
			reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp->inputs[j].compmask);
		}

		j = next_varying(fp, j);
		if (j < fp->inputs_count) {
			k = find_output(vp, fp->inputs[j].semantic);
			reg |= A3XX_SP_VS_OUT_REG_B_REGID(vp->outputs[k].regid);
			reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp->inputs[j].compmask);
		}

		OUT_RING(ring, reg);
	}

	for (i = 0, j = -1; (i < 4) && (j < (int)fp->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1);

		j = next_varying(fp, j);
		if (j < fp->inputs_count)
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp->inputs[j].inloc);
		j = next_varying(fp, j);
		if (j < fp->inputs_count)
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp->inputs[j].inloc);
		j = next_varying(fp, j);
		if (j < fp->inputs_count)
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp->inputs[j].inloc);
		j = next_varying(fp, j);
		if (j < fp->inputs_count)
			reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp->inputs[j].inloc);

		OUT_RING(ring, reg);
	}

	OUT_PKT0(ring, REG_A3XX_SP_VS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) |
			A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
	OUT_RELOC(ring, vp->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

	if (emit->key.binning_pass) {
		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
		OUT_RING(ring, 0x00000000);

		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
		OUT_RING(ring, 0x00000000);

		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1);
		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
	} else {
		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
		OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));

		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) |
				COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
				A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
				A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
				A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
				A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
				COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
				A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz));
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
				A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
				A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) |
				A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));

		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(
					MAX2(128, vp->constlen)) |
				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff));
		OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
	}

	OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1);
	if (fp->writes_pos) {
		OUT_RING(ring, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE |
				A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
	} else {
		OUT_RING(ring, 0x00000000);
	}

	OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4);
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(color_regid) |
			COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION));
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));

	if (emit->key.binning_pass) {
		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
		OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
				A3XX_VPC_ATTR_LMSIZE(1) |
				COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, 0x00000000);
	} else {
		uint32_t vinterp[4] = {0}, flatshade[2] = {0};

		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
		for (j = -1; (j = next_varying(fp, j)) < (int)fp->inputs_count; ) {
			uint32_t interp = fp->inputs[j].interpolate;
			if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
					((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
				/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
				 * instead.. rather than -8 everywhere else..
				 */
				uint32_t loc = fp->inputs[j].inloc - 8;

				/* currently assuming varyings aligned to 4 (not
				 * packed):
				 */
				debug_assert((loc % 4) == 0);

				for (i = 0; i < 4; i++, loc++) {
					vinterp[loc / 16] |= FLAT << ((loc % 16) * 2);
					flatshade[loc / 32] |= 1 << (loc % 32);
				}
			}
		}

		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
		OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
				A3XX_VPC_ATTR_THRDASSIGN(1) |
				A3XX_VPC_ATTR_LMSIZE(1) |
				COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
				A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));

		OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
		OUT_RING(ring, vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
		OUT_RING(ring, vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
		OUT_RING(ring, vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
		OUT_RING(ring, vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */

		OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
		OUT_RING(ring, fp->shader->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
		OUT_RING(ring, fp->shader->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
		OUT_RING(ring, fp->shader->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
		OUT_RING(ring, fp->shader->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */

		OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
		OUT_RING(ring, flatshade[0]);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
	}

	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));

	if (vpbuffer == BUFFER)
		emit_shader(ring, vp);

	OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
	OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */

	if (!emit->key.binning_pass) {
		if (fpbuffer == BUFFER)
			emit_shader(ring, fp);

		OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
		OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
	}
}
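
A standalone sketch (illustrative only, not part of the driver) of where the vinterp/flatshade packing above lands for a single flat-shaded varying; the FLAT value is assumed to be 1 here, while the real code takes it from the a3xx headers:

#include <stdio.h>
#include <stdint.h>

#define FLAT 1	/* assumed 2-bit interp code; real value comes from the a3xx headers */

int main(void)
{
	uint32_t vinterp[4] = {0}, flatshade[2] = {0};
	uint32_t loc = 12 - 8;	/* inloc 12, after the -8 bias used above */
	int i;

	for (i = 0; i < 4; i++, loc++) {
		vinterp[loc / 16] |= FLAT << ((loc % 16) * 2);
		flatshade[loc / 32] |= 1 << (loc % 32);
	}

	/* prints vinterp[0]=0x00005500 flatshade[0]=0x000000f0 */
	printf("vinterp[0]=0x%08x flatshade[0]=0x%08x\n",
			vinterp[0], flatshade[0]);
	return 0;
}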
Example #14
static void
nouveau_accel_init(struct nouveau_drm *drm)
{
	struct nvif_device *device = &drm->device;
	u32 arg0, arg1;
	u32 sclass[16];
	int ret, i;

	if (nouveau_noaccel)
		return;

	/* initialise synchronisation routines */
	/*XXX: this is crap, but the fence/channel stuff is a little
	 *     backwards in some places.  this will be fixed.
	 */
	ret = nvif_object_sclass(&device->base, sclass, ARRAY_SIZE(sclass));
	if (ret < 0)
		return;

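	/* try each class the device advertises until one of the per-family
	 * fence constructors succeeds (ret starts at -ENOSYS and the loop
	 * stops as soon as it reaches zero):
	 */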
	for (ret = -ENOSYS, i = 0; ret && i < ARRAY_SIZE(sclass); i++) {
		switch (sclass[i]) {
		case NV03_CHANNEL_DMA:
			ret = nv04_fence_create(drm);
			break;
		case NV10_CHANNEL_DMA:
			ret = nv10_fence_create(drm);
			break;
		case NV17_CHANNEL_DMA:
		case NV40_CHANNEL_DMA:
			ret = nv17_fence_create(drm);
			break;
		case NV50_CHANNEL_GPFIFO:
			ret = nv50_fence_create(drm);
			break;
		case G82_CHANNEL_GPFIFO:
			ret = nv84_fence_create(drm);
			break;
		case FERMI_CHANNEL_GPFIFO:
		case KEPLER_CHANNEL_GPFIFO_A:
			ret = nvc0_fence_create(drm);
			break;
		default:
			break;
		}
	}

	if (ret) {
		NV_ERROR(drm, "failed to initialise sync subsystem, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	if (device->info.family >= NV_DEVICE_INFO_V0_KEPLER) {
		ret = nouveau_channel_new(drm, &drm->device, NVDRM_CHAN + 1,
					  KEPLER_CHANNEL_GPFIFO_A_V0_ENGINE_CE0|
					  KEPLER_CHANNEL_GPFIFO_A_V0_ENGINE_CE1,
					  0, &drm->cechan);
		if (ret)
			NV_ERROR(drm, "failed to create ce channel, %d\n", ret);

		arg0 = KEPLER_CHANNEL_GPFIFO_A_V0_ENGINE_GR;
		arg1 = 1;
	} else
	if (device->info.chipset >= 0xa3 &&
	    device->info.chipset != 0xaa &&
	    device->info.chipset != 0xac) {
		ret = nouveau_channel_new(drm, &drm->device, NVDRM_CHAN + 1,
					  NvDmaFB, NvDmaTT, &drm->cechan);
		if (ret)
			NV_ERROR(drm, "failed to create ce channel, %d\n", ret);

		arg0 = NvDmaFB;
		arg1 = NvDmaTT;
	} else {
		arg0 = NvDmaFB;
		arg1 = NvDmaTT;
	}

	ret = nouveau_channel_new(drm, &drm->device, NVDRM_CHAN, arg0, arg1,
				 &drm->channel);
	if (ret) {
		NV_ERROR(drm, "failed to create kernel channel, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	ret = nvif_object_init(drm->channel->object, NULL, NVDRM_NVSW,
			       nouveau_abi16_swclass(drm), NULL, 0, &drm->nvsw);
	if (ret == 0) {
		struct nouveau_software_chan *swch;
		ret = RING_SPACE(drm->channel, 2);
		if (ret == 0) {
			if (device->info.family < NV_DEVICE_INFO_V0_FERMI) {
				BEGIN_NV04(drm->channel, NvSubSw, 0, 1);
				OUT_RING  (drm->channel, NVDRM_NVSW);
			} else
			if (device->info.family < NV_DEVICE_INFO_V0_KEPLER) {
				BEGIN_NVC0(drm->channel, FermiSw, 0, 1);
				OUT_RING  (drm->channel, 0x001f0000);
			}
		}
		swch = (void *)nvkm_object(&drm->nvsw)->parent;
		swch->flip = nouveau_flip_complete;
		swch->flip_data = drm->channel;
	}

	if (ret) {
		NV_ERROR(drm, "failed to allocate software object, %d\n", ret);
		nouveau_accel_fini(drm);
		return;
	}

	if (device->info.family < NV_DEVICE_INFO_V0_FERMI) {
		ret = nouveau_gpuobj_new(nvkm_object(&drm->device), NULL, 32,
					 0, 0, &drm->notify);
		if (ret) {
			NV_ERROR(drm, "failed to allocate notifier, %d\n", ret);
			nouveau_accel_fini(drm);
			return;
		}

		ret = nvif_object_init(drm->channel->object, NULL, NvNotify0,
				       NV_DMA_IN_MEMORY,
				       &(struct nv_dma_v0) {
						.target = NV_DMA_V0_TARGET_VRAM,
						.access = NV_DMA_V0_ACCESS_RDWR,
						.start = drm->notify->addr,
						.limit = drm->notify->addr + 31
				       }, sizeof(struct nv_dma_v0),
				       &drm->ntfy);
		if (ret) {
			nouveau_accel_fini(drm);
			return;
		}
	}

	nouveau_bo_move_init(drm);
}
Example #15
static void
fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
{
	struct fd4_context *fd4_ctx = fd4_context(ctx);
	struct fd_ringbuffer *ring = ctx->ring;
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	struct fd4_emit emit = {
			.vtx = &fd4_ctx->solid_vbuf_state,
			.prog = &ctx->solid_prog,
			.key = key,
			.format = fd4_emit_format(pfb->cbufs[0]),
	};

	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER));

	OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2);
	OUT_RING(ring, A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) |
			A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) |
			A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
	OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */

	OUT_PKT0(ring, REG_A4XX_RB_STENCILREFMASK, 2);
	OUT_RING(ring, 0xff000000 |
			A4XX_RB_STENCILREFMASK_STENCILREF(0) |
			A4XX_RB_STENCILREFMASK_STENCILMASK(0) |
			A4XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
	OUT_RING(ring, 0xff000000 |
			A4XX_RB_STENCILREFMASK_BF_STENCILREF(0) |
			A4XX_RB_STENCILREFMASK_BF_STENCILMASK(0) |
			A4XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));

	OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0));

	fd_wfi(ctx, ring);

	OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1);
	OUT_RING(ring, 0x80000);      /* GRAS_CL_CLIP_CNTL */

	OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6);
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)pfb->width/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)pfb->width/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)pfb->height/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)pfb->height/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0));

	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE |
			0xa);       /* XXX */

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
			A4XX_GRAS_SC_CONTROL_MSAA_DISABLE |
			A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
			A4XX_GRAS_SC_CONTROL_RASTER_MODE(1));

	OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1);
	OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST);

	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
	OUT_RING(ring, 0x00000002);

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2);
	OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) |
			A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1));
	OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) |
			A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0));

	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */

	fd4_program_emit(ring, &emit);
	fd4_emit_vertex_bufs(ring, &emit);

	if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
		uint32_t base = depth_base(ctx);
		emit_gmem2mem_surf(ctx, base, pfb->zsbuf);
	}

	if (ctx->resolve & FD_BUFFER_COLOR) {
		emit_gmem2mem_surf(ctx, 0, pfb->cbufs[0]);
	}

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
			A4XX_GRAS_SC_CONTROL_MSAA_DISABLE |
			A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
			A4XX_GRAS_SC_CONTROL_RASTER_MODE(0));
}

/* transfer from system memory to gmem */

static void
emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base,
		struct pipe_surface *psurf, uint32_t bin_w)
{
	struct fd_ringbuffer *ring = ctx->ring;

	emit_mrt(ring, 1, &psurf, &base, bin_w);

	fd4_emit_gmem_restore_tex(ring, psurf);

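	/* the restore itself is a two-vertex RECTLIST with auto-generated
	 * indices, drawn with the blit program set up by the caller:
	 */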
	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
			DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL);
}

static void
fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
{
	struct fd4_context *fd4_ctx = fd4_context(ctx);
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	struct fd_ringbuffer *ring = ctx->ring;
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	struct fd4_emit emit = {
			.vtx = &fd4_ctx->blit_vbuf_state,
			.prog = &ctx->blit_prog[0],
			.key = key,
			.format = fd4_emit_format(pfb->cbufs[0]),
	};
	float x0, y0, x1, y1;
	unsigned bin_w = tile->bin_w;
	unsigned bin_h = tile->bin_h;
	unsigned i;

	/* write texture coordinates to vertexbuf: */
	x0 = ((float)tile->xoff) / ((float)pfb->width);
	x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width);
	y0 = ((float)tile->yoff) / ((float)pfb->height);
	y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height);

	OUT_PKT3(ring, CP_MEM_WRITE, 5);
	OUT_RELOCW(ring, fd_resource(fd4_ctx->blit_texcoord_vbuf)->bo, 0, 0, 0);
	OUT_RING(ring, fui(x0));
	OUT_RING(ring, fui(y0));
	OUT_RING(ring, fui(x1));
	OUT_RING(ring, fui(y1));

	for (i = 0; i < 8; i++) {
		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
				A4XX_RB_MRT_CONTROL_B11 |
				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));

		OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
		OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) |
				A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(BLEND_DST_PLUS_SRC) |
				A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(FACTOR_ZERO) |
				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(FACTOR_ONE) |
				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(BLEND_DST_PLUS_SRC) |
				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
	}

	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1);
	OUT_RING(ring, 0x8);          /* XXX RB_RENDER_CONTROL */

	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_LESS));

	OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1);
	OUT_RING(ring, 0x280000);     /* XXX GRAS_CL_CLIP_CNTL */

	OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0) |
			A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS);

	OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6);
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)bin_w/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)bin_w/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)bin_h/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)bin_h/2.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0));
	OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0));

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2);
	OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(bin_w - 1) |
			A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(bin_h - 1));
	OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) |
			A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0));

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(bin_w - 1) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(bin_h - 1));

	OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) |
			A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h));

	OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2);
	OUT_RING(ring, A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_ALWAYS) |
			A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_ALWAYS) |
			A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) |
			A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
	OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
			A4XX_GRAS_SC_CONTROL_MSAA_DISABLE |
			A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
			A4XX_GRAS_SC_CONTROL_RASTER_MODE(1));

	OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1);
	OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST |
			A4XX_PC_PRIM_VTX_CNTL_VAROUT(1));

	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */

	fd4_program_emit(ring, &emit);
	fd4_emit_vertex_bufs(ring, &emit);

	/* for gmem pitch/base calculations, we need to use the non-
	 * truncated tile sizes:
	 */
	bin_w = gmem->bin_w;
	bin_h = gmem->bin_h;

	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
		emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w);

	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR))
		emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w);

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
			A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
			A4XX_GRAS_SC_CONTROL_RASTER_MODE(0));

	OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) |
			A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) |
			0x00010000);  /* XXX */
}

static void
patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode)
{
	unsigned i;
	for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) {
		struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i);
		*patch->cs = patch->val | DRAW4(0, 0, 0, vismode);
	}
	util_dynarray_resize(&ctx->draw_patches, 0);
}

static void
patch_rbrc(struct fd_context *ctx, uint32_t val)
{
	struct fd4_context *fd4_ctx = fd4_context(ctx);
	unsigned i;
	for (i = 0; i < fd_patch_num_elements(&fd4_ctx->rbrc_patches); i++) {
		struct fd_cs_patch *patch = fd_patch_element(&fd4_ctx->rbrc_patches, i);
		*patch->cs = patch->val | val;
	}
	util_dynarray_resize(&fd4_ctx->rbrc_patches, 0);
}
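
For context, a hedged sketch (an assumed helper, not from the source) of how a patch point would be recorded at emit time, so that patch_draws()/patch_rbrc() can later OR in the bits that are only known once the render mode is decided:

/* struct fd_cs_patch { uint32_t *cs; uint32_t val; } -- as used above */
static inline void
record_patch(struct util_dynarray *patches, uint32_t *cs, uint32_t val)
{
	struct fd_cs_patch patch = {
		.cs  = cs,	/* location of the dword within the ring */
		.val = val,	/* the part of the value that is known now */
	};
	util_dynarray_append(patches, struct fd_cs_patch, patch);
}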

/* for rendering directly to system memory: */
static void
fd4_emit_sysmem_prep(struct fd_context *ctx)
{
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	struct fd_ringbuffer *ring = ctx->ring;

	fd4_emit_restore(ctx);

	OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1);
	OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
			A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));

	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);

	/* setup scissor/offset for current tile: */
	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
	OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(0) |
			A4XX_RB_BIN_OFFSET_Y(0));

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));

	OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(0) |
			A4XX_RB_MODE_CONTROL_HEIGHT(0) |
			0x00c00000);  /* XXX */

	patch_draws(ctx, IGNORE_VISIBILITY);
	patch_rbrc(ctx, 0);  // XXX
}

static void
update_vsc_pipe(struct fd_context *ctx)
{
	struct fd4_context *fd4_ctx = fd4_context(ctx);
	struct fd_ringbuffer *ring = ctx->ring;
	int i;

	OUT_PKT0(ring, REG_A4XX_VSC_SIZE_ADDRESS, 1);
	OUT_RELOCW(ring, fd4_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */

	OUT_PKT0(ring, REG_A4XX_VSC_PIPE_CONFIG_REG(0), 8);
	for (i = 0; i < 8; i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
		OUT_RING(ring, A4XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
				A4XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
				A4XX_VSC_PIPE_CONFIG_REG_W(pipe->w) |
				A4XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
	}

	OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_ADDRESS_REG(0), 8);
	for (i = 0; i < 8; i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
		if (!pipe->bo) {
			pipe->bo = fd_bo_new(ctx->dev, 0x40000,
					DRM_FREEDRENO_GEM_TYPE_KMEM);
		}
		OUT_RELOCW(ring, pipe->bo, 0, 0, 0);       /* VSC_PIPE_DATA_ADDRESS[i] */
	}

	OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_LENGTH_REG(0), 8);
	for (i = 0; i < 8; i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
		OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */
	}
}

/* before first tile */
static void
fd4_emit_tile_init(struct fd_context *ctx)
{
	struct fd_ringbuffer *ring = ctx->ring;
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	uint32_t rb_render_control;

	fd4_emit_restore(ctx);

	OUT_PKT0(ring, REG_A4XX_VSC_BIN_SIZE, 1);
	OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
			A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));

	OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1);
	OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) |
			A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) |
			0x00010000);  /* XXX */

	update_vsc_pipe(ctx);
	patch_draws(ctx, IGNORE_VISIBILITY);

	rb_render_control = 0; // XXX or BINNING_PASS.. but maybe we can emit only from gmem
	patch_rbrc(ctx, rb_render_control);
}

/* before mem2gmem */
static void
fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
{
	struct fd_ringbuffer *ring = ctx->ring;
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	uint32_t reg;

	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
	reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(ctx));
	if (pfb->zsbuf) {
		reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format));
	}
	OUT_RING(ring, reg);
	if (pfb->zsbuf) {
		uint32_t cpp = util_format_get_blocksize(pfb->zsbuf->format);
		OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w));
		OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w));
	} else {
		OUT_RING(ring, 0x00000000);
		OUT_RING(ring, 0x00000000);
	}

	OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1);
	if (pfb->zsbuf) {
		OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT(
				fd4_pipe2depth(pfb->zsbuf->format)));
	} else {
		OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT(DEPTH4_NONE));
	}

	if (ctx->needs_rb_fbd) {
		fd_wfi(ctx, ring);
		OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1);
		OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
				A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
		ctx->needs_rb_fbd = false;
	}
}

/* before IB to rendering cmds: */
static void
fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
{
	struct fd_ringbuffer *ring = ctx->ring;
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;

	uint32_t x1 = tile->xoff;
	uint32_t y1 = tile->yoff;
	uint32_t x2 = tile->xoff + tile->bin_w - 1;
	uint32_t y2 = tile->yoff + tile->bin_h - 1;

	OUT_PKT3(ring, CP_SET_BIN, 3);
	OUT_RING(ring, 0x00000000);
	OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
	OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));

	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, gmem->bin_w);

	/* setup scissor/offset for current tile: */
	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
	OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(tile->xoff) |
			A4XX_RB_BIN_OFFSET_Y(tile->yoff));

	OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1));
	OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) |
			A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2));
}

void
fd4_gmem_init(struct pipe_context *pctx)
{
	struct fd_context *ctx = fd_context(pctx);

	ctx->emit_sysmem_prep = fd4_emit_sysmem_prep;
	ctx->emit_tile_init = fd4_emit_tile_init;
	ctx->emit_tile_prep = fd4_emit_tile_prep;
	ctx->emit_tile_mem2gmem = fd4_emit_tile_mem2gmem;
	ctx->emit_tile_renderprep = fd4_emit_tile_renderprep;
	ctx->emit_tile_gmem2mem = fd4_emit_tile_gmem2mem;
}
Example #16
int
nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
		     int w, int h)
{
	struct nouveau_channel *chan = screen->eng2d->channel;
	struct nouveau_grobj *eng2d = screen->eng2d;
	int ret;

	WAIT_RING (chan, 32);

	ret = nv50_surface_set(screen, dst, 1);
	if (ret)
		return ret;

	ret = nv50_surface_set(screen, src, 0);
	if (ret)
		return ret;

	BEGIN_RING(chan, eng2d, 0x088c, 1);
	OUT_RING  (chan, 0);
	BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4);
	OUT_RING  (chan, dx);
	OUT_RING  (chan, dy);
	OUT_RING  (chan, w);
	OUT_RING  (chan, h);
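	/* 0x08c0.. appear to be the du/dx and dv/dy blit ratios as
	 * fraction/integer pairs (1:1 here), and 0x08d0.. the fractional
	 * and integer parts of the source origin:
	 */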
	BEGIN_RING(chan, eng2d, 0x08c0, 4);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, 1);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, 1);
	BEGIN_RING(chan, eng2d, 0x08d0, 4);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, sx);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, sy);

	return 0;
}
Example #17
static void
emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
		struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
{
	enum a4xx_tile_mode tile_mode;
	unsigned i;

	if (bin_w) {
		tile_mode = 2;
	} else {
		tile_mode = TILE4_LINEAR;
	}

	for (i = 0; i < 8; i++) {
		enum a4xx_color_fmt format = 0;
		enum a3xx_color_swap swap = WZYX;
		struct fd_resource *rsc = NULL;
		struct fd_resource_slice *slice = NULL;
		uint32_t stride = 0;
		uint32_t base = 0;
		uint32_t offset = 0;

		if ((i < nr_bufs) && bufs[i]) {
			struct pipe_surface *psurf = bufs[i];

			rsc = fd_resource(psurf->texture);
			slice = fd_resource_slice(rsc, psurf->u.tex.level);
			format = fd4_pipe2color(psurf->format);
			swap = fd4_pipe2swap(psurf->format);

			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);

			offset = fd_resource_offset(rsc, psurf->u.tex.level,
					psurf->u.tex.first_layer);

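			/* in gmem (binning) mode the stride is the bin width,
			 * since the tile lives in gmem; in bypass mode use the
			 * resource's own slice pitch:
			 */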
			if (bin_w) {
				stride = bin_w * rsc->cpp;

				if (bases) {
					base = bases[i];
				}
			} else {
				stride = slice->pitch * rsc->cpp;
			}
		}

		OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3);
		OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
				A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
				A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
		if (bin_w || (i >= nr_bufs)) {
			OUT_RING(ring, base);
			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
		} else {
			OUT_RELOCW(ring, rsc->bo, offset, 0, 0);
			/* RB_MRT[i].CONTROL3.STRIDE not emitted by c2d..
			 * not sure if we need to skip it for bypass or
			 * not.
			 */
			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(0));
		}
	}
}
Example #18
static int
nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
{
	struct nv50_miptree *mt = nv50_miptree(ps->texture);
	struct nouveau_channel *chan = screen->eng2d->channel;
	struct nouveau_grobj *eng2d = screen->eng2d;
	struct nouveau_bo *bo = nv50_miptree(ps->texture)->base.bo;
	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);

	format = nv50_format(ps->format);
	if (format < 0)
		return 1;

	if (!bo->tile_flags) {
		BEGIN_RING(chan, eng2d, mthd, 2);
		OUT_RING  (chan, format);
		OUT_RING  (chan, 1);
		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
		OUT_RING  (chan, mt->level[ps->level].pitch);
		OUT_RING  (chan, ps->width);
		OUT_RING  (chan, ps->height);
		OUT_RELOCh(chan, bo, ps->offset, flags);
		OUT_RELOCl(chan, bo, ps->offset, flags);
	} else {
		BEGIN_RING(chan, eng2d, mthd, 5);
		OUT_RING  (chan, format);
		OUT_RING  (chan, 0);
		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
		OUT_RING  (chan, 1);
		OUT_RING  (chan, 0);
		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
		OUT_RING  (chan, ps->width);
		OUT_RING  (chan, ps->height);
		OUT_RELOCh(chan, bo, ps->offset, flags);
		OUT_RELOCl(chan, bo, ps->offset, flags);
	}

#if 0
	if (dst) {
		BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4);
		OUT_RING  (chan, 0);
		OUT_RING  (chan, 0);
		OUT_RING  (chan, ps->width);
		OUT_RING  (chan, ps->height);
	}
#endif

	return 0;
}
Example #19
void
fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
		int nr, struct pipe_surface **bufs)
{
	struct stage s[MAX_STAGES];
	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
	uint32_t face_regid, coord_regid, zwcoord_regid;
	int constmode;
	int i, j, k;

	debug_assert(nr <= ARRAY_SIZE(color_regid));

	setup_stages(emit, s);

	/* blob seems to always use constmode currently: */
	constmode = 1;

	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
	posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
	if (s[FS].v->color0_mrt) {
		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
			ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
	} else {
		color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
		color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
		color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
		color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
		color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
		color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
		color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
		color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
	}

	/* TODO get these dynamically: */
	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1);
	OUT_RING(ring, 0x00000003);

	OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
	OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
			A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
			A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
			 * flush some caches? I think we only need to set those
			 * bits if we have updated const or shader..
			 */
			A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
			A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
	OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
			A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
			A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) |
			A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid));
	OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) |
			0x3f3f000 |           /* XXX */
			A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid));
	OUT_RING(ring, A4XX_HLSQ_CONTROL_3_REG_REGID(s[FS].v->pos_regid) |
			0xfcfcfc00);
	OUT_RING(ring, 0x00fcfcfc);   /* XXX HLSQ_CONTROL_4 */

	OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5);
	OUT_RING(ring, A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) |
			A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) |
			A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff));
	OUT_RING(ring, A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) |
			A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) |
			A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff));
	OUT_RING(ring, A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) |
			A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) |
			A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff));
	OUT_RING(ring, A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) |
			A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) |
			A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff));
	OUT_RING(ring, A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) |
			A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) |
			A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1);
	OUT_RING(ring, 0x140010 | /* XXX */
			COND(emit->key.binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS));

	OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1);
	OUT_RING(ring, 0x7f | /* XXX */
			COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) |
			COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) |
			COND(s[VS].instrlen && s[FS].instrlen,
					A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER));

	OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1);
	OUT_RING(ring, s[VS].v->instrlen);      /* SP_VS_LENGTH_REG */

	OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3);
	OUT_RING(ring, A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
			A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
			A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
			A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
			A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
			COND(s[VS].v->has_samp, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE));
	OUT_RING(ring, A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) |
			A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
	OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
			A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
			A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(s[FS].v->total_in, 4) / 4));

	for (i = 0, j = -1; (i < 16) && (j < (int)s[FS].v->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1);

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count) {
			k = ir3_find_output(s[VS].v, s[FS].v->inputs[j].slot);
			reg |= A4XX_SP_VS_OUT_REG_A_REGID(s[VS].v->outputs[k].regid);
			reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(s[FS].v->inputs[j].compmask);
		}

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count) {
			k = ir3_find_output(s[VS].v, s[FS].v->inputs[j].slot);
			reg |= A4XX_SP_VS_OUT_REG_B_REGID(s[VS].v->outputs[k].regid);
			reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(s[FS].v->inputs[j].compmask);
		}

		OUT_RING(ring, reg);
	}

	for (i = 0, j = -1; (i < 8) && (j < (int)s[FS].v->inputs_count); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);

		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(s[FS].v->inputs[j].inloc);
		j = ir3_next_varying(s[FS].v, j);
		if (j < s[FS].v->inputs_count)
			reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(s[FS].v->inputs[j].inloc);

		OUT_RING(ring, reg);
	}

	OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
			A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff));
	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

	OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
	OUT_RING(ring, s[FS].v->instrlen);  /* SP_FS_LENGTH_REG */

	OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
	OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
			COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
			A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
			A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
			A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
			A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
			A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
			COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
	OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
			0x80000000 |      /* XXX */
			COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) |
			COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) |
			COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD));

	OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
			A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
	if (emit->key.binning_pass)
		OUT_RING(ring, 0x00000000);
	else
		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */

	OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
			A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
			A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff));

	OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1);
	OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
			A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff));

	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1);
	OUT_RING(ring, A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) |
			COND(s[FS].v->total_in > 0, A4XX_RB_RENDER_CONTROL2_VARYING) |
			COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
			COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD |
					A4XX_RB_RENDER_CONTROL2_YCOORD |
// TODO enabling gl_FragCoord.z is causing lockups on 0ad (but seems
// to work everywhere else).
//					A4XX_RB_RENDER_CONTROL2_ZCOORD |
					A4XX_RB_RENDER_CONTROL2_WCOORD));

	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
			COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));

	OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
	OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
			COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
			A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));

	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
	for (i = 0; i < 8; i++) {
		enum a4xx_color_fmt format = 0;
		bool srgb = false;
		if (i < nr) {
			format = fd4_emit_format(bufs[i]);
			if (bufs[i] && !emit->no_decode_srgb)
				srgb = util_format_is_srgb(bufs[i]->format);
		}
		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
				COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
				COND(emit->key.half_precision,
					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
	}

	if (emit->key.binning_pass) {
		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
		OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) |
				0x40000000 |      /* XXX */
				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, 0x00000000);
	} else {
		uint32_t vinterp[8], vpsrepl[8];

		memset(vinterp, 0, sizeof(vinterp));
		memset(vpsrepl, 0, sizeof(vpsrepl));

		/* looks like we need to do int varyings in the frag
		 * shader on a4xx (no flatshad reg?  or a420.0 bug?):
		 *
		 *    (sy)(ss)nop
		 *    (sy)ldlv.u32 r0.x,l[r0.x], 1
		 *    ldlv.u32 r0.y,l[r0.x+1], 1
		 *    (ss)bary.f (ei)r63.x, 0, r0.x
		 *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
		 *    (rpt5)nop
		 *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
		 *
		 * Possibly on later a4xx variants we'll be able to use
		 * something like the code below instead of the workaround
		 * in the shader:
		 */
		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {

			/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
			 * instead.. rather than -8 everywhere else..
			 */
			uint32_t inloc = s[FS].v->inputs[j].inloc - 8;

			/* currently assuming varyings aligned to 4 (not
			 * packed):
			 */
			debug_assert((inloc % 4) == 0);

			if ((s[FS].v->inputs[j].interpolate == INTERP_QUALIFIER_FLAT) ||
					(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
				uint32_t loc = inloc;

				for (i = 0; i < 4; i++, loc++) {
					vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
					//flatshade[loc / 32] |= 1 << (loc % 32);
				}
			}

			gl_varying_slot slot = s[FS].v->inputs[j].slot;

			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
			if (slot >= VARYING_SLOT_VAR0) {
				unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				/* Replace the .xy coordinates with S/T from the point sprite. Set
				 * interpolation bits for .zw such that they become .01
				 */
				if (emit->sprite_coord_enable & texmask) {
					vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
						<< ((inloc % 16) * 2);
					vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
					vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
				}
			}
		}
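
		/* worked example: mask 0x09 = 0b1001 splits into two 2-bit
		 * fields, .x <- 01 (S) and .y <- 10 (T); mask 0x0d = 0b1101
		 * instead gives .y <- 11 (1 - T, flip mode).  The vinterp
		 * codes 2 and 3 written above then force .z <- 0.0 and
		 * .w <- 1.0.
		 */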

		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
		OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
				A4XX_VPC_ATTR_THRDASSIGN(1) |
				COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) |
				0x40000000 |      /* XXX */
				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) |
				A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in));

		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8);
		for (i = 0; i < 8; i++)
			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */

		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
		for (i = 0; i < 8; i++)
			OUT_RING(ring, vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
	}

	if (s[VS].instrlen)
		emit_shader(ring, s[VS].v);

	if (!emit->key.binning_pass)
		if (s[FS].instrlen)
			emit_shader(ring, s[FS].v);
}
Example #20
void
fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
				 int nr, struct pipe_surface **bufs)
{
	const struct ir3_shader_variant *vp, *fp;
	const struct ir3_info *vsi, *fsi;
	enum a3xx_instrbuffermode fpbuffer, vpbuffer;
	uint32_t fpbuffersz, vpbuffersz, fsoff;
	uint32_t pos_regid, posz_regid, psize_regid;
	uint32_t vcoord_regid, face_regid, coord_regid, zwcoord_regid;
	uint32_t color_regid[4] = {0};
	int constmode;
	int i, j;

	debug_assert(nr <= ARRAY_SIZE(color_regid));

	vp = fd3_emit_get_vp(emit);
	fp = fd3_emit_get_fp(emit);

	vsi = &vp->info;
	fsi = &fp->info;

	fpbuffer = BUFFER;
	vpbuffer = BUFFER;
	fpbuffersz = fp->instrlen;
	vpbuffersz = vp->instrlen;

	/*
	 * Decide whether to use BUFFER or CACHE mode for VS and FS.  It
	 * appears that 256 is the hard limit, but when the combined size
	 * exceeds 128 the blob will try to keep FS in BUFFER mode and
	 * switch VS to CACHE until VS is too large.  The blob seems to
	 * switch FS out of BUFFER mode at slightly under 128, but the
	 * exact decision tree is a bit fuzzy, so use slightly
	 * conservative limits.
	 *
	 * TODO check if these thresholds for BUFFER vs CACHE mode are the
	 *      same for all a3xx or whether we need to consider the gpuid
	 */

	if ((fpbuffersz + vpbuffersz) > 128) {
		if (fpbuffersz < 112) {
			/* FP:BUFFER   VP:CACHE  */
			vpbuffer = CACHE;
			vpbuffersz = 256 - fpbuffersz;
		} else if (vpbuffersz < 112) {
			/* FP:CACHE    VP:BUFFER */
			fpbuffer = CACHE;
			fpbuffersz = 256 - vpbuffersz;
		} else {
			/* FP:CACHE    VP:CACHE  */
			vpbuffer = fpbuffer = CACHE;
			vpbuffersz = fpbuffersz = 192;
		}
	}
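
	/* worked examples (illustrative): fs=100,vs=60 gives a combined
	 * size of 160 > 128 with fs < 112, so VS goes to CACHE and
	 * vpbuffersz = 256 - 100 = 156; fs=120,vs=120 leaves neither
	 * under 112, so both go to CACHE at 192.
	 */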

	if (fpbuffer == BUFFER) {
		fsoff = 128 - fpbuffersz;
	} else {
		fsoff = 256 - fpbuffersz;
	}

	/* it seems that when vp->constlen + fp->constlen > 256, CONSTMODE=1: */
	constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0;

	pos_regid = ir3_find_output_regid(vp, VARYING_SLOT_POS);
	posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH);
	psize_regid = ir3_find_output_regid(vp, VARYING_SLOT_PSIZ);
	if (fp->color0_mrt) {
		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
			ir3_find_output_regid(fp, FRAG_RESULT_COLOR);
	} else {
		color_regid[0] = ir3_find_output_regid(fp, FRAG_RESULT_DATA0);
		color_regid[1] = ir3_find_output_regid(fp, FRAG_RESULT_DATA1);
		color_regid[2] = ir3_find_output_regid(fp, FRAG_RESULT_DATA2);
		color_regid[3] = ir3_find_output_regid(fp, FRAG_RESULT_DATA3);
	}

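	/* regid(63,0) is the "unused" marker; the zw pair is only
	 * requested when the FS actually reads gl_FragCoord:
	 */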
	face_regid      = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRONT_FACE);
	coord_regid     = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD);
	zwcoord_regid   = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2);
	vcoord_regid    = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PIXEL);

	/* adjust regids for alpha output formats. there is no alpha render
	 * format, so it's just treated like red
	 */
	for (i = 0; i < nr; i++)
		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
			color_regid[i] += 3;

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
			A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
			A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
			 * flush some caches? I think we only need to set those
			 * bits if we have updated const or shader..
			 */
			A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
			A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
	OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
			A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
			A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(coord_regid) |
			A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(zwcoord_regid));
	OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31) |
			A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(face_regid));
	OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(vcoord_regid));
	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
			A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz));
	OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) |
			A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz));

	OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) |
			COND(emit->binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) |
			A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
			A3XX_SP_SP_CTRL_REG_L0MODE(0));

	OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
	OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen));

	OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
	OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) |
			COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
			A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
	OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
			A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in));

	struct ir3_shader_linkage l = {0};
	ir3_link_shaders(&l, vp, fp);

	for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1);

		reg |= A3XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
		reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
		j++;

		reg |= A3XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
		reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
		j++;

		OUT_RING(ring, reg);
	}

	for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1);

		reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
		reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
		reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
		reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);

		OUT_RING(ring, reg);
	}

	OUT_PKT0(ring, REG_A3XX_SP_VS_OBJ_OFFSET_REG, 2);
	OUT_RING(ring, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) |
			A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
	OUT_RELOC(ring, vp->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

	if (emit->binning_pass) {
		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
		OUT_RING(ring, 0x00000000);

		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
		OUT_RING(ring, 0x00000000);

		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1);
		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
	} else {
		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
		OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));

		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) |
				COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
				A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
				A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP |
				A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
				A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
				COND(fp->num_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
				A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz));
		OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
				A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
				A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) |
				A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));

		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(
					MAX2(128, vp->constlen)) |
				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff));
		OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
	}

	OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1);
	OUT_RING(ring,
			 COND(fp->writes_pos, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
			 A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid) |
			 A3XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr) - 1));

	OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4);
	for (i = 0; i < 4; i++) {
		uint32_t mrt_reg = A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
			COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION);

		if (i < nr) {
			enum pipe_format fmt = pipe_surface_format(bufs[i]);
			mrt_reg |= COND(util_format_is_pure_uint(fmt), A3XX_SP_FS_MRT_REG_UINT) |
				COND(util_format_is_pure_sint(fmt), A3XX_SP_FS_MRT_REG_SINT);
		}
		OUT_RING(ring, mrt_reg);
	}

	if (emit->binning_pass) {
		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
		OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
				A3XX_VPC_ATTR_LMSIZE(1) |
				COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
		OUT_RING(ring, 0x00000000);
	} else {
		uint32_t vinterp[4], flatshade[2], vpsrepl[4];

		memset(vinterp, 0, sizeof(vinterp));
		memset(flatshade, 0, sizeof(flatshade));
		memset(vpsrepl, 0, sizeof(vpsrepl));

		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
		for (j = -1; (j = ir3_next_varying(fp, j)) < (int)fp->inputs_count; ) {
			/* NOTE: varyings are packed, so if compmask is 0xb
			 * then first, third, and fourth component occupy
			 * three consecutive varying slots:
			 */
			unsigned compmask = fp->inputs[j].compmask;

			uint32_t inloc = fp->inputs[j].inloc;

			if ((fp->inputs[j].interpolate == INTERP_MODE_FLAT) ||
					(fp->inputs[j].rasterflat && emit->rasterflat)) {
				uint32_t loc = inloc;

				for (i = 0; i < 4; i++) {
					if (compmask & (1 << i)) {
						vinterp[loc / 16] |= FLAT << ((loc % 16) * 2);
						flatshade[loc / 32] |= 1 << (loc % 32);
						loc++;
					}
				}
			}

			gl_varying_slot slot = fp->inputs[j].slot;

			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
			if (slot >= VARYING_SLOT_VAR0) {
				unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				/* Replace the .xy coordinates with S/T from the point sprite. Set
				 * interpolation bits for .zw such that they become .01
				 */
				if (emit->sprite_coord_enable & texmask) {
					/* mask is two 2-bit fields, where:
					 *   '01' -> S
					 *   '10' -> T
					 *   '11' -> 1 - T  (flip mode)
					 */
					unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001;
					uint32_t loc = inloc;
					if (compmask & 0x1) {
						vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x2) {
						vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x4) {
						/* .z <- 0.0f */
						vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x8) {
						/* .w <- 1.0f */
						vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
						loc++;
					}
				}