Example #1
static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
	}

	if (!desc->buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);
}
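
An aside on the pattern above: after a command stream is flushed, its buffer list starts out empty, so every still-bound resource has to be re-registered before it can be referenced again. Below is a minimal standalone sketch of that idea; the types and the add_to_list()/begin_new_cs() names are invented for illustration and are not the driver's API.

#include <stdio.h>

#define MAX_SLOTS 8

struct buffer { const char *name; };

struct cs {
	const struct buffer *list[MAX_SLOTS];
	int count;
};

/* Invented stand-in for radeon_add_to_buffer_list(): record the buffer in
 * the CS buffer list and return its index. */
static int add_to_list(struct cs *cs, const struct buffer *buf)
{
	cs->list[cs->count] = buf;
	return cs->count++;
}

/* After a flush the buffer list is empty, so every still-bound, non-NULL
 * buffer must be added again before packets may reference it. */
static void begin_new_cs(struct cs *cs, const struct buffer *bound[], int n)
{
	cs->count = 0;
	for (int i = 0; i < n; i++) {
		if (!bound[i])
			continue;	/* unbound slot, nothing to re-add */
		add_to_list(cs, bound[i]);
	}
}

int main(void)
{
	struct buffer vb0 = { "vb0" }, vb2 = { "vb2" };
	const struct buffer *bound[MAX_SLOTS] = { &vb0, NULL, &vb2 };
	struct cs cs;

	begin_new_cs(&cs, bound, MAX_SLOTS);
	printf("re-added %d buffers\n", cs.count);	/* 2 */
	return 0;
}
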
Example #2
static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				unsigned slot, struct pipe_sampler_view *view,
				unsigned *view_desc)
{
	struct si_sampler_views *views = &sctx->samplers[shader].views;

	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;

		if (rview->resource)
			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->resource, RADEON_USAGE_READ,
				r600_get_sampler_view_priority(rview->resource));

		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->dcc_buffer, RADEON_USAGE_READ,
				RADEON_PRIO_DCC);

		pipe_sampler_view_reference(&views->views[slot], view);
		memcpy(views->desc.list + slot*8, view_desc, 8*4);
		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.list_dirty = true;
}
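
An aside: the enabled_mask updates above record which descriptor slots currently hold a view, using a 64-bit mask so that slots past 31 still work (hence 1llu). A small standalone sketch of just that bookkeeping; bind_slot() and unbind_slot() are invented names.

#include <stdint.h>
#include <stdio.h>

static void bind_slot(uint64_t *slot_mask, unsigned slot)
{
	*slot_mask |= 1llu << slot;	/* 64-bit shift, works for slot >= 32 */
}

static void unbind_slot(uint64_t *slot_mask, unsigned slot)
{
	*slot_mask &= ~(1llu << slot);
}

int main(void)
{
	uint64_t mask = 0;

	bind_slot(&mask, 3);
	bind_slot(&mask, 40);
	unbind_slot(&mask, 3);
	printf("0x%016llx\n", (unsigned long long)mask);	/* bit 40 only */
	return 0;
}
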
Example #3
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
	uint64_t vram = 0, gtt = 0;

	if (dst) {
		if (dst->domains & RADEON_DOMAIN_VRAM)
			vram += dst->buf->size;
		else if (dst->domains & RADEON_DOMAIN_GTT)
			gtt += dst->buf->size;
	}
	if (src) {
		if (src->domains & RADEON_DOMAIN_VRAM)
			vram += src->buf->size;
		else if (src->domains & RADEON_DOMAIN_GTT)
			gtt += src->buf->size;
	}

	/* Flush the GFX IB if DMA depends on it. */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ((dst &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
					       RADEON_USAGE_READWRITE)) ||
	     (src &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
					       RADEON_USAGE_WRITE))))
		ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

	/* Flush if there's not enough space, or if the memory usage per IB
	 * is too large.
	 */
	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
	    !ctx->ws->cs_memory_below_limit(ctx->dma.cs, vram, gtt)) {
		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
	}

	/* If GPUVM is not supported, the CS checker needs 2 entries
	 * in the buffer list per packet, which has to be done manually.
	 */
	if (ctx->screen->info.has_virtual_memory) {
		if (dst)
			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
						  RADEON_USAGE_WRITE,
						  RADEON_PRIO_SDMA_BUFFER);
		if (src)
			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SDMA_BUFFER);
	}
}
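
An aside on the space check above: the DMA IB is flushed either when the new packet would not fit or when the memory referenced per IB grows too large. A rough standalone sketch of that decision follows; the limits and the need_flush() name are illustrative, not the driver's actual values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative limits, not the driver's. */
#define IB_MAX_DW		(16 * 1024)
#define IB_MEM_LIMIT_BYTES	(64ull * 1024 * 1024)

/* Flush when the packet would not fit, or when the memory referenced by
 * this IB (already used plus what the packet adds) grows too large. */
static bool need_flush(unsigned used_dw, unsigned num_dw,
		       uint64_t used_mem, uint64_t added_mem)
{
	if (used_dw + num_dw > IB_MAX_DW)
		return true;
	if (used_mem + added_mem > IB_MEM_LIMIT_BYTES)
		return true;
	return false;
}

int main(void)
{
	/* A 5-dword copy referencing two 40 MB buffers exceeds the memory
	 * limit, so the IB would be flushed first. */
	printf("%d\n", need_flush(100, 5, 40u << 20, 40u << 20));	/* 1 */
	return 0;
}
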
Example #4
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_USER_SHADER));
}
Example #5
static bool si_upload_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
	void *ptr;

	if (!desc->list_dirty)
		return true;

	u_upload_alloc(sctx->b.uploader, 0, list_size,
		       &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, &ptr);
	if (!desc->buffer)
		return false; /* skip the draw call */

	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
			      RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);

	desc->list_dirty = false;
	desc->pointer_dirty = true;
	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
	return true;
}
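
An aside: the list is written with util_memcpy_cpu_to_le32() because the GPU consumes the descriptor dwords as little-endian. Below is a standalone sketch of what such a copy has to do; the runtime endianness probe is for illustration only (Mesa resolves the byte order at build time).

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Copy 32-bit words so the destination is little-endian regardless of the
 * host byte order. */
static void memcpy_cpu_to_le32(void *dst, const void *src, size_t size_bytes)
{
	const union { uint32_t u; uint8_t b[4]; } probe = { 1 };

	if (probe.b[0] == 1) {
		/* Little-endian host: a plain copy already has the layout. */
		memcpy(dst, src, size_bytes);
		return;
	}

	/* Big-endian host: byteswap every dword. */
	const uint32_t *s = src;
	uint32_t *d = dst;
	for (size_t i = 0; i < size_bytes / 4; i++) {
		uint32_t v = s[i];
		d[i] = (v >> 24) | ((v >> 8) & 0x0000ff00u) |
		       ((v << 8) & 0x00ff0000u) | (v << 24);
	}
}

int main(void)
{
	uint32_t in[2] = { 0x11223344u, 0xdeadbeefu }, out[2];

	memcpy_cpu_to_le32(out, in, sizeof(in));
	printf("0x%08x 0x%08x\n", out[0], out[1]);
	return 0;
}
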
Example #6
static void
si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
{
	uint mask = images->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan(&mask);
		struct pipe_image_view *view = &images->views[i];

		assert(view->resource);

		si_sampler_view_add_buffer(sctx, view->resource,
					   RADEON_USAGE_READWRITE);
	}

	images->desc.ce_ram_dirty = true;

	if (images->desc.buffer) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  images->desc.buffer,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_DESCRIPTORS);
	}
}
Example #7
static void si_reinitialize_ce_ram(struct si_context *sctx,
                            struct si_descriptors *desc)
{
	if (desc->buffer) {
		struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
		unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
		uint64_t va = buffer->gpu_address + desc->buffer_offset;
		struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;

		if (!ib)
			ib = sctx->ce_ib;

		list_size = align(list_size, 32);

		radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
		radeon_emit(ib, va);
		radeon_emit(ib, va >> 32);
		radeon_emit(ib, list_size / 4);
		radeon_emit(ib, desc->ce_offset);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
		                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
	}
	desc->ce_ram_dirty = false;
}
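
An aside: LOAD_CONST_RAM requires the byte count to be a multiple of 32, which is why list_size is aligned above. A one-function sketch of that power-of-two round-up; align_pow2() is an invented name for what Mesa's align() does.

#include <stdio.h>

/* Round value up to the next multiple of alignment; alignment must be a
 * power of two. */
static unsigned align_pow2(unsigned value, unsigned alignment)
{
	return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
	printf("%u %u %u\n", align_pow2(1, 32), align_pow2(32, 32),
	       align_pow2(33, 32));	/* 32 32 64 */
	return 0;
}
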
Example #8
void r600_emit_pfp_sync_me(struct r600_context *rctx)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

	if (rctx->b.chip_class >= EVERGREEN &&
	    rctx->b.screen->info.drm_minor >= 46) {
		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cs, 0);
	} else {
		/* Emulate PFP_SYNC_ME by writing a value to memory in ME and
		 * waiting for it in PFP.
		 */
		struct r600_resource *buf = NULL;
		unsigned offset, reloc;
		uint64_t va;

		/* 16-byte address alignment is required by WAIT_REG_MEM. */
		u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
				     &offset, (struct pipe_resource**)&buf);
		if (!buf) {
			/* This is too heavyweight, but will work. */
			rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
			return;
		}

		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
						  RADEON_USAGE_READWRITE,
						  RADEON_PRIO_FENCE);

		va = buf->gpu_address + offset;
		assert(va % 16 == 0);

		/* Write 1 to memory in ME. */
		radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
		radeon_emit(cs, va);
		radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
		radeon_emit(cs, 1);
		radeon_emit(cs, 0);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, reloc);

		/* Wait in PFP (PFP can only do GEQUAL against memory). */
		radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
		radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
			        WAIT_REG_MEM_MEMORY |
			        WAIT_REG_MEM_PFP);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, 1); /* reference value */
		radeon_emit(cs, 0xffffffff); /* mask */
		radeon_emit(cs, 4); /* poll interval */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, reloc);

		r600_resource_reference(&buf, NULL);
	}
}
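
An aside on the emulation above: WAIT_REG_MEM makes the PFP poll a memory dword until (value & mask) >= reference. A host-side sketch of just that comparison; wait_condition_met() is an invented name, and the real polling of course happens on the GPU.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* GEQUAL-against-memory test as WAIT_REG_MEM applies it: mask the dword
 * read from memory, then compare it with the reference value. */
static bool wait_condition_met(uint32_t mem_value, uint32_t reference,
			       uint32_t mask)
{
	return (mem_value & mask) >= reference;
}

int main(void)
{
	/* ME writes 1 to memory, then PFP waits for >= 1. */
	printf("%d %d\n",
	       wait_condition_met(0, 1, 0xffffffffu),	/* 0: keep polling */
	       wait_condition_met(1, 1, 0xffffffffu));	/* 1: proceed */
	return 0;
}
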
Example #9
static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->desc.num_elements);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			if (!buffer) {
				/* Just unbind on failure. */
				si_set_constant_buffer(ctx, shader, slot, NULL);
				return;
			}
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc.list + slot*4;
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Unbind path (assumed completion, mirroring the clear-and-
		 * disable pattern used by the other descriptor updates):
		 * zero the descriptor and clear the slot's enabled bit. */
		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.list_dirty = true;
}
Example #10
static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	if (!states->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
Example #11
static bool si_upload_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc,
				  struct r600_atom * atom)
{
	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;

	if (!desc->dirty_mask)
		return true;

	if (sctx->ce_ib) {
		uint32_t const* list = (uint32_t const*)desc->list;

		if (desc->ce_ram_dirty)
			si_reinitialize_ce_ram(sctx, desc);

		while(desc->dirty_mask) {
			int begin, count;
			u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
						     &count);

			begin *= desc->element_dw_size;
			count *= desc->element_dw_size;

			radeon_emit(sctx->ce_ib,
			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
			radeon_emit_array(sctx->ce_ib, list + begin, count);
		}

		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
		                           &desc->buffer_offset, &desc->buffer))
			return false;
	} else {
		void *ptr;

		u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
			&desc->buffer_offset,
			(struct pipe_resource**)&desc->buffer, &ptr);
		if (!desc->buffer)
			return false; /* skip the draw call */

		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
	}
	desc->pointer_dirty = true;
	desc->dirty_mask = 0;

	if (atom)
		si_mark_atom_dirty(sctx, atom);

	return true;
}
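
An aside: u_bit_scan_consecutive_range() walks the dirty mask one run of set bits at a time, so each run of dirty elements becomes a single WRITE_CONST_RAM packet. A standalone sketch of that scan over a 32-bit mask; scan_consecutive_range() is an invented stand-in and uses the GCC/Clang __builtin_ctz().

#include <stdint.h>
#include <stdio.h>

/* Pop the lowest run of consecutive set bits from *mask (which must be
 * non-zero) and report its start and length. */
static void scan_consecutive_range(uint32_t *mask, int *start, int *count)
{
	int bit = __builtin_ctz(*mask);	/* lowest set bit */
	int n = 0;

	while (bit + n < 32 && (*mask & (1u << (bit + n)))) {
		*mask &= ~(1u << (bit + n));
		n++;
	}
	*start = bit;
	*count = n;
}

int main(void)
{
	uint32_t mask = 0x000000f6u;	/* runs: bits 1-2 and bits 4-7 */

	while (mask) {
		int start, count;

		scan_consecutive_range(&mask, &start, &count);
		printf("run at %d, length %d\n", start, count);
	}
	return 0;
}
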
Example #12
static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	uint64_t mask = buffers->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffers->buffers[i],
				      buffers->shader_usage, buffers->priority);
	}

	if (!buffers->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_DESCRIPTORS);
}
Example #13
static void si_sampler_view_add_buffer(struct si_context *sctx,
				       struct pipe_resource *resource,
				       enum radeon_bo_usage usage)
{
	struct r600_resource *rres = (struct r600_resource*)resource;

	if (!resource)
		return;

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, usage,
				  r600_get_sampler_view_priority(rres));
}
Example #14
void r600_dma_copy_buffer(struct r600_context *rctx,
			  struct pipe_resource *dst,
			  struct pipe_resource *src,
			  uint64_t dst_offset,
			  uint64_t src_offset,
			  uint64_t size)
{
	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
	unsigned i, ncopy, csize;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* Mark the buffer range of the destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

	size >>= 2; /* convert to dwords */
	ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) + !!(size % R600_DMA_COPY_MAX_SIZE_DW);

	r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc);
	for (i = 0; i < ncopy; i++) {
		csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
		/* emit reloc before writing cs so that cs is always in consistent state */
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
				      RADEON_PRIO_SDMA_BUFFER);
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
				      RADEON_PRIO_SDMA_BUFFER);
		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
		radeon_emit(cs, dst_offset & 0xfffffffc);
		radeon_emit(cs, src_offset & 0xfffffffc);
		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
		dst_offset += csize << 2;
		src_offset += csize << 2;
		size -= csize;
	}
	r600_dma_emit_wait_idle(&rctx->b);
}
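
An aside: both DMA copy paths split a transfer into packets of at most a fixed size. A standalone sketch of that chunking arithmetic, checking that the chunks cover the whole transfer exactly; MAX_CHUNK and split_copy() are illustrative names, not the driver's constants.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK 0xffff	/* illustrative per-packet limit */

/* Number of packets a copy of 'size' units needs when each packet moves at
 * most MAX_CHUNK units. */
static unsigned split_copy(uint64_t size)
{
	unsigned ncopy = (size + MAX_CHUNK - 1) / MAX_CHUNK;	/* ceil */
	uint64_t left = size;

	for (unsigned i = 0; i < ncopy; i++) {
		uint64_t csize = left < MAX_CHUNK ? left : MAX_CHUNK;
		/* a real implementation emits one packet of csize here */
		left -= csize;
	}
	assert(left == 0);	/* the chunks cover the transfer exactly */
	return ncopy;
}

int main(void)
{
	printf("%u %u %u\n", split_copy(1), split_copy(MAX_CHUNK),
	       split_copy((uint64_t)MAX_CHUNK * 3 + 1));	/* 1 1 4 */
	return 0;
}
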
Example #15
static void cik_sdma_do_copy_buffer(struct si_context *ctx,
				    struct pipe_resource *dst,
				    struct pipe_resource *src,
				    uint64_t dst_offset,
				    uint64_t src_offset,
				    uint64_t size)
{
	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
	unsigned i, ncopy, csize;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
	r600_need_dma_space(&ctx->b, ncopy * 7);

	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
			      RADEON_PRIO_SDMA_BUFFER);
	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
			      RADEON_PRIO_SDMA_BUFFER);

	for (i = 0; i < ncopy; i++) {
		csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE;
		cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
						     CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
						     0);
		cs->buf[cs->cdw++] = csize;
		cs->buf[cs->cdw++] = 0; /* src/dst endian swap */
		cs->buf[cs->cdw++] = src_offset;
		cs->buf[cs->cdw++] = src_offset >> 32;
		cs->buf[cs->cdw++] = dst_offset;
		cs->buf[cs->cdw++] = dst_offset >> 32;
		dst_offset += csize;
		src_offset += csize;
		size -= csize;
	}
}
Example #16
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	uint64_t mask = views->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		if (!rview->resource)
			continue;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      r600_get_sampler_view_priority(rview->resource));
	}

	if (!views->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
Example #17
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

	for (int i = 0; i < state->nbo; ++i) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
				      state->bo_usage[i], state->bo_priority[i]);
	}

	if (!state->indirect_buffer) {
		radeon_emit_array(cs, state->pm4, state->ndw);
	} else {
		struct r600_resource *ib = state->indirect_buffer;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
					  RADEON_USAGE_READ,
                                          RADEON_PRIO_IB2);

		radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
		radeon_emit(cs, ib->gpu_address);
		radeon_emit(cs, ib->gpu_address >> 32);
		radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
	}
}
Example #18
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	unsigned mask = views->desc.enabled_mask;

	/* Add buffers to the CS. */
	while (mask) {
		int i = u_bit_scan(&mask);

		si_sampler_view_add_buffer(sctx, views->views[i]->texture,
					   RADEON_USAGE_READ);
	}

	views->desc.ce_ram_dirty = true;

	if (!views->desc.buffer)
		return;
	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
Example #19
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
			 unsigned *out_offset, struct r600_resource **out_buf) {
	uint64_t va;

	u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
			     (struct pipe_resource**)out_buf);
	if (!*out_buf)
		return false;

	va = (*out_buf)->gpu_address + *out_offset;

	radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
	radeon_emit(sctx->ce_ib, ce_offset);
	radeon_emit(sctx->ce_ib, size / 4);
	radeon_emit(sctx->ce_ib, va);
	radeon_emit(sctx->ce_ib, va >> 32);

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
	                       RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);

	sctx->ce_need_synchronization = true;
	return true;
}
Example #20
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
			     struct pipe_resource *dst, uint64_t dst_offset,
			     struct pipe_resource *src, uint64_t src_offset,
			     unsigned size)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

	assert(size);
	assert(rctx->screen->b.has_cp_dma);

	/* Mark the buffer range of the destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resources are bound. */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE |
			 R600_CONTEXT_FLUSH_AND_INV |
			 R600_CONTEXT_FLUSH_AND_INV_CB |
			 R600_CONTEXT_FLUSH_AND_INV_DB |
			 R600_CONTEXT_FLUSH_AND_INV_CB_META |
			 R600_CONTEXT_FLUSH_AND_INV_DB_META |
			 R600_CONTEXT_STREAMOUT_FLUSH |
			 R600_CONTEXT_WAIT_3D_IDLE;

	/* There are differences between R700 and EG in CP DMA,
	 * but we only use the common bits here. */
	while (size) {
		unsigned sync = 0;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned src_reloc, dst_reloc;

		r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);

		/* Flush the caches for the first copy only. */
		if (rctx->b.flags) {
			r600_flush_emit(rctx);
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync = PKT3_CP_DMA_CP_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
						  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);

		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_offset);	/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync | ((src_offset >> 32) & 0xff));		/* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
		radeon_emit(cs, dst_offset);	/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_offset >> 32) & 0xff);		/* DST_ADDR_HI [7:0] */
		radeon_emit(cs, byte_count);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, src_reloc);
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, dst_reloc);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Invalidate the read caches. */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
}
Example #21
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
	uint64_t vram = ctx->dma.cs->used_vram;
	uint64_t gtt = ctx->dma.cs->used_gart;

	if (dst) {
		vram += dst->vram_usage;
		gtt += dst->gart_usage;
	}
	if (src) {
		vram += src->vram_usage;
		gtt += src->gart_usage;
	}

	/* Flush the GFX IB if DMA depends on it. */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ((dst &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
					       RADEON_USAGE_READWRITE)) ||
	     (src &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
					       RADEON_USAGE_WRITE))))
		ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

	/* Flush if there's not enough space, or if the memory usage per IB
	 * is too large.
	 *
	 * IBs using too little memory are limited by the IB submission overhead.
	 * IBs using too much memory are limited by the kernel/TTM overhead.
	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
	 *
	 * This heuristic makes sure that DMA requests are executed
	 * very soon after the call is made and lowers memory usage.
	 * It improves texture upload performance by keeping the DMA
	 * engine busy while uploads are being submitted.
	 */
	num_dw++; /* for emit_wait_idle below */
	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
	}

	/* Wait for idle if either buffer has been used in the IB before to
	 * prevent read-after-write hazards.
	 */
	if ((dst &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
					      RADEON_USAGE_READWRITE)) ||
	    (src &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
					      RADEON_USAGE_WRITE)))
		r600_dma_emit_wait_idle(ctx);

	/* If GPUVM is not supported, the CS checker needs 2 entries
	 * in the buffer list per packet, which has to be done manually.
	 */
	if (ctx->screen->info.has_virtual_memory) {
		if (dst)
			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
						  RADEON_USAGE_WRITE,
						  RADEON_PRIO_SDMA_BUFFER);
		if (src)
			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SDMA_BUFFER);
	}

	/* this function is called before all DMA calls, so increment this. */
	ctx->num_dma_calls++;
}
Example #22
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
	unsigned i;

	/* Make sure that the gfx ring is the only one active. */
	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	/* emit config state */
	if (ctx->b.chip_class == EVERGREEN)
		r600_emit_atom(ctx, &ctx->config_state.atom);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
		      R600_CONTEXT_INV_VERTEX_CACHE |
	              R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}
Example #23
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
			     struct pipe_resource *dst, uint64_t dst_offset,
			     struct pipe_resource *src, uint64_t src_offset,
			     unsigned size)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

	assert(size);
	assert(rctx->screen->b.has_cp_dma);

	/* Mark the buffer range of the destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resources are bound. */
	rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) |
			 R600_CONTEXT_WAIT_3D_IDLE;

	/* There are differences between R700 and EG in CP DMA,
	 * but we only use the common bits here. */
	while (size) {
		unsigned sync = 0;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned src_reloc, dst_reloc;

		r600_need_cs_space(rctx,
				   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
				   3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);

		/* Flush the caches for the first copy only. */
		if (rctx->b.flags) {
			r600_flush_emit(rctx);
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync = PKT3_CP_DMA_CP_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
						  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);

		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_offset);	/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync | ((src_offset >> 32) & 0xff));		/* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
		radeon_emit(cs, dst_offset);	/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_offset >> 32) & 0xff);		/* DST_ADDR_HI [7:0] */
		radeon_emit(cs, byte_count);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, src_reloc);
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, dst_reloc);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */
	if (rctx->b.chip_class == R600)
		radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
				      S_008040_WAIT_CP_DMA_IDLE(1));

	/* CP DMA is executed in ME, but index buffers are read by PFP.
	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
	 * indices. If we wanted to execute CP DMA in PFP, this packet
	 * should precede it.
	 */
	r600_emit_pfp_sync_me(rctx);
}
Example #24
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_buffers_dirty)
		return true;
	if (!sctx->vertex_elements)
		return true;

	count = sctx->vertex_elements->count;
	if (!count)
		return true;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
	if (!desc->buffer)
		return false;

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);

	assert(count <= SI_NUM_VERTEX_BUFFERS);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (!rbuffer) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);

		if (sctx->b.chip_class <= CIK && vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;

		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
	desc->pointer_dirty = true;
	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
	sctx->vertex_buffers_dirty = false;
	return true;
}
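
An aside on desc[2] above: when the stride is non-zero on older chips, NUM_RECORDS is derived with the "round up by rounding down and adding 1" trick. A standalone sketch of that arithmetic; num_records() is an invented helper name, and a non-zero stride is assumed.

#include <stdio.h>

/* How many whole records of 'stride' bytes fit such that the last one can
 * still read 'format_size' bytes: divide the usable span by the stride
 * (rounding down) and count the final partial record. */
static unsigned num_records(unsigned buffer_size, unsigned offset,
			    unsigned stride, unsigned format_size)
{
	return (buffer_size - offset - format_size) / stride + 1;
}

int main(void)
{
	/* 100-byte buffer, 16-byte stride, 8-byte attribute, no offset:
	 * records at 0, 16, ..., 80 fit (6 of them); a record at 96 would
	 * run past the end. */
	printf("%u\n", num_records(100, 0, 16, 8));	/* 6 */
	return 0;
}
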
Example #25
static void cik_sdma_copy_tile(struct si_context *ctx,
			       struct pipe_resource *dst,
			       unsigned dst_level,
			       struct pipe_resource *src,
			       unsigned src_level,
			       unsigned y,
			       unsigned copy_height,
			       unsigned y_align,
			       unsigned pitch,
			       unsigned bpe)
{
	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
	struct si_screen *sscreen = ctx->screen;
	struct r600_texture *rsrc = (struct r600_texture*)src;
	struct r600_texture *rdst = (struct r600_texture*)dst;
	struct r600_texture *rlinear, *rtiled;
	unsigned linear_lvl, tiled_lvl;
	unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size;
	unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode;
	unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
	uint64_t base, addr;
	unsigned pipe_config, tile_mode_index;

	dst_mode = rdst->surface.level[dst_level].mode;
	src_mode = rsrc->surface.level[src_level].mode;
	assert(dst_mode != src_mode);
	assert(src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED || dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED);

	sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED;
	lbpe = util_logbase2(bpe);
	pitch_tile_max = ((pitch / bpe) / 8) - 1;

	detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
	rlinear = detile ? rdst : rsrc;
	rtiled = detile ? rsrc : rdst;
	linear_lvl = detile ? dst_level : src_level;
	tiled_lvl = detile ? src_level : dst_level;

	assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));

	array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode);
	slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x *
			  rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1;
	height = rlinear->surface.level[linear_lvl].nblk_y;
	base = rtiled->surface.level[tiled_lvl].offset;
	addr = rlinear->surface.level[linear_lvl].offset;
	bank_h = cik_bank_wh(rtiled->surface.bankh);
	bank_w = cik_bank_wh(rtiled->surface.bankw);
	mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea);
	tile_split = cik_tile_split(rtiled->surface.tile_split);
	tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false);
	nbanks = si_num_banks(sscreen, rtiled);
	base += rtiled->resource.gpu_address;
	addr += rlinear->resource.gpu_address;

	pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
	mt = cik_micro_tile_mode(sscreen, tile_mode_index);

	size = (copy_height * pitch) / 4;
	cheight = copy_height;
	if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
		cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
		cheight &= ~(y_align - 1);
	}
	ncopy = (copy_height + cheight - 1) / cheight;
	r600_need_dma_space(&ctx->b, ncopy * 12);

	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
			      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
			      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);

	copy_height = size * 4 / pitch;
	for (i = 0; i < ncopy; i++) {
		cheight = copy_height;
		if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
			cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
			cheight &= ~(y_align - 1);
		}
		size = (cheight * pitch) / 4;

		cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
						     sub_op, detile << 15);
		cs->buf[cs->cdw++] = base;
		cs->buf[cs->cdw++] = base >> 32;
		cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max;
		cs->buf[cs->cdw++] = slice_tile_max;
		cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) |
			(nbanks << 21) | (bank_h << 18) | (bank_w << 15) |
			(tile_split << 11) | (mt << 8) | (array_mode << 3) |
			lbpe;
		cs->buf[cs->cdw++] = y << 16; /* | x */
		cs->buf[cs->cdw++] = 0; /* z */
		cs->buf[cs->cdw++] = addr & 0xfffffffc;
		cs->buf[cs->cdw++] = addr >> 32;
		cs->buf[cs->cdw++] = (pitch / bpe) - 1;
		cs->buf[cs->cdw++] = size;

		copy_height -= cheight;
		y += cheight;
	}
}
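
An aside: the tiled path clamps the per-packet height and then rounds it down to the tiling row alignment with cheight &= ~(y_align - 1). A tiny standalone sketch of that round-down; round_down_pow2() is an invented name and y_align is assumed to be a power of two.

#include <stdio.h>

/* Round value down to a multiple of alignment (a power of two). */
static unsigned round_down_pow2(unsigned value, unsigned alignment)
{
	return value & ~(alignment - 1);
}

int main(void)
{
	/* A 300-row chunk with an 8-row alignment becomes 296 rows. */
	printf("%u %u\n", round_down_pow2(300, 8), round_down_pow2(296, 8));
	return 0;
}
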