示例#1
0
struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	int size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
示例#2
0
void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
		"%u (box.x)\n", buffer->chunk->id, box->x);


	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
			box->x + (buffer->chunk->start_in_dw * 4),
			box->width, usage, ptransfer);
}
示例#3
0
void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;
	uint32_t* map;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_get_transfer()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n");

	if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) {
		util_slab_free(&rctx->pool_transfers, transfer);
		return NULL;
	}

	*ptransfer = transfer;

	COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
		"+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
示例#4
0
void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx_);
	}
	else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);


	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx_, dst,
			offset, box->width, usage, ptransfer);
}
示例#5
0
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL 
	shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);

	for (i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
							header->num_bytes);
	}
#endif
	return shader;
}
示例#6
0
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL 
	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	if (!shader->kernels[pc].code_bo) {
		void *p;
		struct r600_kernel *kernel = &shader->kernels[pc];
		r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
#endif

	ctx->cs_shader_state.kernel_index = pc;
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
示例#7
0
void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
	struct r600_pipe_compute *shader = state;

	if (!shader)
		return;

#ifdef HAVE_OPENCL
#if HAVE_LLVM < 0x0306
	for (unsigned i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		LLVMDisposeModule(module);
	}
	FREE(shader->kernels);
	LLVMContextDispose(shader->llvm_ctx);
#else
	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO destroy shader->code_bo, shader->const_bo
	 * we'll need something like r600_buffer_free */
#endif
#endif
	FREE(shader);
}
示例#8
0
static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++)	{
		/* The First two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}
示例#9
0
static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}
示例#10
0
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
示例#11
0
void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
						ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
示例#12
0
static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}
示例#13
0
/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 * kernel parameters there are inplicit parameters that need to be stored
 * in the vertex buffer as well.  Here is how these parameters are organized in
 * the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, input_size);
	}

	num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}
示例#14
0
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	boolean use_kill;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[pc];
	(void)use_kill;
        if (!kernel->code_bo) {
                void *p;
                struct r600_bytecode *bc = &kernel->bc;
                LLVMModuleRef mod = kernel->llvm_module;
                boolean use_kill = false;
                bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
                unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
                unsigned sb_disasm = use_sb ||
                        (ctx->screen->b.debug_flags & DBG_SB_DISASM);

                r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
                           ctx->screen->has_compressed_msaa_texturing);
                bc->type = TGSI_PROCESSOR_COMPUTE;
                bc->isa = ctx->isa;
                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);

                if (dump && !sb_disasm) {
                        r600_bytecode_disasm(bc);
                } else if ((dump && sb_disasm) || use_sb) {
                        if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
                                R600_ERR("r600_sb_bytecode_process failed!\n");
                }

                kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
                                                        kernel->bc.ndw * 4);
                p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
                memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
                ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
        }
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
#else
	ctx->cs_shader_state.pc = pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
#endif
#endif

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);


	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
示例#15
0
static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading, LLVM puts them in text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}
示例#16
0
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("PC: %i\n", pc);

	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
示例#17
0
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#if HAVE_LLVM < 0x0306
        (void)use_kill;
	(void)p;
	shader->llvm_ctx = LLVMContextCreate();
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
				code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel),
				shader->num_kernels);
	{
		unsigned i;
		for (i = 0; i < shader->num_kernels; i++) {
			struct r600_kernel *kernel = &shader->kernels[i];
			kernel->llvm_module = radeon_llvm_get_kernel_module(
				shader->llvm_ctx, i, code, header->num_bytes);
		}
	}
#else
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif
#endif

	shader->ctx = ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
示例#18
0
static void evergreen_launch_grid(
    struct pipe_context *ctx_,
    const uint *block_layout, const uint *grid_layout,
    uint32_t pc, const void *input)
{
    COMPUTE_DBG("PC: %i\n", pc);

    struct r600_context *ctx = (struct r600_context *)ctx_;
    unsigned num_waves;
    unsigned num_pipes = ctx->screen->info.r600_max_pipes;
    unsigned wave_divisor = (16 * num_pipes);

    /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
    num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
                 wave_divisor - 1) / wave_divisor;

    COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
                num_pipes, num_waves);

    evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
    evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
    evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
    compute_emit_cs(ctx);
}
示例#19
0
static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO destroy shader->code_bo, shader->const_bo
	 * we'll need something like r600_buffer_free */
	FREE(shader);
}
示例#20
0
void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	struct r600_context *ctx = NULL;
	struct r600_resource_global* buffer = NULL;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	ctx = (struct r600_context *)ctx_;
	buffer = (struct r600_resource_global*)transfer->resource;

	COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
	util_slab_free(&ctx->pool_transfers, transfer);
}
示例#21
0
static void evergreen_set_rat(
	struct r600_pipe_compute *pipe,
	int id,
	struct r600_resource* bo,
	int start,
	int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->context.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
示例#22
0
static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
                                  info->pc, &use_kill);
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);


	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}
示例#23
0
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
	void *p;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

	COMPUTE_DBG("*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL 
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	shader->shader_code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);

	p = ctx->ws->buffer_map(shader->shader_code_bo->cs_buf, ctx->cs,
							PIPE_TRANSFER_WRITE);

	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->ws->buffer_unmap(shader->shader_code_bo->cs_buf);
	return shader;
}
示例#24
0
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
	unsigned i;

	/* make sure that the gfx ring is only one active */
	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	/* emit config state */
	if (ctx->b.chip_class == EVERGREEN)
		r600_emit_atom(ctx, &ctx->config_state.atom);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
		      R600_CONTEXT_INV_VERTEX_CACHE |
	              R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}
示例#25
0
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;

	/* make sure that the gfx ring is only one active */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++);

	ctx->flags = 0;

	COMPUTE_DBG(ctx->screen, "shader started\n");
}
示例#26
0
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	ctx->flags |= R600_CONTEXT_CB_FLUSH;
	r600_flush_emit(ctx);

	/* Emit cb_state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				///special case for textures
				if (resources[i].do_reloc
					[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->flags |= R600_CONTEXT_CB_FLUSH;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;

}
示例#27
0
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) oes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocat the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
	 */

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}
示例#28
0
/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 * kernel parameters there are inplicit parameters that need to be stored
 * in the vertex buffer as well.  Here is how these parameters are organized in
 * the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		unsigned buffer_size = shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
		shader->kernel_param->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
					(shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource*)shader->kernel_param);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(shader, 0, shader->kernel_param,
						shader->input_size, 0);
}
示例#29
0
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
#if HAVE_LLVM < 0x0306
		shader->active_kernel->bc.nlds_dw;
#else
		shader->bc.nlds_dw;
#endif


	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}
示例#30
0
/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 * kernel parameters there are implicit parameters that need to be stored
 * in the vertex buffer as well.  Here is how these parameters are organized in
 * the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx_->transfer_map(ctx_,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx_->transfer_unmap(ctx_, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}