static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			      unsigned offset, unsigned size, unsigned value)
{
	struct r600_context *rctx = (struct r600_context*)ctx;

	if (rctx->screen->b.has_cp_dma &&
	    rctx->b.chip_class >= EVERGREEN &&
	    offset % 4 == 0 && size % 4 == 0) {
		evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
	} else if (rctx->screen->b.has_streamout &&
		   offset % 4 == 0 && size % 4 == 0) {
		union pipe_color_union clear_value;
		clear_value.ui[0] = value;

		r600_blitter_begin(ctx, R600_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(rctx->blitter, dst, offset, size,
					  1, &clear_value);
		r600_blitter_end(ctx);
	} else {
		/* CPU fallback: map synchronously and fill the range by hand. */
		uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b,
								r600_resource(dst),
								PIPE_TRANSFER_WRITE);
		if (!map)
			return;
		/* Start writing at the requested byte offset, not at the
		 * beginning of the buffer. */
		map += offset / 4;
		size /= 4;
		for (unsigned i = 0; i < size; i++)
			*map++ = value;
	}
}
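/* A minimal usage sketch, not from the original source: in this era the driver
 * typically exposes the helper through a common-context hook (something like
 * "rctx->b.clear_buffer = r600_clear_buffer;" at context creation); the hook
 * name and the assumption that width0 is 4-byte aligned are mine. */
static void example_clear_whole_buffer(struct r600_context *rctx,
				       struct pipe_resource *dst)
{
	/* Clear every dword of 'dst' to zero. With offset 0 and a 4-aligned
	 * size this takes the fast CP DMA or streamout path above. */
	rctx->b.clear_buffer(&rctx->b.b, dst, 0, dst->width0, 0);
}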
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo,
					    PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
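/* A hedged sketch of the caller side: a state tracker hands the driver a blob
 * laid out as a pipe_llvm_program_header immediately followed by the program
 * bytes, which is exactly what the parsing above expects. The helper name and
 * the 'program_blob'/'input_bytes' parameters are invented for illustration;
 * whether pipe_compute_state carries additional fields depends on the Mesa era. */
static void *example_bind_compute(struct pipe_context *ctx,
				  const void *program_blob,
				  unsigned input_bytes)
{
	struct pipe_compute_state cs_state;
	void *cs;

	memset(&cs_state, 0, sizeof(cs_state));
	cs_state.prog = program_blob;		/* header + program bytes */
	cs_state.req_local_mem = 0;
	cs_state.req_private_mem = 0;
	cs_state.req_input_mem = input_bytes;

	cs = ctx->create_compute_state(ctx, &cs_state);
	ctx->bind_compute_state(ctx, cs);
	return cs;
}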
static void evergreen_launch_grid(struct pipe_context *ctx_,
				  const uint *block_layout, const uint *grid_layout,
				  uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	boolean use_kill;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[pc];
	(void)use_kill;
	if (!kernel->code_bo) {
		void *p;
		struct r600_bytecode *bc = &kernel->bc;
		LLVMModuleRef mod = kernel->llvm_module;
		boolean use_kill = false;
		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
		unsigned sb_disasm = use_sb ||
				     (ctx->screen->b.debug_flags & DBG_SB_DISASM);

		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
				   ctx->screen->has_compressed_msaa_texturing);
		bc->type = TGSI_PROCESSOR_COMPUTE;
		bc->isa = ctx->isa;
		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump,
				  &ctx->b.debug);

		if (dump && !sb_disasm) {
			r600_bytecode_disasm(bc);
		} else if ((dump && sb_disasm) || use_sb) {
			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
				R600_ERR("r600_sb_bytecode_process failed!\n");
		}

		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo,
						    PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
	}
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
#else
	ctx->cs_shader_state.pc = pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc,
				       &use_kill);
#endif
#endif

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
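/* A hedged dispatch sketch matching the old Gallium launch_grid signature used
 * above (block/grid layouts as uint[3] arrays, a kernel index 'pc', and an
 * opaque input block). The helper name and the layout values are arbitrary
 * examples, not anything from the original source. */
static void example_launch(struct pipe_context *ctx, const void *kernel_input)
{
	static const uint block_layout[3] = { 64, 1, 1 };	/* threads per block */
	static const uint grid_layout[3]  = { 16, 1, 1 };	/* blocks per grid   */

	/* pc = 0 selects the first kernel in the bound compute state. */
	ctx->launch_grid(ctx, block_layout, grid_layout, 0, kernel_input);
}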
void *evergreen_create_compute_state(struct pipe_context *ctx_,
				     const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#if HAVE_LLVM < 0x0306
	(void)use_kill;
	(void)p;
	shader->llvm_ctx = LLVMContextCreate();
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
							  header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel),
				 shader->num_kernels);
	{
		unsigned i;
		for (i = 0; i < shader->num_kernels; i++) {
			struct r600_kernel *kernel = &shader->kernels[i];
			kernel->llvm_module = radeon_llvm_get_kernel_module(
				shader->llvm_ctx, i, code, header->num_bytes);
		}
	}
#else
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload the compiled bytecode to VRAM. */
	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo,
					    PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif
#endif

	shader->ctx = ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
				      struct pipe_resource *resource,
				      unsigned level,
				      unsigned usage,
				      const struct pipe_box *box,
				      struct pipe_transfer **ptransfer)
{
	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
	struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
	struct r600_resource *rbuffer = r600_resource(resource);
	uint8_t *data;

	assert(box->x + box->width <= resource->width0);

	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
	    usage & PIPE_TRANSFER_WRITE &&
	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x,
				   box->x + box->width)) {
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	/* If discarding the entire range, discard the whole resource instead. */
	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
	    box->x == 0 && box->width == resource->width0) {
		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
	}

	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		if (r600_invalidate_buffer(rctx, rbuffer)) {
			/* At this point, the buffer is always idle. */
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
		}
	}
	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
		 !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
		 r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
			/* Do a wait-free write-only transfer using a temporary buffer. */
			unsigned offset;
			struct r600_resource *staging = NULL;

			u_upload_alloc(rctx->uploader, 0,
				       box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
				       256, &offset, (struct pipe_resource**)&staging,
				       (void**)&data);

			if (staging) {
				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
				return r600_buffer_get_transfer(ctx, resource, level, usage, box,
								ptransfer, data, staging, offset);
			}
		} else {
			/* At this point, the buffer is always idle (we checked it above). */
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
		}
	}
	/* Using a staging buffer in GTT for larger reads is much faster. */
	else if ((usage & PIPE_TRANSFER_READ) &&
		 !(usage & PIPE_TRANSFER_WRITE) &&
		 rbuffer->domains == RADEON_DOMAIN_VRAM &&
		 r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
		struct r600_resource *staging;

		staging = (struct r600_resource*) pipe_buffer_create(
				ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING,
				box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT));
		if (staging) {
			/* Copy the VRAM buffer to the staging buffer. */
			rctx->dma_copy(ctx, &staging->b.b, 0,
				       box->x % R600_MAP_BUFFER_ALIGNMENT,
				       0, 0, resource, level, box);

			data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ);
			data += box->x % R600_MAP_BUFFER_ALIGNMENT;

			return r600_buffer_get_transfer(ctx, resource, level, usage, box,
							ptransfer, data, staging, 0);
		}
	}

	data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
	if (!data) {
		return NULL;
	}
	data += box->x;

	return r600_buffer_get_transfer(ctx, resource, level, usage, box,
					ptransfer, data, NULL, 0);
}
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
				      struct pipe_resource *resource,
				      unsigned level,
				      unsigned usage,
				      const struct pipe_box *box,
				      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(resource);
	uint8_t *data;

	assert(box->x + box->width <= resource->width0);

	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
	    usage & PIPE_TRANSFER_WRITE &&
	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x,
				   box->x + box->width)) {
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			unsigned i, mask;

			/* Discard the buffer. */
			pb_reference(&rbuffer->buf, NULL);

			/* Create a new one in the same pipe_resource. */
			/* XXX We probably want a different alignment for buffers and textures. */
			r600_init_resource(&rctx->screen->b, rbuffer, rbuffer->b.b.width0,
					   4096, TRUE, rbuffer->b.b.usage);

			/* We changed the buffer, now we need to bind it where the old one was bound. */
			/* Vertex buffers. */
			mask = rctx->vertex_buffer_state.enabled_mask;
			while (mask) {
				i = u_bit_scan(&mask);
				if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) {
					rctx->vertex_buffer_state.dirty_mask |= 1 << i;
					r600_vertex_buffers_dirty(rctx);
				}
			}
			/* Streamout buffers. */
			for (i = 0; i < rctx->b.streamout.num_targets; i++) {
				if (rctx->b.streamout.targets[i]->b.buffer == &rbuffer->b.b) {
					if (rctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&rctx->b);
					}
					rctx->b.streamout.append_bitmask = rctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&rctx->b);
				}
			}
			/* Constant buffers. */
			r600_set_constants_dirty_if_bound(rctx, rbuffer);
		}
	}
	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
		 !(rctx->screen->b.debug_flags & DBG_NO_DISCARD_RANGE) &&
		 (rctx->screen->has_cp_dma ||
		  (rctx->screen->has_streamout &&
		   /* The buffer range must be aligned to 4 with streamout. */
		   box->x % 4 == 0 && box->width % 4 == 0))) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			/* Do a wait-free write-only transfer using a temporary buffer. */
			unsigned offset;
			struct r600_resource *staging = NULL;

			u_upload_alloc(rctx->uploader, 0,
				       box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
				       &offset, (struct pipe_resource**)&staging, (void**)&data);

			if (staging) {
				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
				return r600_buffer_get_transfer(ctx, resource, level, usage, box,
								ptransfer, data, staging, offset);
			}
		}
	}

	/* mmap and synchronize with rings */
	data = r600_buffer_map_sync_with_rings(&rctx->b, rbuffer, usage);
	if (!data) {
		return NULL;
	}
	data += box->x;

	return r600_buffer_get_transfer(ctx, resource, level, usage, box,
					ptransfer, data, NULL, 0);
}
static void *r600_texture_transfer_map(struct pipe_context *ctx,
				       struct pipe_resource *texture,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
	struct r600_texture *rtex = (struct r600_texture*)texture;
	struct r600_transfer *trans;
	boolean use_staging_texture = FALSE;
	struct r600_resource *buf;
	unsigned offset = 0;
	char *map;

	/* We cannot map a tiled texture directly because the data is
	 * in a different order, therefore we do detiling using a blit.
	 *
	 * Also, use a temporary in GTT memory for read transfers, as
	 * the CPU is much happier reading out of cached system memory
	 * than uncached VRAM. */
	if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
		use_staging_texture = TRUE;
	} else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_MAP_DIRECTLY) &&
		   (rtex->resource.domains == RADEON_DOMAIN_VRAM)) {
		/* Untiled buffers in VRAM, which is slow for CPU reads */
		use_staging_texture = TRUE;
	} else if (!(usage & PIPE_TRANSFER_READ) &&
		   (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf,
						    RADEON_USAGE_READWRITE) ||
		    !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
		/* Use a staging texture for uploads if the underlying BO is busy. */
		use_staging_texture = TRUE;
	}

	if (texture->flags & R600_RESOURCE_FLAG_TRANSFER) {
		use_staging_texture = FALSE;
	}

	if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
		return NULL;
	}

	trans = CALLOC_STRUCT(r600_transfer);
	if (trans == NULL)
		return NULL;
	trans->transfer.resource = texture;
	trans->transfer.level = level;
	trans->transfer.usage = usage;
	trans->transfer.box = *box;

	if (rtex->is_depth) {
		struct r600_texture *staging_depth;

		if (rtex->resource.b.b.nr_samples > 1) {
			/* MSAA depth buffers need to be converted to single sample buffers.
			 *
			 * Mapping MSAA depth buffers can occur if ReadPixels is called
			 * with a multisample GLX visual.
			 *
			 * First downsample the depth buffer to a temporary texture,
			 * then decompress the temporary one to staging.
			 *
			 * Only the region being mapped is transferred.
			 */
			struct pipe_resource resource;

			r600_init_temp_resource_from_box(&resource, texture, box, level, 0);

			if (!r600_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
				R600_ERR("failed to create temporary texture to hold untiled copy\n");
				FREE(trans);
				return NULL;
			}

			if (usage & PIPE_TRANSFER_READ) {
				struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);

				r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
				rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth,
							    0, 0, 0, box->depth, 0, 0);
				pipe_resource_reference((struct pipe_resource**)&temp, NULL);
			}
		}
		else {
			/* XXX: only readback the rectangle which is being mapped? */
			/* XXX: when discard is true, no need to read back from depth texture */
			if (!r600_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
				R600_ERR("failed to create temporary texture to hold untiled copy\n");
				FREE(trans);
				return NULL;
			}

			rctx->blit_decompress_depth(ctx, rtex, staging_depth,
						    level, level,
						    box->z, box->z + box->depth - 1,
						    0, 0);

			offset = r600_texture_get_offset(staging_depth, level, box);
		}

		trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes;
		trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size;
		trans->staging = (struct r600_resource*)staging_depth;
	} else if (use_staging_texture) {
		struct pipe_resource resource;
		struct r600_texture *staging;

		r600_init_temp_resource_from_box(&resource, texture, box, level,
						 R600_RESOURCE_FLAG_TRANSFER);
		resource.usage = (usage & PIPE_TRANSFER_READ) ?
			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;

		/* Create the temporary texture. */
		staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource);
		if (staging == NULL) {
			R600_ERR("failed to create temporary texture to hold untiled copy\n");
			FREE(trans);
			return NULL;
		}
		trans->staging = &staging->resource;
		trans->transfer.stride = staging->surface.level[0].pitch_bytes;
		trans->transfer.layer_stride = staging->surface.level[0].slice_size;

		if (usage & PIPE_TRANSFER_READ) {
			r600_copy_to_staging_texture(ctx, trans);
		}
	} else {
		/* the resource is mapped directly */
		trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
		trans->transfer.layer_stride = rtex->surface.level[level].slice_size;
		offset = r600_texture_get_offset(rtex, level, box);
	}

	if (trans->staging) {
		buf = trans->staging;
		if (!rtex->is_depth && !(usage & PIPE_TRANSFER_READ))
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	} else {
		buf = &rtex->resource;
	}

	if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) {
		pipe_resource_reference((struct pipe_resource**)&trans->staging, NULL);
		FREE(trans);
		return NULL;
	}

	*ptransfer = &trans->transfer;
	return map + offset;
}
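/* A hedged readback sketch using the Gallium transfer interface of this era
 * (ctx->transfer_map / ctx->transfer_unmap). The helper name and parameters
 * are invented, and a 4-byte pixel format is assumed. Rows in the returned
 * mapping are xfer->stride bytes apart, which matters when the driver hands
 * back a detiled staging copy rather than the texture itself. */
static void example_read_texture_rect(struct pipe_context *ctx,
				      struct pipe_resource *tex,
				      unsigned w, unsigned h, uint8_t *out)
{
	struct pipe_transfer *xfer;
	struct pipe_box box;
	const uint8_t *map;
	unsigned y;

	u_box_2d(0, 0, w, h, &box);
	map = ctx->transfer_map(ctx, tex, 0, PIPE_TRANSFER_READ, &box, &xfer);
	if (!map)
		return;
	for (y = 0; y < h; y++)
		memcpy(out + y * w * 4, map + y * xfer->stride, w * 4);
	ctx->transfer_unmap(ctx, xfer);
}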
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
				      struct pipe_resource *resource,
				      unsigned level,
				      unsigned usage,
				      const struct pipe_box *box,
				      struct pipe_transfer **ptransfer)
{
	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
	struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
	struct r600_resource *rbuffer = r600_resource(resource);
	uint8_t *data;

	assert(box->x + box->width <= resource->width0);

	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
	    usage & PIPE_TRANSFER_WRITE &&
	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x,
				   box->x + box->width)) {
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	/* If discarding the entire range, discard the whole resource instead. */
	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
	    box->x == 0 && box->width == resource->width0) {
		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
	}

	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
		}
		/* At this point, the buffer is always idle. */
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}
	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
		 !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
		 (rscreen->has_cp_dma ||
		  (rscreen->has_streamout &&
		   /* The buffer range must be aligned to 4 with streamout. */
		   box->x % 4 == 0 && box->width % 4 == 0))) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			/* Do a wait-free write-only transfer using a temporary buffer. */
			unsigned offset;
			struct r600_resource *staging = NULL;

			u_upload_alloc(rctx->uploader, 0,
				       box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
				       &offset, (struct pipe_resource**)&staging, (void**)&data);

			if (staging) {
				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
				return r600_buffer_get_transfer(ctx, resource, level, usage, box,
								ptransfer, data, staging, offset);
			} else {
				return NULL; /* error, shouldn't occur though */
			}
		}
		/* At this point, the buffer is always idle (we checked it above). */
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
	if (!data) {
		return NULL;
	}
	data += box->x;

	return r600_buffer_get_transfer(ctx, resource, level, usage, box,
					ptransfer, data, NULL, 0);
}
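/* A hedged sketch of the caller side of the discard-range path above: mapping
 * a busy buffer with PIPE_TRANSFER_DISCARD_RANGE lets the driver return a
 * temporary staging allocation instead of stalling on the GPU. The helper name
 * and parameters are invented for illustration. */
static void example_stream_vertices(struct pipe_context *ctx,
				    struct pipe_resource *vbo,
				    const void *verts, unsigned size,
				    unsigned offset)
{
	struct pipe_transfer *xfer;
	struct pipe_box box;
	void *map;

	u_box_1d(offset, size, &box);
	map = ctx->transfer_map(ctx, vbo, 0,
				PIPE_TRANSFER_WRITE |
				PIPE_TRANSFER_DISCARD_RANGE,
				&box, &xfer);
	if (!map)
		return;
	memcpy(map, verts, size);
	/* Unmap flushes the staging copy back to the real buffer if one was used. */
	ctx->transfer_unmap(ctx, xfer);
}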