/**
 * Destroy a buffer resource.
 *
 * Tears down the valid-range tracker, drops the reference held in ->buf and
 * finally frees the r600_resource itself. The screen argument is unused.
 */
static void r600_buffer_destroy(struct pipe_screen *screen,
                                struct pipe_resource *buf)
{
    struct r600_resource *res = r600_resource(buf);

    util_range_destroy(&res->valid_buffer_range);
    pb_reference(&res->buf, NULL);
    FREE(res);
}
/**
 * pipe_context hook for invalidating a resource.
 *
 * Only PIPE_BUFFER targets are handled; they are forwarded to
 * r600_invalidate_buffer() and its result is deliberately ignored.
 * Every other target is a no-op.
 */
void r600_invalidate_resource(struct pipe_context *ctx,
                              struct pipe_resource *resource)
{
    struct r600_common_context *common = (struct r600_common_context*)ctx;
    struct r600_resource *res = r600_resource(resource);

    /* We currently only do anything here for buffers. */
    if (resource->target != PIPE_BUFFER)
        return;

    (void)r600_invalidate_buffer(common, res);
}
static void cik_sdma_do_copy_buffer(struct si_context *ctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, uint64_t size) { struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; r600_need_dma_space(&ctx->b, ncopy * 7); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE; cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0); cs->buf[cs->cdw++] = csize; cs->buf[cs->cdw++] = 0; /* src/dst endian swap */ cs->buf[cs->cdw++] = src_offset; cs->buf[cs->cdw++] = src_offset >> 32; cs->buf[cs->cdw++] = dst_offset; cs->buf[cs->cdw++] = dst_offset >> 32; dst_offset += csize; src_offset += csize; size -= csize; } }
static void r600_buffer_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) { struct r600_context *rctx = (struct r600_context*)pipe; struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct r600_resource *rbuffer = r600_resource(transfer->resource); if (rtransfer->staging) { struct pipe_resource *dst, *src; unsigned soffset, doffset, size; dst = transfer->resource; src = &rtransfer->staging->b.b; size = transfer->box.width; doffset = transfer->box.x; soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT; /* Copy the staging buffer into the original one. */ if (rctx->b.rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset % 4)) { if (rctx->screen->b.chip_class >= EVERGREEN) { evergreen_dma_copy(rctx, dst, src, doffset, soffset, size); } else { r600_dma_copy(rctx, dst, src, doffset, soffset, size); } } else { struct pipe_box box; u_box_1d(soffset, size, &box); r600_copy_buffer(pipe, dst, doffset, src, &box); } pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); } if (transfer->usage & PIPE_TRANSFER_WRITE) { util_range_add(&rbuffer->valid_buffer_range, transfer->box.x, transfer->box.x + transfer->box.width); } util_slab_free(&rctx->pool_transfers, transfer); }
static void r600_buffer_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer) { struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct r600_resource *rbuffer = r600_resource(transfer->resource); if (rtransfer->staging) { struct pipe_resource *dst, *src; unsigned soffset, doffset, size; struct pipe_box box; dst = transfer->resource; src = &rtransfer->staging->b.b; size = transfer->box.width; doffset = transfer->box.x; soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT; u_box_1d(soffset, size, &box); /* Copy the staging buffer into the original one. */ if (!(size % 4) && !(doffset % 4) && !(soffset % 4) && rctx->dma_copy(ctx, dst, 0, doffset, 0, 0, src, 0, &box)) { /* DONE. */ } else { ctx->resource_copy_region(ctx, dst, 0, doffset, 0, 0, src, 0, &box); } pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); } if (transfer->usage & PIPE_TRANSFER_WRITE) { util_range_add(&rbuffer->valid_buffer_range, transfer->box.x, transfer->box.x + transfer->box.width); } util_slab_free(&rctx->pool_transfers, transfer); }
for (i = 0; i < rctx->streamout.num_targets; i++) { if (!t[i]) continue; t[i]->stride_in_dw = stride_in_dw[i]; if (rctx->chip_class >= SI) { /* SI binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader * through SGPRs what to do. */ r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ } else { uint64_t va = r600_resource(t[i]->b.buffer)->gpu_address; update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW); /* R7xx requires this packet after updating BUFFER_BASE. * Without this, R7xx locks up. */ if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) {
/**
 * Create and initialize the r600 pipe_screen on top of the winsys @ws.
 *
 * Installs the screen callbacks, runs the common screen initialization,
 * applies the R600_* debug environment options, derives the streamout / MSAA /
 * CP DMA capabilities from the chip class and DRM minor version, creates the
 * compute global memory pool and finally the auxiliary context.
 *
 * Returns a pointer to the embedded pipe_screen, or NULL when the screen
 * struct cannot be allocated, r600_common_screen_init() fails, or the chip
 * family is CHIP_UNKNOWN.
 */
struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
{
    struct r600_screen *rscreen = CALLOC_STRUCT(r600_screen);

    if (rscreen == NULL) {
        return NULL;
    }

    /* Set functions first. */
    rscreen->b.b.context_create = r600_create_context;
    rscreen->b.b.destroy = r600_destroy_screen;
    rscreen->b.b.get_param = r600_get_param;
    rscreen->b.b.get_shader_param = r600_get_shader_param;
    rscreen->b.b.resource_create = r600_resource_create;

    if (!r600_common_screen_init(&rscreen->b, ws)) {
        FREE(rscreen);
        return NULL;
    }

    /* The format query depends on the chip class, which is read from
     * b.info only after the common init above. */
    if (rscreen->b.info.chip_class >= EVERGREEN) {
        rscreen->b.b.is_format_supported = evergreen_is_format_supported;
    } else {
        rscreen->b.b.is_format_supported = r600_is_format_supported;
    }

    /* Debug flags: R600_DEBUG is OR-ed in, and the boolean environment
     * options below each add their own flag bits on top. */
    rscreen->b.debug_flags |= debug_get_flags_option("R600_DEBUG", r600_debug_options, 0);
    if (debug_get_bool_option("R600_DEBUG_COMPUTE", FALSE))
        rscreen->b.debug_flags |= DBG_COMPUTE;
    if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE))
        rscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS;
    if (debug_get_bool_option("R600_HYPERZ", FALSE))
        rscreen->b.debug_flags |= DBG_HYPERZ;
    if (debug_get_bool_option("R600_LLVM", FALSE))
        rscreen->b.debug_flags |= DBG_LLVM;

    /* NOTE(review): this path only frees the struct although
     * r600_common_screen_init() has already succeeded; whether the common
     * screen state needs a matching teardown cannot be told from here. */
    if (rscreen->b.family == CHIP_UNKNOWN) {
        fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->b.info.pci_id);
        FREE(rscreen);
        return NULL;
    }

    /* Figure out streamout kernel support. */
    switch (rscreen->b.chip_class) {
    case R600:
        if (rscreen->b.family < CHIP_RS780) {
            rscreen->b.has_streamout = rscreen->b.info.drm_minor >= 14;
        } else {
            rscreen->b.has_streamout = rscreen->b.info.drm_minor >= 23;
        }
        break;
    case R700:
        rscreen->b.has_streamout = rscreen->b.info.drm_minor >= 17;
        break;
    case EVERGREEN:
    case CAYMAN:
        rscreen->b.has_streamout = rscreen->b.info.drm_minor >= 14;
        break;
    default:
        rscreen->b.has_streamout = FALSE;
        break;
    }

    /* MSAA support. */
    switch (rscreen->b.chip_class) {
    case R600:
    case R700:
        rscreen->has_msaa = rscreen->b.info.drm_minor >= 22;
        rscreen->has_compressed_msaa_texturing = false;
        break;
    case EVERGREEN:
        rscreen->has_msaa = rscreen->b.info.drm_minor >= 19;
        rscreen->has_compressed_msaa_texturing = rscreen->b.info.drm_minor >= 24;
        break;
    case CAYMAN:
        rscreen->has_msaa = rscreen->b.info.drm_minor >= 19;
        rscreen->has_compressed_msaa_texturing = true;
        break;
    default:
        rscreen->has_msaa = FALSE;
        rscreen->has_compressed_msaa_texturing = false;
    }

    /* CP DMA needs DRM minor >= 27 and can be turned off with the
     * DBG_NO_CP_DMA debug flag. */
    rscreen->b.has_cp_dma = rscreen->b.info.drm_minor >= 27 &&
                            !(rscreen->b.debug_flags & DBG_NO_CP_DMA);

    /* NOTE(review): the results of compute_memory_pool_new() and of
     * context_create() below are stored without being checked. */
    rscreen->global_pool = compute_memory_pool_new(rscreen);

    /* Create the auxiliary context. This must be done last. */
    rscreen->b.aux_context = rscreen->b.b.context_create(&rscreen->b.b, NULL);

#if 0 /* This is for testing whether aux_context and buffer clearing work correctly. */
    struct pipe_resource templ = {};

    templ.width0 = 4;
    templ.height0 = 2048;
    templ.depth0 = 1;
    templ.array_size = 1;
    templ.target = PIPE_TEXTURE_2D;
    templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
    templ.usage = PIPE_USAGE_DEFAULT;

    struct r600_resource *res = r600_resource(rscreen->screen.resource_create(&rscreen->screen, &templ));
    unsigned char *map = ws->buffer_map(res->cs_buf, NULL, PIPE_TRANSFER_WRITE);

    memset(map, 0, 256);

    r600_screen_clear_buffer(rscreen, &res->b.b, 4, 4, 0xCC);
    r600_screen_clear_buffer(rscreen, &res->b.b, 8, 4, 0xDD);
    r600_screen_clear_buffer(rscreen, &res->b.b, 12, 4, 0xEE);
    r600_screen_clear_buffer(rscreen, &res->b.b, 20, 4, 0xFF);
    r600_screen_clear_buffer(rscreen, &res->b.b, 32, 20, 0x87);

    ws->buffer_wait(res->buf, RADEON_USAGE_WRITE);

    int i;
    for (i = 0; i < 256; i++) {
        printf("%02X", map[i]);
        if (i % 16 == 15)
            printf("\n");
    }
#endif

    return &rscreen->b.b;
}
/**
 * Map the buffer range described by @transfer for CPU access.
 *
 * For a synchronized DISCARD_WHOLE_RESOURCE map of a buffer that the current
 * command stream references or that is still busy, the backing storage is
 * replaced with a fresh allocation and every binding point that may still
 * refer to the old storage is marked dirty, so the map below does not have
 * to wait for the GPU.
 *
 * Returns a pointer to byte box.x of the mapped buffer, or NULL when the
 * winsys map fails.
 */
static void *r600_buffer_transfer_map(struct pipe_context *pipe,
                                      struct pipe_transfer *transfer)
{
    struct r600_resource *rbuffer = r600_resource(transfer->resource);
    struct r600_context *rctx = (struct r600_context*)pipe;
    uint8_t *data;

    if (transfer->usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
        !(transfer->usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
        assert(transfer->usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (rctx->ws->cs_is_buffer_referenced(rctx->cs, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            unsigned i, mask;

            /* Discard the buffer. */
            pb_reference(&rbuffer->buf, NULL);

            /* Create a new one in the same pipe_resource. */
            /* XXX We probably want a different alignment for buffers and textures. */
            /* NOTE(review): the result of r600_init_resource() is not checked here. */
            r600_init_resource(rctx->screen, rbuffer, rbuffer->b.b.width0, 4096,
                               rbuffer->b.b.bind, rbuffer->b.b.usage);

            /* We changed the buffer, now we need to bind it where the old one was bound. */
            /* Vertex buffers. */
            mask = rctx->vertex_buffer_state.enabled_mask;
            while (mask) {
                i = u_bit_scan(&mask);
                if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) {
                    rctx->vertex_buffer_state.dirty_mask |= 1 << i;
                    r600_vertex_buffers_dirty(rctx);
                }
            }
            /* Streamout buffers. */
            /* Streamout is ended and flagged to restart with every
             * append bit set. */
            for (i = 0; i < rctx->num_so_targets; i++) {
                if (rctx->so_targets[i]->b.buffer == &rbuffer->b.b) {
                    r600_context_streamout_end(rctx);
                    rctx->streamout_start = TRUE;
                    rctx->streamout_append_bitmask = ~0;
                }
            }
            /* Constant buffers. */
            r600_set_constants_dirty_if_bound(rctx, &rctx->vs_constbuf_state, rbuffer);
            r600_set_constants_dirty_if_bound(rctx, &rctx->ps_constbuf_state, rbuffer);
        }
    }
#if 0 /* this is broken (see Bug 53130) */
    else if ((transfer->usage & PIPE_TRANSFER_DISCARD_RANGE) &&
             !(transfer->usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
             rctx->screen->has_streamout &&
             /* The buffer range must be aligned to 4. */
             transfer->box.x % 4 == 0 && transfer->box.width % 4 == 0) {
        assert(transfer->usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (rctx->ws->cs_is_buffer_referenced(rctx->cs, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            /* Do a wait-free write-only transfer using a temporary buffer. */
            struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;

            rtransfer->staging = (struct r600_resource*)
                pipe_buffer_create(pipe->screen, PIPE_BIND_VERTEX_BUFFER,
                                   PIPE_USAGE_STAGING, transfer->box.width);
            return rctx->ws->buffer_map(rtransfer->staging->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
        }
    }
#endif

    /* Regular path: map the (possibly just replaced) buffer directly. */
    data = rctx->ws->buffer_map(rbuffer->cs_buf, rctx->cs, transfer->usage);
    if (!data)
        return NULL;

    return (uint8_t*)data + transfer->box.x;
}
static void *r600_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource, unsigned level, unsigned usage, const struct pipe_box *box, struct pipe_transfer **ptransfer) { struct r600_context *rctx = (struct r600_context*)ctx; struct r600_resource *rbuffer = r600_resource(resource); uint8_t *data; assert(box->x + box->width <= resource->width0); if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. */ if (rctx->ws->cs_is_buffer_referenced(rctx->cs, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { unsigned i, mask; /* Discard the buffer. */ pb_reference(&rbuffer->buf, NULL); /* Create a new one in the same pipe_resource. */ /* XXX We probably want a different alignment for buffers and textures. */ r600_init_resource(rctx->screen, rbuffer, rbuffer->b.b.width0, 4096, rbuffer->b.b.bind, rbuffer->b.b.usage); /* We changed the buffer, now we need to bind it where the old one was bound. */ /* Vertex buffers. */ mask = rctx->vertex_buffer_state.enabled_mask; while (mask) { i = u_bit_scan(&mask); if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) { rctx->vertex_buffer_state.dirty_mask |= 1 << i; r600_vertex_buffers_dirty(rctx); } } /* Streamout buffers. */ for (i = 0; i < rctx->num_so_targets; i++) { if (rctx->so_targets[i]->b.buffer == &rbuffer->b.b) { r600_context_streamout_end(rctx); rctx->streamout_start = TRUE; rctx->streamout_append_bitmask = ~0; } } /* Constant buffers. */ r600_set_constants_dirty_if_bound(rctx, rbuffer); } } else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && rctx->screen->has_streamout && /* The buffer range must be aligned to 4. */ box->x % 4 == 0 && box->width % 4 == 0) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. 
*/ if (rctx->ws->cs_is_buffer_referenced(rctx->cs, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. */ struct r600_resource *staging = (struct r600_resource*) pipe_buffer_create(ctx->screen, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STAGING, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT)); data = rctx->ws->buffer_map(staging->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE); if (!data) return NULL; data += box->x % R600_MAP_BUFFER_ALIGNMENT; return r600_buffer_get_transfer(ctx, resource, level, usage, box, ptransfer, data, staging); } } data = rctx->ws->buffer_map(rbuffer->cs_buf, rctx->cs, usage); if (!data) { return NULL; } data += box->x; return r600_buffer_get_transfer(ctx, resource, level, usage, box, ptransfer, data, NULL); }
static void *r600_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource, unsigned level, unsigned usage, const struct pipe_box *box, struct pipe_transfer **ptransfer) { struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; struct r600_resource *rbuffer = r600_resource(resource); uint8_t *data; assert(box->x + box->width <= resource->width0); /* See if the buffer range being mapped has never been initialized, * in which case it can be mapped unsynchronized. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && usage & PIPE_TRANSFER_WRITE && !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) { usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } /* If discarding the entire range, discard the whole resource instead. */ if (usage & PIPE_TRANSFER_DISCARD_RANGE && box->x == 0 && box->width == resource->width0) { usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; } if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { assert(usage & PIPE_TRANSFER_WRITE); if (r600_invalidate_buffer(rctx, rbuffer)) { /* At this point, the buffer is always idle. */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } } else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. */ if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. 
*/ unsigned offset; struct r600_resource *staging = NULL; u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), 256, &offset, (struct pipe_resource**)&staging, (void**)&data); if (staging) { data += box->x % R600_MAP_BUFFER_ALIGNMENT; return r600_buffer_get_transfer(ctx, resource, level, usage, box, ptransfer, data, staging, offset); } } else { /* At this point, the buffer is always idle (we checked it above). */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } } /* Using a staging buffer in GTT for larger reads is much faster. */ else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_WRITE) && rbuffer->domains == RADEON_DOMAIN_VRAM && r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) { struct r600_resource *staging; staging = (struct r600_resource*) pipe_buffer_create( ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT)); if (staging) { /* Copy the VRAM buffer to the staging buffer. */ rctx->dma_copy(ctx, &staging->b.b, 0, box->x % R600_MAP_BUFFER_ALIGNMENT, 0, 0, resource, level, box); data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ); data += box->x % R600_MAP_BUFFER_ALIGNMENT; return r600_buffer_get_transfer(ctx, resource, level, usage, box, ptransfer, data, staging, 0); } } data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage); if (!data) { return NULL; } data += box->x; return r600_buffer_get_transfer(ctx, resource, level, usage, box, ptransfer, data, NULL, 0); }
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE | R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_FLUSH_AND_INV_CB | R600_CONTEXT_FLUSH_AND_INV_DB | R600_CONTEXT_FLUSH_AND_INV_CB_META | R600_CONTEXT_FLUSH_AND_INV_DB_META | R600_CONTEXT_STREAMOUT_FLUSH | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. 
*/ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; }
/**
 * Bind @buffer as constant buffer @index of the given shader stage.
 *
 * @param ctx     pipe context (an r600_pipe_context)
 * @param shader  PIPE_SHADER_VERTEX or PIPE_SHADER_FRAGMENT; any other value
 *                is reported through R600_ERR and nothing is bound
 * @param index   constant buffer slot
 * @param buffer  buffer to bind; NULL is accepted and ignored
 *
 * r600_upload_const_buffer() may replace the local rbuffer pointer with a
 * different resource. When it does, this function owns a reference to that
 * resource and drops it before returning, on every path including the
 * unsupported-shader one.
 */
void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
                              struct pipe_resource *buffer)
{
    struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
    struct r600_resource *rbuffer = r600_resource(buffer);
    struct r600_pipe_resource_state *rstate;
    uint32_t offset;

    /* Note that the state tracker can unbind constant buffers by
     * passing NULL here.
     */
    if (buffer == NULL) {
        return;
    }

    r600_upload_const_buffer(rctx, &rbuffer, &offset);

    switch (shader) {
    case PIPE_SHADER_VERTEX:
        rctx->vs_const_buffer.nregs = 0;
        r600_pipe_state_add_reg(&rctx->vs_const_buffer,
                                R_028180_ALU_CONST_BUFFER_SIZE_VS_0 + index * 4,
                                ALIGN_DIVUP(buffer->width0 >> 4, 16),
                                0xFFFFFFFF, NULL, 0);
        r600_pipe_state_add_reg(&rctx->vs_const_buffer,
                                R_028980_ALU_CONST_CACHE_VS_0 + index * 4,
                                offset >> 8, 0xFFFFFFFF, rbuffer, RADEON_USAGE_READ);
        r600_context_pipe_state_set(&rctx->ctx, &rctx->vs_const_buffer);

        rstate = &rctx->vs_const_buffer_resource[index];
        /* Lazily initialize the per-slot buffer resource state. */
        if (!rstate->id) {
            if (rctx->chip_class >= EVERGREEN) {
                evergreen_pipe_init_buffer_resource(rctx, rstate);
            } else {
                r600_pipe_init_buffer_resource(rctx, rstate);
            }
        }

        if (rctx->chip_class >= EVERGREEN) {
            evergreen_pipe_mod_buffer_resource(rstate, rbuffer, offset, 16, RADEON_USAGE_READ);
            evergreen_context_pipe_state_set_vs_resource(&rctx->ctx, rstate, index);
        } else {
            r600_pipe_mod_buffer_resource(rstate, rbuffer, offset, 16, RADEON_USAGE_READ);
            r600_context_pipe_state_set_vs_resource(&rctx->ctx, rstate, index);
        }
        break;
    case PIPE_SHADER_FRAGMENT:
        rctx->ps_const_buffer.nregs = 0;
        /* NOTE(review): unlike the vertex case, these two register
         * addresses are not offset by index * 4 - confirm whether
         * that is intentional before relying on slots other than 0. */
        r600_pipe_state_add_reg(&rctx->ps_const_buffer,
                                R_028140_ALU_CONST_BUFFER_SIZE_PS_0,
                                ALIGN_DIVUP(buffer->width0 >> 4, 16),
                                0xFFFFFFFF, NULL, 0);
        r600_pipe_state_add_reg(&rctx->ps_const_buffer,
                                R_028940_ALU_CONST_CACHE_PS_0,
                                offset >> 8, 0xFFFFFFFF, rbuffer, RADEON_USAGE_READ);
        r600_context_pipe_state_set(&rctx->ctx, &rctx->ps_const_buffer);

        rstate = &rctx->ps_const_buffer_resource[index];
        /* Lazily initialize the per-slot buffer resource state. */
        if (!rstate->id) {
            if (rctx->chip_class >= EVERGREEN) {
                evergreen_pipe_init_buffer_resource(rctx, rstate);
            } else {
                r600_pipe_init_buffer_resource(rctx, rstate);
            }
        }
        if (rctx->chip_class >= EVERGREEN) {
            evergreen_pipe_mod_buffer_resource(rstate, rbuffer, offset, 16, RADEON_USAGE_READ);
            evergreen_context_pipe_state_set_ps_resource(&rctx->ctx, rstate, index);
        } else {
            r600_pipe_mod_buffer_resource(rstate, rbuffer, offset, 16, RADEON_USAGE_READ);
            r600_context_pipe_state_set_ps_resource(&rctx->ctx, rstate, index);
        }
        break;
    default:
        R600_ERR("unsupported %d\n", shader);
        /* Do not return here: the upload above may have handed us a
         * temporary buffer that still has to be released below. */
        break;
    }

    /* Drop our reference if the upload substituted a different buffer. */
    if (buffer != &rbuffer->b.b.b)
        pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL);
}
radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ } else { uint64_t va = r600_resource_va(rctx->b.screen, (void*)t[i]->b.buffer); update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE); /* R7xx requires this packet after updating BUFFER_BASE. * Without this, R7xx locks up. */ if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) { radeon_emit(cs, PKT3(PKT3_STRMOUT_BASE_UPDATE, 1, 0)); radeon_emit(cs, i); radeon_emit(cs, va >> 8); r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE); } } if (rctx->streamout.append_bitmask & (1 << i)) {
/**
 * Map a range of a buffer resource for CPU access.
 *
 * Writes to a range that does not intersect valid_buffer_range are mapped
 * unsynchronized. A synchronized DISCARD_WHOLE_RESOURCE map of a buffer that
 * is referenced by the rings or still busy replaces the backing storage and
 * re-dirties the vertex, streamout and constant buffer bindings that used it.
 * A DISCARD_RANGE map of such a buffer is served from an uploader-provided
 * staging area. Everything else maps the buffer itself through
 * r600_buffer_map_sync_with_rings().
 *
 * Returns the value of r600_buffer_get_transfer() for the chosen mapping, or
 * NULL when the final direct map fails.
 */
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
                                      struct pipe_resource *resource,
                                      unsigned level,
                                      unsigned usage,
                                      const struct pipe_box *box,
                                      struct pipe_transfer **ptransfer)
{
    struct r600_context *rctx = (struct r600_context*)ctx;
    struct r600_resource *rbuffer = r600_resource(resource);
    uint8_t *data;

    assert(box->x + box->width <= resource->width0);

    /* See if the buffer range being mapped has never been initialized,
     * in which case it can be mapped unsynchronized. */
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
        usage & PIPE_TRANSFER_WRITE &&
        !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }

    if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
        !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            unsigned i, mask;

            /* Discard the buffer. */
            pb_reference(&rbuffer->buf, NULL);

            /* Create a new one in the same pipe_resource. */
            /* XXX We probably want a different alignment for buffers and textures. */
            /* NOTE(review): the result of r600_init_resource() is not checked here. */
            r600_init_resource(&rctx->screen->b, rbuffer, rbuffer->b.b.width0, 4096,
                               TRUE, rbuffer->b.b.usage);

            /* We changed the buffer, now we need to bind it where the old one was bound. */
            /* Vertex buffers. */
            mask = rctx->vertex_buffer_state.enabled_mask;
            while (mask) {
                i = u_bit_scan(&mask);
                if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) {
                    rctx->vertex_buffer_state.dirty_mask |= 1 << i;
                    r600_vertex_buffers_dirty(rctx);
                }
            }
            /* Streamout buffers. */
            /* NOTE(review): targets[i] is dereferenced without a NULL
             * check - confirm that every slot below num_targets is
             * always populated. */
            for (i = 0; i < rctx->b.streamout.num_targets; i++) {
                if (rctx->b.streamout.targets[i]->b.buffer == &rbuffer->b.b) {
                    if (rctx->b.streamout.begin_emitted) {
                        r600_emit_streamout_end(&rctx->b);
                    }
                    rctx->b.streamout.append_bitmask = rctx->b.streamout.enabled_mask;
                    r600_streamout_buffers_dirty(&rctx->b);
                }
            }
            /* Constant buffers. */
            r600_set_constants_dirty_if_bound(rctx, rbuffer);
        }
    } else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
               !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
               !(rctx->screen->b.debug_flags & DBG_NO_DISCARD_RANGE) &&
               (rctx->screen->has_cp_dma ||
                (rctx->screen->has_streamout &&
                 /* The buffer range must be aligned to 4 with streamout. */
                 box->x % 4 == 0 && box->width % 4 == 0))) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            /* Do a wait-free write-only transfer using a temporary buffer. */
            unsigned offset;
            struct r600_resource *staging = NULL;

            u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
                           &offset, (struct pipe_resource**)&staging, (void**)&data);

            /* If no staging memory was provided, fall through to the
             * direct map below. */
            if (staging) {
                data += box->x % R600_MAP_BUFFER_ALIGNMENT;
                return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                                ptransfer, data, staging, offset);
            }
        }
    }

    /* mmap and synchronize with rings */
    data = r600_buffer_map_sync_with_rings(&rctx->b, rbuffer, usage);
    if (!data) {
        return NULL;
    }
    data += box->x;

    return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                    ptransfer, data, NULL, 0);
}
/**
 * Map a range of a buffer resource for CPU access (common-context variant).
 *
 * Writes to a range that does not intersect valid_buffer_range are mapped
 * unsynchronized, and a DISCARD_RANGE covering the whole buffer is promoted
 * to DISCARD_WHOLE_RESOURCE. A synchronized whole-resource discard of a
 * buffer that is referenced by the rings or busy is handed to the context's
 * invalidate_buffer hook. A DISCARD_RANGE map of such a buffer is served from
 * an uploader-provided staging area. Everything else maps the buffer itself.
 *
 * Returns the value of r600_buffer_get_transfer() for the chosen mapping, or
 * NULL when the staging allocation or the direct map fails.
 */
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
                                      struct pipe_resource *resource,
                                      unsigned level,
                                      unsigned usage,
                                      const struct pipe_box *box,
                                      struct pipe_transfer **ptransfer)
{
    struct r600_common_context *rctx = (struct r600_common_context*)ctx;
    struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
    struct r600_resource *rbuffer = r600_resource(resource);
    uint8_t *data;

    assert(box->x + box->width <= resource->width0);

    /* See if the buffer range being mapped has never been initialized,
     * in which case it can be mapped unsynchronized. */
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
        usage & PIPE_TRANSFER_WRITE &&
        !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }

    /* If discarding the entire range, discard the whole resource instead. */
    if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
        box->x == 0 && box->width == resource->width0) {
        usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
    }

    if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
        !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
        }
        /* At this point, the buffer is always idle. */
        /* (The flag is set whether or not the invalidate hook ran.) */
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    } else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
               !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
               !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
               (rscreen->has_cp_dma ||
                (rscreen->has_streamout &&
                 /* The buffer range must be aligned to 4 with streamout. */
                 box->x % 4 == 0 && box->width % 4 == 0))) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
            rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            /* Do a wait-free write-only transfer using a temporary buffer. */
            unsigned offset;
            struct r600_resource *staging = NULL;

            u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
                           &offset, (struct pipe_resource**)&staging, (void**)&data);

            if (staging) {
                /* Keep the same sub-alignment as the destination range. */
                data += box->x % R600_MAP_BUFFER_ALIGNMENT;
                return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                                ptransfer, data, staging, offset);
            } else {
                return NULL; /* error, shouldn't occur though */
            }
        }
        /* At this point, the buffer is always idle (we checked it above). */
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }

    /* Direct map of the buffer itself. */
    data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
    if (!data) {
        return NULL;
    }
    data += box->x;

    return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                    ptransfer, data, NULL, 0);
}
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. 
*/ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */ if (rctx->b.chip_class == R600) radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_CP_DMA_IDLE(1)); /* CP DMA is executed in ME, but index buffers are read by PFP. * This ensures that ME (CP DMA) is idle before PFP starts fetching * indices. If we wanted to execute CP DMA in PFP, this packet * should precede it. */ r600_emit_pfp_sync_me(rctx); }