static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
    struct si_descriptors *desc = &sctx->vertex_buffers;
    int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
    int i;

    for (i = 0; i < count; i++) {
        int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

        if (vb >= Elements(sctx->vertex_buffer))
            continue;
        if (!sctx->vertex_buffer[vb].buffer)
            continue;

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                  (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
                                  RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
    }

    if (!desc->buffer)
        return;
    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                              RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
}
static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
                                unsigned slot, struct pipe_sampler_view *view,
                                unsigned *view_desc)
{
    struct si_sampler_views *views = &sctx->samplers[shader].views;

    if (views->views[slot] == view)
        return;

    if (view) {
        struct si_sampler_view *rview = (struct si_sampler_view*)view;

        if (rview->resource)
            radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      rview->resource, RADEON_USAGE_READ,
                                      r600_get_sampler_view_priority(rview->resource));

        if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
            radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      rview->dcc_buffer, RADEON_USAGE_READ,
                                      RADEON_PRIO_DCC);

        pipe_sampler_view_reference(&views->views[slot], view);
        memcpy(views->desc.list + slot*8, view_desc, 8*4);
        views->desc.enabled_mask |= 1llu << slot;
    } else {
        pipe_sampler_view_reference(&views->views[slot], NULL);
        memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
        views->desc.enabled_mask &= ~(1llu << slot);
    }

    views->desc.list_dirty = true;
}
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
    uint64_t vram = 0, gtt = 0;

    if (dst) {
        if (dst->domains & RADEON_DOMAIN_VRAM)
            vram += dst->buf->size;
        else if (dst->domains & RADEON_DOMAIN_GTT)
            gtt += dst->buf->size;
    }
    if (src) {
        if (src->domains & RADEON_DOMAIN_VRAM)
            vram += src->buf->size;
        else if (src->domains & RADEON_DOMAIN_GTT)
            gtt += src->buf->size;
    }

    /* Flush the GFX IB if DMA depends on it. */
    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ((dst &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
                                           RADEON_USAGE_READWRITE)) ||
         (src &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
                                           RADEON_USAGE_WRITE))))
        ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

    /* Flush if there's not enough space, or if the memory usage per IB
     * is too large.
     */
    if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
        !ctx->ws->cs_memory_below_limit(ctx->dma.cs, vram, gtt)) {
        ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
        assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
    }

    /* If GPUVM is not supported, the CS checker needs 2 entries
     * in the buffer list per packet, which has to be done manually.
     */
    if (ctx->screen->info.has_virtual_memory) {
        if (dst)
            radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                      RADEON_USAGE_WRITE,
                                      RADEON_PRIO_SDMA_BUFFER);
        if (src)
            radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                      RADEON_USAGE_READ,
                                      RADEON_PRIO_SDMA_BUFFER);
    }
}
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
                              struct r600_atom *atom)
{
    struct r600_cs_shader_state *state =
        (struct r600_cs_shader_state*)atom;
    struct r600_pipe_compute *shader = state->shader;
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    uint64_t va;
    struct r600_resource *code_bo;
    unsigned ngpr, nstack;

    code_bo = shader->code_bo;
    va = shader->code_bo->gpu_address + state->pc;
    ngpr = shader->bc.ngpr;
    nstack = shader->bc.nstack;

    radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
    radeon_emit(cs, va >> 8);  /* R_0288D0_SQ_PGM_START_LS */
    radeon_emit(cs,            /* R_0288D4_SQ_PGM_RESOURCES_LS */
                S_0288D4_NUM_GPRS(ngpr) |
                S_0288D4_STACK_SIZE(nstack));
    radeon_emit(cs, 0);        /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

    radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
    radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              code_bo, RADEON_USAGE_READ,
                                              RADEON_PRIO_USER_SHADER));
}
static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc)
{
    unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
    void *ptr;

    if (!desc->list_dirty)
        return true;

    u_upload_alloc(sctx->b.uploader, 0, list_size,
                   &desc->buffer_offset,
                   (struct pipe_resource**)&desc->buffer, &ptr);
    if (!desc->buffer)
        return false; /* skip the draw call */

    util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                              RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);

    desc->list_dirty = false;
    desc->pointer_dirty = true;
    si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
    return true;
}
static void si_image_views_begin_new_cs(struct si_context *sctx,
                                        struct si_images_info *images)
{
    uint mask = images->desc.enabled_mask;

    /* Add buffers to the CS. */
    while (mask) {
        int i = u_bit_scan(&mask);
        struct pipe_image_view *view = &images->views[i];

        assert(view->resource);

        si_sampler_view_add_buffer(sctx, view->resource,
                                   RADEON_USAGE_READWRITE);
    }

    images->desc.ce_ram_dirty = true;

    if (images->desc.buffer) {
        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                  images->desc.buffer,
                                  RADEON_USAGE_READ,
                                  RADEON_PRIO_DESCRIPTORS);
    }
}
static void si_reinitialize_ce_ram(struct si_context *sctx,
                                   struct si_descriptors *desc)
{
    if (desc->buffer) {
        struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
        unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
        uint64_t va = buffer->gpu_address + desc->buffer_offset;
        struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;

        if (!ib)
            ib = sctx->ce_ib;

        list_size = align(list_size, 32);

        radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
        radeon_emit(ib, va);
        radeon_emit(ib, va >> 32);
        radeon_emit(ib, list_size / 4);
        radeon_emit(ib, desc->ce_offset);

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                                  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    }
    desc->ce_ram_dirty = false;
}
void r600_emit_pfp_sync_me(struct r600_context *rctx)
{
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

    if (rctx->b.chip_class >= EVERGREEN &&
        rctx->b.screen->info.drm_minor >= 46) {
        radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
        radeon_emit(cs, 0);
    } else {
        /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
         * waiting for it in PFP.
         */
        struct r600_resource *buf = NULL;
        unsigned offset, reloc;
        uint64_t va;

        /* 16-byte address alignment is required by WAIT_REG_MEM. */
        u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
                             &offset, (struct pipe_resource**)&buf);
        if (!buf) {
            /* This is too heavyweight, but will work. */
            rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
            return;
        }

        reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
                                          RADEON_USAGE_READWRITE,
                                          RADEON_PRIO_FENCE);

        va = buf->gpu_address + offset;
        assert(va % 16 == 0);

        /* Write 1 to memory in ME. */
        radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
        radeon_emit(cs, va);
        radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
        radeon_emit(cs, 1);
        radeon_emit(cs, 0);

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, reloc);

        /* Wait in PFP (PFP can only do GEQUAL against memory). */
        radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
        radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
                        WAIT_REG_MEM_MEMORY |
                        WAIT_REG_MEM_PFP);
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
        radeon_emit(cs, 1);          /* reference value */
        radeon_emit(cs, 0xffffffff); /* mask */
        radeon_emit(cs, 4);          /* poll interval */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, reloc);

        r600_resource_reference(&buf, NULL);
    }
}
static void si_set_constant_buffer(struct pipe_context *ctx, uint shader,
                                   uint slot, struct pipe_constant_buffer *input)
{
    struct si_context *sctx = (struct si_context *)ctx;
    struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

    if (shader >= SI_NUM_SHADERS)
        return;

    assert(slot < buffers->desc.num_elements);
    pipe_resource_reference(&buffers->buffers[slot], NULL);

    /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
     * with a NULL buffer). We need to use a dummy buffer instead. */
    if (sctx->b.chip_class == CIK &&
        (!input || (!input->buffer && !input->user_buffer)))
        input = &sctx->null_const_buf;

    if (input && (input->buffer || input->user_buffer)) {
        struct pipe_resource *buffer = NULL;
        uint64_t va;

        /* Upload the user buffer if needed. */
        if (input->user_buffer) {
            unsigned buffer_offset;

            si_upload_const_buffer(sctx,
                                   (struct r600_resource**)&buffer, input->user_buffer,
                                   input->buffer_size, &buffer_offset);
            if (!buffer) {
                /* Just unbind on failure. */
                si_set_constant_buffer(ctx, shader, slot, NULL);
                return;
            }
            va = r600_resource(buffer)->gpu_address + buffer_offset;
        } else {
            pipe_resource_reference(&buffer, input->buffer);
            va = r600_resource(buffer)->gpu_address + input->buffer_offset;
        }

        /* Set the descriptor. */
        uint32_t *desc = buffers->desc.list + slot*4;
        desc[0] = va;
        desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
                  S_008F04_STRIDE(0);
        desc[2] = input->buffer_size;
        desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
                  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
                  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

        buffers->buffers[slot] = buffer;
        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                  (struct r600_resource*)buffer,
                                  buffers->shader_usage, buffers->priority);
        buffers->desc.enabled_mask |= 1llu << slot;
    } else {
        /* Clear the descriptor for the now-unbound slot, mirroring the
         * unbind path used for sampler views above. */
        memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
        buffers->desc.enabled_mask &= ~(1llu << slot);
    }

    buffers->desc.list_dirty = true;
}
static void si_sampler_states_begin_new_cs(struct si_context *sctx,
                                           struct si_sampler_states *states)
{
    if (!states->desc.buffer)
        return;
    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc,
                                  struct r600_atom *atom)
{
    unsigned list_size = desc->num_elements * desc->element_dw_size * 4;

    if (!desc->dirty_mask)
        return true;

    if (sctx->ce_ib) {
        uint32_t const* list = (uint32_t const*)desc->list;

        if (desc->ce_ram_dirty)
            si_reinitialize_ce_ram(sctx, desc);

        while (desc->dirty_mask) {
            int begin, count;
            u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
                                         &count);

            begin *= desc->element_dw_size;
            count *= desc->element_dw_size;

            radeon_emit(sctx->ce_ib,
                        PKT3(PKT3_WRITE_CONST_RAM, count, 0));
            radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
            radeon_emit_array(sctx->ce_ib, list + begin, count);
        }

        if (!si_ce_upload(sctx, desc->ce_offset, list_size,
                          &desc->buffer_offset, &desc->buffer))
            return false;
    } else {
        void *ptr;

        u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
                       &desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, &ptr);
        if (!desc->buffer)
            return false; /* skip the draw call */

        util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                                  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    }
    desc->pointer_dirty = true;
    desc->dirty_mask = 0;

    if (atom)
        si_mark_atom_dirty(sctx, atom);

    return true;
}
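/* Illustration only, not driver code: how a descriptor dirty mask decomposes
 * into the consecutive (begin, count) ranges that the CE path above writes
 * with one WRITE_CONST_RAM packet each. This is a standalone sketch of what
 * gallium's u_bit_scan_consecutive_range() returns; the mask value below is
 * made up for the example.
 */
#include <stdio.h>

int main(void)
{
    unsigned dirty_mask = 0x0000f0c6; /* hypothetical dirty slots */

    while (dirty_mask) {
        /* Find the lowest dirty slot... */
        int begin = __builtin_ctz(dirty_mask);
        int count = 0;

        /* ...and count how many consecutive slots are dirty from there. */
        while ((dirty_mask >> (begin + count)) & 1)
            count++;

        /* Clear the range that was just consumed. */
        dirty_mask &= ~(((1u << count) - 1) << begin);

        printf("upload slots [%d, %d) in one packet\n", begin, begin + count);
    }
    return 0;
}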
static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
                                             struct si_buffer_resources *buffers)
{
    uint64_t mask = buffers->desc.enabled_mask;

    /* Add buffers to the CS. */
    while (mask) {
        int i = u_bit_scan64(&mask);

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                  (struct r600_resource*)buffers->buffers[i],
                                  buffers->shader_usage, buffers->priority);
    }

    if (!buffers->desc.buffer)
        return;
    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffers->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
static void si_sampler_view_add_buffer(struct si_context *sctx,
                                       struct pipe_resource *resource,
                                       enum radeon_bo_usage usage)
{
    struct r600_resource *rres = (struct r600_resource*)resource;

    if (!resource)
        return;

    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, usage,
                              r600_get_sampler_view_priority(rres));
}
void r600_dma_copy_buffer(struct r600_context *rctx,
                          struct pipe_resource *dst,
                          struct pipe_resource *src,
                          uint64_t dst_offset,
                          uint64_t src_offset,
                          uint64_t size)
{
    struct radeon_winsys_cs *cs = rctx->b.dma.cs;
    unsigned i, ncopy, csize;
    struct r600_resource *rdst = (struct r600_resource*)dst;
    struct r600_resource *rsrc = (struct r600_resource*)src;

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(&rdst->valid_buffer_range, dst_offset,
                   dst_offset + size);

    size >>= 2; /* convert to dwords */
    ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) +
            !!(size % R600_DMA_COPY_MAX_SIZE_DW);

    r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc);
    for (i = 0; i < ncopy; i++) {
        csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size
                                                 : R600_DMA_COPY_MAX_SIZE_DW;
        /* emit reloc before writing cs so that cs is always in consistent state */
        radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc,
                                  RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER);
        radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst,
                                  RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER);
        radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
        radeon_emit(cs, dst_offset & 0xfffffffc);
        radeon_emit(cs, src_offset & 0xfffffffc);
        radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
        radeon_emit(cs, (src_offset >> 32UL) & 0xff);
        dst_offset += csize << 2;
        src_offset += csize << 2;
        size -= csize;
    }
    r600_dma_emit_wait_idle(&rctx->b);
}
static void cik_sdma_do_copy_buffer(struct si_context *ctx,
                                    struct pipe_resource *dst,
                                    struct pipe_resource *src,
                                    uint64_t dst_offset,
                                    uint64_t src_offset,
                                    uint64_t size)
{
    struct radeon_winsys_cs *cs = ctx->b.dma.cs;
    unsigned i, ncopy, csize;
    struct r600_resource *rdst = (struct r600_resource*)dst;
    struct r600_resource *rsrc = (struct r600_resource*)src;

    dst_offset += r600_resource(dst)->gpu_address;
    src_offset += r600_resource(src)->gpu_address;

    ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
    r600_need_dma_space(&ctx->b, ncopy * 7);

    radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
                              RADEON_PRIO_SDMA_BUFFER);
    radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
                              RADEON_PRIO_SDMA_BUFFER);

    for (i = 0; i < ncopy; i++) {
        csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE;
        cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
                                             CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
                                             0);
        cs->buf[cs->cdw++] = csize;
        cs->buf[cs->cdw++] = 0; /* src/dst endian swap */
        cs->buf[cs->cdw++] = src_offset;
        cs->buf[cs->cdw++] = src_offset >> 32;
        cs->buf[cs->cdw++] = dst_offset;
        cs->buf[cs->cdw++] = dst_offset >> 32;
        dst_offset += csize;
        src_offset += csize;
        size -= csize;
    }
}
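/* Illustration only, not driver code: the chunking arithmetic used by the
 * SDMA/CP DMA copy loops above. A copy of `size` bytes is split into ncopy
 * packets of at most MAX_CHUNK bytes each. MAX_CHUNK is a made-up stand-in
 * for limits such as CIK_SDMA_COPY_MAX_SIZE.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK 0x3fff0u /* hypothetical per-packet byte limit */

int main(void)
{
    uint64_t size = 1000000;   /* hypothetical total copy size in bytes */
    uint64_t offset = 0;
    unsigned ncopy = (size + MAX_CHUNK - 1) / MAX_CHUNK; /* round up */
    unsigned i;

    for (i = 0; i < ncopy; i++) {
        /* The last packet copies whatever remains. */
        unsigned csize = size < MAX_CHUNK ? (unsigned)size : MAX_CHUNK;

        printf("packet %u: copy %u bytes at offset %llu\n",
               i, csize, (unsigned long long)offset);
        offset += csize;
        size -= csize;
    }
    return 0;
}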
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
                                          struct si_sampler_views *views)
{
    uint64_t mask = views->desc.enabled_mask;

    /* Add buffers to the CS. */
    while (mask) {
        int i = u_bit_scan64(&mask);
        struct si_sampler_view *rview =
            (struct si_sampler_view*)views->views[i];

        if (!rview->resource)
            continue;

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                  rview->resource, RADEON_USAGE_READ,
                                  r600_get_sampler_view_priority(rview->resource));
    }

    if (!views->desc.buffer)
        return;
    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
    struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

    for (int i = 0; i < state->nbo; ++i) {
        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
                                  state->bo_usage[i], state->bo_priority[i]);
    }

    if (!state->indirect_buffer) {
        radeon_emit_array(cs, state->pm4, state->ndw);
    } else {
        struct r600_resource *ib = state->indirect_buffer;

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
                                  RADEON_USAGE_READ, RADEON_PRIO_IB2);

        radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
        radeon_emit(cs, ib->gpu_address);
        radeon_emit(cs, ib->gpu_address >> 32);
        radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
    }
}
static void si_sampler_views_begin_new_cs(struct si_context *sctx,
                                          struct si_sampler_views *views)
{
    unsigned mask = views->desc.enabled_mask;

    /* Add buffers to the CS. */
    while (mask) {
        int i = u_bit_scan(&mask);

        si_sampler_view_add_buffer(sctx, views->views[i]->texture,
                                   RADEON_USAGE_READ);
    }

    views->desc.ce_ram_dirty = true;

    if (!views->desc.buffer)
        return;
    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
                         unsigned size, unsigned *out_offset,
                         struct r600_resource **out_buf)
{
    uint64_t va;

    u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
                         (struct pipe_resource**)out_buf);
    if (!*out_buf)
        return false; /* suballocation failed */

    va = (*out_buf)->gpu_address + *out_offset;

    radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
    radeon_emit(sctx->ce_ib, ce_offset);
    radeon_emit(sctx->ce_ib, size / 4);
    radeon_emit(sctx->ce_ib, va);
    radeon_emit(sctx->ce_ib, va >> 32);

    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);

    sctx->ce_need_synchronization = true;
    return true;
}
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, uint64_t dst_offset,
                             struct pipe_resource *src, uint64_t src_offset,
                             unsigned size)
{
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

    assert(size);
    assert(rctx->screen->b.has_cp_dma);

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
                   dst_offset + size);

    dst_offset += r600_resource(dst)->gpu_address;
    src_offset += r600_resource(src)->gpu_address;

    /* Flush the caches where the resources are bound. */
    rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                     R600_CONTEXT_INV_VERTEX_CACHE |
                     R600_CONTEXT_INV_TEX_CACHE |
                     R600_CONTEXT_FLUSH_AND_INV |
                     R600_CONTEXT_FLUSH_AND_INV_CB |
                     R600_CONTEXT_FLUSH_AND_INV_DB |
                     R600_CONTEXT_FLUSH_AND_INV_CB_META |
                     R600_CONTEXT_FLUSH_AND_INV_DB_META |
                     R600_CONTEXT_STREAMOUT_FLUSH |
                     R600_CONTEXT_WAIT_3D_IDLE;

    /* There are differences between R700 and EG in CP DMA,
     * but we only use the common bits here. */
    while (size) {
        unsigned sync = 0;
        unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
        unsigned src_reloc, dst_reloc;

        r600_need_cs_space(rctx,
                           10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0),
                           FALSE);

        /* Flush the caches for the first copy only. */
        if (rctx->b.flags) {
            r600_flush_emit(rctx);
        }

        /* Do the synchronization after the last copy, so that all data is written to memory. */
        if (size == byte_count) {
            sync = PKT3_CP_DMA_CP_SYNC;
        }

        /* This must be done after r600_need_cs_space. */
        src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              (struct r600_resource*)src,
                                              RADEON_USAGE_READ,
                                              RADEON_PRIO_CP_DMA);
        dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              (struct r600_resource*)dst,
                                              RADEON_USAGE_WRITE,
                                              RADEON_PRIO_CP_DMA);

        radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
        radeon_emit(cs, src_offset);    /* SRC_ADDR_LO [31:0] */
        radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
        radeon_emit(cs, dst_offset);    /* DST_ADDR_LO [31:0] */
        radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */
        radeon_emit(cs, byte_count);    /* COMMAND [29:22] | BYTE_COUNT [20:0] */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, src_reloc);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, dst_reloc);

        size -= byte_count;
        src_offset += byte_count;
        dst_offset += byte_count;
    }

    /* Invalidate the read caches. */
    rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                     R600_CONTEXT_INV_VERTEX_CACHE |
                     R600_CONTEXT_INV_TEX_CACHE;
}
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
    uint64_t vram = ctx->dma.cs->used_vram;
    uint64_t gtt = ctx->dma.cs->used_gart;

    if (dst) {
        vram += dst->vram_usage;
        gtt += dst->gart_usage;
    }
    if (src) {
        vram += src->vram_usage;
        gtt += src->gart_usage;
    }

    /* Flush the GFX IB if DMA depends on it. */
    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ((dst &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
                                           RADEON_USAGE_READWRITE)) ||
         (src &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
                                           RADEON_USAGE_WRITE))))
        ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

    /* Flush if there's not enough space, or if the memory usage per IB
     * is too large.
     *
     * IBs using too little memory are limited by the IB submission overhead.
     * IBs using too much memory are limited by the kernel/TTM overhead.
     * Too long IBs create CPU-GPU pipeline bubbles and add latency.
     *
     * This heuristic makes sure that DMA requests are executed
     * very soon after the call is made and lowers memory usage.
     * It improves texture upload performance by keeping the DMA
     * engine busy while uploads are being submitted.
     */
    num_dw++; /* for emit_wait_idle below */
    if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
        ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
        !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
        ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
        assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
    }

    /* Wait for idle if either buffer has been used in the IB before to
     * prevent read-after-write hazards.
     */
    if ((dst &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
                                          RADEON_USAGE_READWRITE)) ||
        (src &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
                                          RADEON_USAGE_WRITE)))
        r600_dma_emit_wait_idle(ctx);

    /* If GPUVM is not supported, the CS checker needs 2 entries
     * in the buffer list per packet, which has to be done manually.
     */
    if (ctx->screen->info.has_virtual_memory) {
        if (dst)
            radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                      RADEON_USAGE_WRITE,
                                      RADEON_PRIO_SDMA_BUFFER);
        if (src)
            radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                      RADEON_USAGE_READ,
                                      RADEON_PRIO_SDMA_BUFFER);
    }

    /* this function is called before all DMA calls, so increment this. */
    ctx->num_dma_calls++;
}
static void compute_emit_cs(struct r600_context *ctx,
                            const uint *block_layout,
                            const uint *grid_layout)
{
    struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
    unsigned i;

    /* make sure that the gfx ring is only one active */
    if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
        ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
    }

    /* Initialize all the compute-related registers.
     *
     * See evergreen_init_atom_start_compute_cs() in this file for the list
     * of registers initialized by the start_compute_cs_cmd atom.
     */
    r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

    /* emit config state */
    if (ctx->b.chip_class == EVERGREEN)
        r600_emit_atom(ctx, &ctx->config_state.atom);

    ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
    r600_flush_emit(ctx);

    /* Emit colorbuffers. */
    /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
    for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
        struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
        unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
                                                   (struct r600_resource*)cb->base.texture,
                                                   RADEON_USAGE_READWRITE,
                                                   RADEON_PRIO_SHADER_RW_BUFFER);

        radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
        radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
        radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
        radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
        radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
        radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, reloc);

        if (!ctx->keep_tiling_flags) {
            radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
            radeon_emit(cs, reloc);
        }

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, reloc);
    }
    if (ctx->keep_tiling_flags) {
        for (; i < 8 ; i++) {
            radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
                                           S_028C70_FORMAT(V_028C70_COLOR_INVALID));
        }
        for (; i < 12; i++) {
            radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
                                           S_028C70_FORMAT(V_028C70_COLOR_INVALID));
        }
    }

    /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
    radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
                                   ctx->compute_cb_target_mask);

    /* Emit vertex buffer state */
    ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
    r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

    /* Emit constant buffer state */
    r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

    /* Emit sampler state */
    r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

    /* Emit sampler view (texture resource) state */
    r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

    /* Emit compute shader state */
    r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

    /* Emit dispatch state and dispatch packet */
    evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

    /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
    ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                    R600_CONTEXT_INV_VERTEX_CACHE |
                    R600_CONTEXT_INV_TEX_CACHE;
    r600_flush_emit(ctx);
    ctx->b.flags = 0;

    if (ctx->b.chip_class >= CAYMAN) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
        /* DEALLOC_STATE prevents the GPU from hanging when a
         * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
         * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
         */
        cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
        cs->buf[cs->cdw++] = 0;
    }

#if 0
    COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
    for (i = 0; i < cs->cdw; i++) {
        COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
    }
#endif
}
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, uint64_t dst_offset,
                             struct pipe_resource *src, uint64_t src_offset,
                             unsigned size)
{
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

    assert(size);
    assert(rctx->screen->b.has_cp_dma);

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
                   dst_offset + size);

    dst_offset += r600_resource(dst)->gpu_address;
    src_offset += r600_resource(src)->gpu_address;

    /* Flush the caches where the resources are bound. */
    rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) |
                     R600_CONTEXT_WAIT_3D_IDLE;

    /* There are differences between R700 and EG in CP DMA,
     * but we only use the common bits here. */
    while (size) {
        unsigned sync = 0;
        unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
        unsigned src_reloc, dst_reloc;

        r600_need_cs_space(rctx,
                           10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
                           3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);

        /* Flush the caches for the first copy only. */
        if (rctx->b.flags) {
            r600_flush_emit(rctx);
        }

        /* Do the synchronization after the last copy, so that all data is written to memory. */
        if (size == byte_count) {
            sync = PKT3_CP_DMA_CP_SYNC;
        }

        /* This must be done after r600_need_cs_space. */
        src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              (struct r600_resource*)src,
                                              RADEON_USAGE_READ,
                                              RADEON_PRIO_CP_DMA);
        dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              (struct r600_resource*)dst,
                                              RADEON_USAGE_WRITE,
                                              RADEON_PRIO_CP_DMA);

        radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
        radeon_emit(cs, src_offset);    /* SRC_ADDR_LO [31:0] */
        radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
        radeon_emit(cs, dst_offset);    /* DST_ADDR_LO [31:0] */
        radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */
        radeon_emit(cs, byte_count);    /* COMMAND [29:22] | BYTE_COUNT [20:0] */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, src_reloc);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, dst_reloc);

        size -= byte_count;
        src_offset += byte_count;
        dst_offset += byte_count;
    }

    /* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */
    if (rctx->b.chip_class == R600)
        radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
                              S_008040_WAIT_CP_DMA_IDLE(1));

    /* CP DMA is executed in ME, but index buffers are read by PFP.
     * This ensures that ME (CP DMA) is idle before PFP starts fetching
     * indices. If we wanted to execute CP DMA in PFP, this packet
     * should precede it.
     */
    r600_emit_pfp_sync_me(rctx);
}
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
    struct si_descriptors *desc = &sctx->vertex_buffers;
    bool bound[SI_NUM_VERTEX_BUFFERS] = {};
    unsigned i, count = sctx->vertex_elements->count;
    uint64_t va;
    uint32_t *ptr;

    if (!sctx->vertex_buffers_dirty)
        return true;
    if (!count || !sctx->vertex_elements)
        return true;

    /* Vertex buffer descriptors are the only ones which are uploaded
     * directly through a staging buffer and don't go through
     * the fine-grained upload path.
     */
    u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
                   (struct pipe_resource**)&desc->buffer, (void**)&ptr);
    if (!desc->buffer)
        return false;

    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                              desc->buffer, RADEON_USAGE_READ,
                              RADEON_PRIO_DESCRIPTORS);

    assert(count <= SI_NUM_VERTEX_BUFFERS);

    for (i = 0; i < count; i++) {
        struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
        struct pipe_vertex_buffer *vb;
        struct r600_resource *rbuffer;
        unsigned offset;
        uint32_t *desc = &ptr[i*4];

        if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
            memset(desc, 0, 16);
            continue;
        }

        vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
        rbuffer = (struct r600_resource*)vb->buffer;
        if (!rbuffer) {
            memset(desc, 0, 16);
            continue;
        }

        offset = vb->buffer_offset + ve->src_offset;
        va = rbuffer->gpu_address + offset;

        /* Fill in T# buffer resource description */
        desc[0] = va;
        desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
                  S_008F04_STRIDE(vb->stride);

        if (sctx->b.chip_class <= CIK && vb->stride)
            /* Round up by rounding down and adding 1 */
            desc[2] = (vb->buffer->width0 - offset -
                       sctx->vertex_elements->format_size[i]) /
                      vb->stride + 1;
        else
            desc[2] = vb->buffer->width0 - offset;

        desc[3] = sctx->vertex_elements->rsrc_word3[i];

        if (!bound[ve->vertex_buffer_index]) {
            radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      (struct r600_resource*)vb->buffer,
                                      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
            bound[ve->vertex_buffer_index] = true;
        }
    }

    /* Don't flush the const cache. It would have a very negative effect
     * on performance (confirmed by testing). New descriptors are always
     * uploaded to a fresh new buffer, so I don't think flushing the const
     * cache is needed. */
    desc->pointer_dirty = true;
    si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
    sctx->vertex_buffers_dirty = false;
    return true;
}
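/* Illustration only, not driver code: the "round up by rounding down and
 * adding 1" num_records computation used above for chips <= CIK when the
 * vertex buffer has a non-zero stride. The last vertex only needs
 * format_size bytes, not a full stride, so one extra record fits at the end.
 * All sizes below are made up for the example.
 */
#include <stdio.h>

int main(void)
{
    unsigned width0 = 1000;     /* hypothetical buffer size in bytes */
    unsigned offset = 12;       /* buffer_offset + src_offset */
    unsigned stride = 16;       /* vertex stride in bytes */
    unsigned format_size = 12;  /* e.g. 3 x 32-bit floats */

    /* Number of whole vertices that can be fetched without reading
     * past the end of the buffer. */
    unsigned num_records = (width0 - offset - format_size) / stride + 1;

    printf("num_records = %u\n", num_records); /* (1000-12-12)/16 + 1 = 62 */
    return 0;
}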
static void cik_sdma_copy_tile(struct si_context *ctx,
                               struct pipe_resource *dst,
                               unsigned dst_level,
                               struct pipe_resource *src,
                               unsigned src_level,
                               unsigned y,
                               unsigned copy_height,
                               unsigned y_align,
                               unsigned pitch,
                               unsigned bpe)
{
    struct radeon_winsys_cs *cs = ctx->b.dma.cs;
    struct si_screen *sscreen = ctx->screen;
    struct r600_texture *rsrc = (struct r600_texture*)src;
    struct r600_texture *rdst = (struct r600_texture*)dst;
    struct r600_texture *rlinear, *rtiled;
    unsigned linear_lvl, tiled_lvl;
    unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size;
    unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode;
    unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
    uint64_t base, addr;
    unsigned pipe_config, tile_mode_index;

    dst_mode = rdst->surface.level[dst_level].mode;
    src_mode = rsrc->surface.level[src_level].mode;
    assert(dst_mode != src_mode);
    assert(src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ||
           dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED);

    sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED;
    lbpe = util_logbase2(bpe);
    pitch_tile_max = ((pitch / bpe) / 8) - 1;

    detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
    rlinear = detile ? rdst : rsrc;
    rtiled = detile ? rsrc : rdst;
    linear_lvl = detile ? dst_level : src_level;
    tiled_lvl = detile ? src_level : dst_level;

    assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));

    array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode);
    slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x *
                      rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1;
    height = rlinear->surface.level[linear_lvl].nblk_y;
    base = rtiled->surface.level[tiled_lvl].offset;
    addr = rlinear->surface.level[linear_lvl].offset;
    bank_h = cik_bank_wh(rtiled->surface.bankh);
    bank_w = cik_bank_wh(rtiled->surface.bankw);
    mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea);
    tile_split = cik_tile_split(rtiled->surface.tile_split);
    tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false);
    nbanks = si_num_banks(sscreen, rtiled);
    base += rtiled->resource.gpu_address;
    addr += rlinear->resource.gpu_address;

    pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
    mt = cik_micro_tile_mode(sscreen, tile_mode_index);

    size = (copy_height * pitch) / 4;
    cheight = copy_height;
    if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
        cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
        cheight &= ~(y_align - 1);
    }
    ncopy = (copy_height + cheight - 1) / cheight;
    r600_need_dma_space(&ctx->b, ncopy * 12);

    radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
                              RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
    radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
                              RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);

    copy_height = size * 4 / pitch;
    for (i = 0; i < ncopy; i++) {
        cheight = copy_height;
        if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
            cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
            cheight &= ~(y_align - 1);
        }
        size = (cheight * pitch) / 4;

        cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
                                             sub_op, detile << 15);
        cs->buf[cs->cdw++] = base;
        cs->buf[cs->cdw++] = base >> 32;
        cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max;
        cs->buf[cs->cdw++] = slice_tile_max;
        cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) |
                             (nbanks << 21) | (bank_h << 18) | (bank_w << 15) |
                             (tile_split << 11) | (mt << 8) | (array_mode << 3) |
                             lbpe;
        cs->buf[cs->cdw++] = y << 16; /* | x */
        cs->buf[cs->cdw++] = 0;       /* z */
        cs->buf[cs->cdw++] = addr & 0xfffffffc;
        cs->buf[cs->cdw++] = addr >> 32;
        cs->buf[cs->cdw++] = (pitch / bpe) - 1;
        cs->buf[cs->cdw++] = size;

        copy_height -= cheight;
        y += cheight;
    }
}