/** * Emit function for r600_cs_shader_state atom */ void evergreen_emit_cs_shader(struct r600_context *rctx, struct r600_atom *atom) { struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; struct r600_resource *code_bo; unsigned ngpr, nstack; code_bo = shader->code_bo; va = shader->code_bo->gpu_address + state->pc; ngpr = shader->bc.ngpr; nstack = shader->bc.nstack; radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ S_0288D4_NUM_GPRS(ngpr) | S_0288D4_STACK_SIZE(nstack)); radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, code_bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); }
static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { struct radeon_winsys_cs *cs = cmd_buffer->cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; assert(size); assert((size & ((1<<21)-1)) == size); radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else {
void si_ce_enable_loads(struct radeon_winsys_cs *ib) { radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | CONTEXT_CONTROL_LOAD_CE_RAM(1)); radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); }
static void si_reinitialize_ce_ram(struct si_context *sctx, struct si_descriptors *desc) { if (desc->buffer) { struct r600_resource *buffer = (struct r600_resource*)desc->buffer; unsigned list_size = desc->num_elements * desc->element_dw_size * 4; uint64_t va = buffer->gpu_address + desc->buffer_offset; struct radeon_winsys_cs *ib = sctx->ce_preamble_ib; if (!ib) ib = sctx->ce_ib; list_size = align(list_size, 32); radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0)); radeon_emit(ib, va); radeon_emit(ib, va >> 32); radeon_emit(ib, list_size / 4); radeon_emit(ib, desc->ce_offset); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } desc->ce_ram_dirty = false; }
static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) { unsigned list_size = desc->num_elements * desc->element_dw_size * 4; if (!desc->dirty_mask) return true; if (sctx->ce_ib) { uint32_t const* list = (uint32_t const*)desc->list; if (desc->ce_ram_dirty) si_reinitialize_ce_ram(sctx, desc); while(desc->dirty_mask) { int begin, count; u_bit_scan_consecutive_range(&desc->dirty_mask, &begin, &count); begin *= desc->element_dw_size; count *= desc->element_dw_size; radeon_emit(sctx->ce_ib, PKT3(PKT3_WRITE_CONST_RAM, count, 0)); radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); radeon_emit_array(sctx->ce_ib, list + begin, count); } if (!si_ce_upload(sctx, desc->ce_offset, list_size, &desc->buffer_offset, &desc->buffer)) return false; } else { void *ptr; u_upload_alloc(sctx->b.uploader, 0, list_size, 256, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, &ptr); if (!desc->buffer) return false; /* skip the draw call */ util_memcpy_cpu_to_le32(ptr, desc->list, list_size); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } desc->pointer_dirty = true; desc->dirty_mask = 0; if (atom) si_mark_atom_dirty(sctx, atom); return true; }
void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, int ps_iter_samples, int overrast_samples) { int setup_samples = nr_samples > 1 ? nr_samples : overrast_samples > 1 ? overrast_samples : 0; if (setup_samples > 1) { /* indexed by log2(nr_samples) */ unsigned max_dist[] = { 0, eg_max_dist_2x, eg_max_dist_4x, cm_max_dist_8x, cm_max_dist_16x }; unsigned log_samples = util_logbase2(setup_samples); unsigned log_ps_iter_samples = util_logbase2(util_next_power_of_two(ps_iter_samples)); radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); radeon_emit(cs, S_028BDC_LAST_PIXEL(1) | S_028BDC_EXPAND_LINE_WIDTH(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */ radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) | S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples)); /* CM_R_028BE0_PA_SC_AA_CONFIG */ if (nr_samples > 1) { radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, S_028804_MAX_ANCHOR_SAMPLES(log_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) | S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1)); } else if (overrast_samples > 1) { radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) | S_028804_OVERRASTERIZATION_AMOUNT(log_samples)); radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0); } } else { radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); radeon_emit(cs, S_028BDC_LAST_PIXEL(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */ radeon_emit(cs, 0); /* CM_R_028BE0_PA_SC_AA_CONFIG */ radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0); } }
static void evergreen_set_streamout_enable(struct r600_common_context *rctx, unsigned buffer_enable_bit) { struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; if (buffer_enable_bit) { r600_write_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2); radeon_emit(cs, S_028B94_STREAMOUT_0_EN(1)); /* R_028B94_VGT_STRMOUT_CONFIG */ radeon_emit(cs, S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit)); /* R_028B98_VGT_STRMOUT_BUFFER_CONFIG */ } else { r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0)); } }
static void r600_flush_vgt_streamout(struct r600_common_context *rctx) { struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; unsigned reg_strmout_cntl; /* The register is at different places on different ASICs. */ if (rctx->chip_class >= CIK) { reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; } else if (rctx->chip_class >= EVERGREEN) { reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; } else { reg_strmout_cntl = R_008490_CP_STRMOUT_CNTL; } if (rctx->chip_class >= CIK) { cik_write_uconfig_reg(cs, reg_strmout_cntl, 0); } else { r600_write_config_reg(cs, reg_strmout_cntl, 0); } radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ radeon_emit(cs, 0); radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* reference value */ radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* mask */ radeon_emit(cs, 4); /* poll interval */ }
static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; r600_flush_vgt_streamout(rctx); if (rctx->chip_class >= EVERGREEN) { evergreen_set_streamout_enable(rctx, rctx->streamout.enabled_mask); } else { r600_set_streamout_enable(rctx, rctx->streamout.enabled_mask); } for (i = 0; i < rctx->streamout.num_targets; i++) { if (!t[i]) continue; t[i]->stride_in_dw = stride_in_dw[i]; if (rctx->chip_class >= SI) { /* SI binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader * through SGPRs what to do. */ r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ } else {
void si_write_scissors(struct radeon_winsys_cs *cs, int first, int count, const VkRect2D *scissors) { int i; if (count == 0) return; radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2); for (i = 0; i < count; i++) { radeon_emit(cs, S_028250_TL_X(scissors[i].offset.x) | S_028250_TL_Y(scissors[i].offset.y) | S_028250_WINDOW_OFFSET_DISABLE(1)); radeon_emit(cs, S_028254_BR_X(scissors[i].offset.x + scissors[i].extent.width) | S_028254_BR_Y(scissors[i].offset.y + scissors[i].extent.height)); } }
/* Emit a CP DMA packet to do a copy from one buffer to another. * The size must fit in bits [20:0]. */ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; assert(size); assert((size & ((1<<21)-1)) == size); if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, sync_flag); /* CP_SYNC [31] */ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else {
static void r600_dma_emit_wait_idle(struct r600_common_context *rctx) { struct radeon_winsys_cs *cs = rctx->dma.cs; if (rctx->chip_class >= EVERGREEN) radeon_emit(cs, 0xf0000000); /* NOP */ else { /* TODO: R600-R700 should use the FENCE packet. * CS checker support is required. */ } }
static void si_initialize_compute(struct si_context *sctx) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint64_t bc_va; radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff)); if (sctx->b.chip_class >= CIK) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) | S_00B864_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) | S_00B868_SH1_CU_EN(0xffff)); } /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID * and is now per pipe, so it should be handled in the * kernel if we want to use something other than the default value, * which is now 0x22f. */ if (sctx->b.chip_class <= SI) { /* XXX: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); } /* Set the pointer to border colors. */ bc_va = sctx->border_color_buffer->gpu_address; if (sctx->b.chip_class >= CIK) { radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2); radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ } else {
void r600_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, uint64_t size) { struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&rdst->valid_buffer_range, dst_offset, dst_offset + size); size >>= 2; /* convert to dwords */ ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) + !!(size % R600_DMA_COPY_MAX_SIZE_DW); r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc); for (i = 0; i < ncopy; i++) { csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW; /* emit reloc before writing cs so that cs is always in consistent state */ radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize)); radeon_emit(cs, dst_offset & 0xfffffffc); radeon_emit(cs, src_offset & 0xfffffffc); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += csize << 2; src_offset += csize << 2; size -= csize; } r600_dma_emit_wait_idle(&rctx->b); }
/** * Write an EOP event. * * \param event EVENT_TYPE_* * \param event_flags Optional cache flush flags (TC) * \param data_sel 1 = fence, 3 = timestamp * \param buf Buffer * \param va GPU address * \param old_value Previous fence value (for a bug workaround) * \param new_value Fence value to write for this event. */ void r600_gfx_write_event_eop(struct r600_common_context *ctx, unsigned event, unsigned event_flags, unsigned data_sel, struct r600_resource *buf, uint64_t va, uint32_t new_fence, unsigned query_type) { struct radeon_winsys_cs *cs = ctx->gfx.cs; unsigned op = EVENT_TYPE(event) | EVENT_INDEX(5) | event_flags; unsigned sel = EOP_DATA_SEL(data_sel); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); radeon_emit(cs, op); radeon_emit(cs, va); radeon_emit(cs, ((va >> 32) & 0xffff) | sel); radeon_emit(cs, new_fence); /* immediate data */ radeon_emit(cs, 0); /* unused */ if (buf) r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); }
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, unsigned *out_offset, struct r600_resource **out_buf) { uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, out_offset, (struct pipe_resource**)out_buf); if (!out_buf) return false; va = (*out_buf)->gpu_address + *out_offset; radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); radeon_emit(sctx->ce_ib, ce_offset); radeon_emit(sctx->ce_ib, size / 4); radeon_emit(sctx->ce_ib, va); radeon_emit(sctx->ce_ib, va >> 32); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); sctx->ce_need_synchronization = true; return true; }
/** * Emit function for r600_cs_shader_state atom */ void evergreen_emit_cs_shader( struct r600_context *rctx, struct r600_atom *atom) { struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint64_t va; struct r600_resource *code_bo; unsigned ngpr, nstack; #if HAVE_LLVM < 0x0306 struct r600_kernel *kernel = &shader->kernels[state->kernel_index]; code_bo = kernel->code_bo; va = kernel->code_bo->gpu_address; ngpr = kernel->bc.ngpr; nstack = kernel->bc.nstack; #else code_bo = shader->code_bo; va = shader->code_bo->gpu_address + state->pc; ngpr = shader->bc.ngpr; nstack = shader->bc.nstack; #endif r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ S_0288D4_NUM_GPRS(ngpr) | S_0288D4_STACK_SIZE(nstack)); radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, code_bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA)); }
/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit * clear value. */ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags, enum r600_coherency coher) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t header = 0, command = S_414_BYTE_COUNT(size); assert(size); assert(size <= CP_DMA_MAX_BYTE_COUNT); /* Sync flags. */ if (flags & CP_DMA_SYNC) header |= S_411_CP_SYNC(1); else command |= S_414_DISABLE_WR_CONFIRM(1); if (flags & CP_DMA_RAW_WAIT) command |= S_414_RAW_WAIT(1); /* Src and dst flags. */ if (flags & CP_DMA_USE_L2) header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2); if (flags & CP_DMA_CLEAR) header |= S_411_SRC_SEL(V_411_DATA); else if (flags & CP_DMA_USE_L2) header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, header); radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, command); } else {
/* This is required to prevent read-after-write hazards. */ void r600_dma_emit_wait_idle(struct r600_common_context *rctx) { struct radeon_winsys_cs *cs = rctx->dma.cs; /* done at the end of DMA calls, so increment this. */ rctx->num_dma_calls++; /* IBs using too little memory are limited by the IB submission overhead. * IBs using too much memory are limited by the kernel/TTM overhead. * Too long IBs create CPU-GPU pipeline bubbles and add latency. * * This heuristic makes sure that DMA requests are executed * very soon after the call is made and lowers memory usage. * It improves texture upload performance by keeping the DMA * engine busy while uploads are being submitted. */ if (rctx->ws->cs_query_memory_usage(rctx->dma.cs) > 64 * 1024 * 1024) { rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); return; } r600_need_dma_space(rctx, 1, NULL, NULL); if (!radeon_emitted(cs, 0)) /* empty queue */ return; /* NOP waits for idle on Evergreen and later. */ if (rctx->chip_class >= CIK) radeon_emit(cs, 0x00000000); /* NOP */ else if (rctx->chip_class >= EVERGREEN) radeon_emit(cs, 0xf0000000); /* NOP */ else { /* TODO: R600-R700 should use the FENCE packet. * CS checker support is required. */ } }
/** * Emit function for r600_cs_shader_state atom */ void evergreen_emit_cs_shader( struct r600_context *rctx, struct r600_atom *atom) { struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; struct r600_kernel *kernel = &shader->kernels[state->kernel_index]; struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint64_t va; va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b); r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ S_0288D4_NUM_GPRS(kernel->bc.ngpr) | S_0288D4_STACK_SIZE(kernel->bc.nstack)); radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, kernel->code_bo, RADEON_USAGE_READ)); }
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; for (int i = 0; i < state->nbo; ++i) { radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i], state->bo_usage[i], state->bo_priority[i]); } if (!state->indirect_buffer) { radeon_emit_array(cs, state->pm4, state->ndw); } else { struct r600_resource *ib = state->indirect_buffer; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); radeon_emit(cs, ib->gpu_address); radeon_emit(cs, ib->gpu_address >> 32); radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); } }
void r600_gfx_wait_fence(struct r600_common_context *ctx, uint64_t va, uint32_t ref, uint32_t mask) { struct radeon_winsys_cs *cs = ctx->gfx.cs; radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit(cs, ref); /* reference value */ radeon_emit(cs, mask); /* mask */ radeon_emit(cs, 4); /* poll interval */ }
static void si_init_compute(struct radv_physical_device *physical_device, struct radeon_winsys_cs *cs) { radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3); radeon_emit(cs, 0); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff)); if (physical_device->rad_info.chip_class >= CIK) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) | S_00B864_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) | S_00B868_SH1_CU_EN(0xffff)); } /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID * and is now per pipe, so it should be handled in the * kernel if we want to use something other than the default value, * which is now 0x22f. */ if (physical_device->rad_info.chip_class <= SI) { /* XXX: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); } }
static void si_initialize_compute(struct si_context *sctx) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff)); if (sctx->b.chip_class >= CIK) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) | S_00B864_SH1_CU_EN(0xffff)); radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) | S_00B868_SH1_CU_EN(0xffff)); } /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID * and is now per pipe, so it should be handled in the * kernel if we want to use something other than the default value, * which is now 0x22f. */ if (sctx->b.chip_class <= SI) { /* XXX: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); } sctx->cs_shader_state.emitted_program = NULL; sctx->cs_shader_state.initialized = true; }
static void evergreen_emit_direct_dispatch( struct r600_context *rctx, const uint *block_layout, const uint *grid_layout) { int i; struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_pipes; unsigned wave_divisor = (16 * num_pipes); int group_size = 1; int grid_size = 1; unsigned lds_size = shader->local_size / 4 + #if HAVE_LLVM < 0x0306 shader->active_kernel->bc.nlds_dw; #else shader->bc.nlds_dw; #endif /* Calculate group_size/grid_size */ for (i = 0; i < 3; i++) { group_size *= block_layout[i]; } for (i = 0; i < 3; i++) { grid_size *= grid_layout[i]; } /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + wave_divisor - 1) / wave_divisor; COMPUTE_DBG(rctx->screen, "Using %u pipes, " "%u wavefronts per thread block, " "allocating %u dwords lds.\n", num_pipes, num_waves, lds_size); radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size); radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ if (rctx->b.chip_class < CAYMAN) { assert(lds_size <= 8192); } else { /* Cayman appears to have a slightly smaller limit, see the * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ assert(lds_size <= 8160); } radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, lds_size | (num_waves << 14)); /* Dispatch packet */ radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); radeon_emit(cs, grid_layout[0]); radeon_emit(cs, grid_layout[1]); radeon_emit(cs, grid_layout[2]); /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ radeon_emit(cs, 1); }
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE | R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_FLUSH_AND_INV_CB | R600_CONTEXT_FLUSH_AND_INV_DB | R600_CONTEXT_FLUSH_AND_INV_CB_META | R600_CONTEXT_FLUSH_AND_INV_DB_META | R600_CONTEXT_STREAMOUT_FLUSH | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. */ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; }
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) { enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class; unsigned cp_coher_cntl = 0; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_SMEM_L1) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); if (chip_class >= VI) cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | S_0085F0_CB7_DEST_BASE_ENA(1); /* Necessary for DCC */ if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) | EVENT_INDEX(5)); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); } } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); } if (!(cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB))) { if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } else if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } /* VGT state sync */ if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); } /* Make sure ME is idle (it executes most packets) before continuing. * This prevents read-after-write hazards between PFP and ME. */ if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cmd_buffer->cs, 0); } /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. * Therefore, it should be last. Done in PFP. */ if (cp_coher_cntl) { /* ACQUIRE_MEM is only required on a compute ring. */ radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */ radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */ radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */ radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */ } cmd_buffer->state.flush_bits = 0; }
void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) { switch (nr_samples) { case 2: radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]); radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]); radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_2x[2]); radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_2x[3]); break; case 4: radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_4x[0]); radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_4x[1]); radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_4x[2]); radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_4x[3]); break; case 8: radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14); radeon_emit(cs, cm_sample_locs_8x[0]); radeon_emit(cs, cm_sample_locs_8x[4]); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, cm_sample_locs_8x[1]); radeon_emit(cs, cm_sample_locs_8x[5]); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, cm_sample_locs_8x[2]); radeon_emit(cs, cm_sample_locs_8x[6]); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, cm_sample_locs_8x[3]); radeon_emit(cs, cm_sample_locs_8x[7]); break; case 16: radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16); radeon_emit(cs, cm_sample_locs_16x[0]); radeon_emit(cs, cm_sample_locs_16x[4]); radeon_emit(cs, cm_sample_locs_16x[8]); radeon_emit(cs, cm_sample_locs_16x[12]); radeon_emit(cs, cm_sample_locs_16x[1]); radeon_emit(cs, cm_sample_locs_16x[5]); radeon_emit(cs, cm_sample_locs_16x[9]); radeon_emit(cs, cm_sample_locs_16x[13]); radeon_emit(cs, cm_sample_locs_16x[2]); radeon_emit(cs, cm_sample_locs_16x[6]); radeon_emit(cs, cm_sample_locs_16x[10]); radeon_emit(cs, cm_sample_locs_16x[14]); radeon_emit(cs, cm_sample_locs_16x[3]); radeon_emit(cs, cm_sample_locs_16x[7]); radeon_emit(cs, cm_sample_locs_16x[11]); radeon_emit(cs, cm_sample_locs_16x[15]); break; } }
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; unsigned i; /* make sure that the gfx ring is only one active */ if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) { ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /* Initialize all the compute-related registers. * * See evergreen_init_atom_start_compute_cs() in this file for the list * of registers initialized by the start_compute_cs_cmd atom. */ r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd); /* emit config state */ if (ctx->b.chip_class == EVERGREEN) r600_emit_atom(ctx, &ctx->config_state.atom); ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; r600_flush_emit(ctx); /* Emit colorbuffers. */ /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, reloc); if (!ctx->keep_tiling_flags) { radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, reloc); } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, reloc); } if (ctx->keep_tiling_flags) { for (; i < 8 ; i++) { radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } for (; i < 12; i++) { radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } } /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, ctx->compute_cb_target_mask); /* Emit vertex buffer state */ ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask); r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom); /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); /* Emit sampler state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom); /* Emit sampler view (texture resource) state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom); /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); /* Emit dispatch state and dispatch packet */ evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout); /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */ ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; r600_flush_emit(ctx); ctx->b.flags = 0; if (ctx->b.chip_class >= CAYMAN) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4); /* DEALLOC_STATE prevents the GPU from hanging when a * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. */ cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0); cs->buf[cs->cdw++] = 0; } #if 0 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw); for (i = 0; i < cs->cdw; i++) { COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); } #endif }
static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs, unsigned flags, struct pipe_fence_handle **pfence) { struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct radeon_cs_context *tmp; switch (cs->ring_type) { case RING_DMA: /* pad DMA ring to 8 DWs */ if (cs->ws->info.chip_class <= GFX6) { while (rcs->current.cdw & 7) radeon_emit(&cs->base, 0xf0000000); /* NOP packet */ } else { while (rcs->current.cdw & 7) radeon_emit(&cs->base, 0x00000000); /* NOP packet */ } break; case RING_GFX: /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements * r6xx, requires at least 4 dw alignment to avoid a hw bug. */ if (cs->ws->info.gfx_ib_pad_with_type2) { while (rcs->current.cdw & 7) radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */ } else { while (rcs->current.cdw & 7) radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */ } break; case RING_UVD: while (rcs->current.cdw & 15) radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */ break; default: break; } if (rcs->current.cdw > rcs->current.max_dw) { fprintf(stderr, "radeon: command stream overflowed\n"); } if (pfence || cs->csc->num_slab_buffers) { struct pipe_fence_handle *fence; if (cs->next_fence) { fence = cs->next_fence; cs->next_fence = NULL; } else { fence = radeon_cs_create_fence(rcs); } if (fence) { if (pfence) radeon_fence_reference(pfence, fence); mtx_lock(&cs->ws->bo_fence_lock); for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) { struct radeon_bo *bo = cs->csc->slab_buffers[i].bo; p_atomic_inc(&bo->num_active_ioctls); radeon_bo_slab_fence(bo, (struct radeon_bo *)fence); } mtx_unlock(&cs->ws->bo_fence_lock); radeon_fence_reference(&fence, NULL); } } else { radeon_fence_reference(&cs->next_fence, NULL); } radeon_drm_cs_sync_flush(rcs); /* Swap command streams. */ tmp = cs->csc; cs->csc = cs->cst; cs->cst = tmp; /* If the CS is not empty or overflowed, emit it in a separate thread. */ if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) { unsigned i, num_relocs; num_relocs = cs->cst->num_relocs; cs->cst->chunks[0].length_dw = cs->base.current.cdw; for (i = 0; i < num_relocs; i++) { /* Update the number of active asynchronous CS ioctls for the buffer. */ p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls); } switch (cs->ring_type) { case RING_DMA: cs->cst->flags[0] = 0; cs->cst->flags[1] = RADEON_CS_RING_DMA; cs->cst->cs.num_chunks = 3; if (cs->ws->info.r600_has_virtual_memory) { cs->cst->flags[0] |= RADEON_CS_USE_VM; } break; case RING_UVD: cs->cst->flags[0] = 0; cs->cst->flags[1] = RADEON_CS_RING_UVD; cs->cst->cs.num_chunks = 3; break; case RING_VCE: cs->cst->flags[0] = 0; cs->cst->flags[1] = RADEON_CS_RING_VCE; cs->cst->cs.num_chunks = 3; break; default: case RING_GFX: case RING_COMPUTE: cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS; cs->cst->flags[1] = RADEON_CS_RING_GFX; cs->cst->cs.num_chunks = 3; if (cs->ws->info.r600_has_virtual_memory) { cs->cst->flags[0] |= RADEON_CS_USE_VM; cs->cst->cs.num_chunks = 3; } if (flags & PIPE_FLUSH_END_OF_FRAME) { cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME; cs->cst->cs.num_chunks = 3; } if (cs->ring_type == RING_COMPUTE) { cs->cst->flags[1] = RADEON_CS_RING_COMPUTE; cs->cst->cs.num_chunks = 3; } break; } if (util_queue_is_initialized(&cs->ws->cs_queue)) { util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed, radeon_drm_cs_emit_ioctl_oneshot, NULL); if (!(flags & PIPE_FLUSH_ASYNC)) radeon_drm_cs_sync_flush(rcs); } else { radeon_drm_cs_emit_ioctl_oneshot(cs, 0); } } else { radeon_cs_context_cleanup(cs->cst); } /* Prepare a new CS. */ cs->base.current.buf = cs->csc->buf; cs->base.current.cdw = 0; cs->base.used_vram = 0; cs->base.used_gart = 0; if (cs->ring_type == RING_GFX) cs->ws->num_gfx_IBs++; else if (cs->ring_type == RING_DMA) cs->ws->num_sdma_IBs++; return 0; }