void si_context_draw(struct r600_context *ctx, const struct r600_draw *draw) { struct radeon_winsys_cs *cs = ctx->cs; unsigned ndwords = 7; uint32_t *pm4; uint64_t va; if (draw->indices) { ndwords = 12; } if (ctx->num_cs_dw_queries_suspend) ndwords += 6; /* when increasing ndwords, bump the max limit too */ assert(ndwords <= SI_MAX_DRAW_CS_DWORDS); /* queries need some special values * (this is non-zero if any query is active) */ if (ctx->num_cs_dw_queries_suspend) { pm4 = &cs->buf[cs->cdw]; pm4[0] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); pm4[1] = (R_028004_DB_COUNT_CONTROL - SI_CONTEXT_REG_OFFSET) >> 2; pm4[2] = S_028004_PERFECT_ZPASS_COUNTS(1); pm4[3] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); pm4[4] = (R_02800C_DB_RENDER_OVERRIDE - SI_CONTEXT_REG_OFFSET) >> 2; pm4[5] = draw->db_render_override | S_02800C_NOOP_CULL_DISABLE(1); cs->cdw += 6; ndwords -= 6; }
static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
{
    struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
    unsigned reg_strmout_cntl;

    /* The register is at different places on different ASICs. */
    if (rctx->chip_class >= CIK) {
        reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
    } else if (rctx->chip_class >= EVERGREEN) {
        reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
    } else {
        reg_strmout_cntl = R_008490_CP_STRMOUT_CNTL;
    }

    if (rctx->chip_class >= CIK) {
        cik_write_uconfig_reg(cs, reg_strmout_cntl, 0);
    } else {
        r600_write_config_reg(cs, reg_strmout_cntl, 0);
    }

    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
    radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
    radeon_emit(cs, reg_strmout_cntl >> 2);              /* register */
    radeon_emit(cs, 0);
    radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1));     /* reference value */
    radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1));     /* mask */
    radeon_emit(cs, 4);                                  /* poll interval */
}
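/* For reference: the PKT3() headers built throughout these functions are plain
 * PM4 type-3 packet headers.  The standalone sketch below packs one under an
 * assumed bit layout (type, count, opcode, predicate); the shift values and the
 * WAIT_REG_MEM opcode are illustrative placeholders, not copies of the driver's
 * generated register headers. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pkt3_header(unsigned opcode, unsigned count, unsigned predicate)
{
    return (3u << 30) |                /* packet type 3 */
           ((count & 0x3fffu) << 16) | /* body dwords minus one */
           ((opcode & 0xffu) << 8) |   /* IT opcode, e.g. WAIT_REG_MEM */
           (predicate & 0x1u);         /* predicated-execution bit */
}

int main(void)
{
    /* WAIT_REG_MEM above has 6 body dwords, hence the count of 5. */
    printf("0x%08" PRIx32 "\n", pkt3_header(0x3c /* assumed WAIT_REG_MEM opcode */, 5, 0));
    return 0;
}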
void r600_emit_pfp_sync_me(struct r600_context *rctx) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; if (rctx->b.chip_class >= EVERGREEN && rctx->b.screen->info.drm_minor >= 46) { radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); } else { /* Emulate PFP_SYNC_ME by writing a value to memory in ME and * waiting for it in PFP. */ struct r600_resource *buf = NULL; unsigned offset, reloc; uint64_t va; /* 16-byte address alignment is required by WAIT_REG_MEM. */ u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16, &offset, (struct pipe_resource**)&buf); if (!buf) { /* This is too heavyweight, but will work. */ rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); return; } reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf, RADEON_USAGE_READWRITE, RADEON_PRIO_FENCE); va = buf->gpu_address + offset; assert(va % 16 == 0); /* Write 1 to memory in ME. */ radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); radeon_emit(cs, va); radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); radeon_emit(cs, 1); radeon_emit(cs, 0); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, reloc); /* Wait in PFP (PFP can only do GEQUAL against memory). */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); radeon_emit(cs, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | WAIT_REG_MEM_PFP); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit(cs, 1); /* reference value */ radeon_emit(cs, 0xffffffff); /* mask */ radeon_emit(cs, 4); /* poll interval */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, reloc); r600_resource_reference(&buf, NULL); } }
void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
{
    struct radeon_winsys_cs *cs = ctx->cs;

    if (buffer_enable_bit) {
        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = S_028B94_STREAMOUT_0_EN(1);

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit);
    } else {
        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = S_028B94_STREAMOUT_0_EN(0);
    }
}
static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { struct radeon_winsys_cs *cs = cmd_buffer->cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; assert(size); assert((size & ((1<<21)-1)) == size); radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else {
static void si_reinitialize_ce_ram(struct si_context *sctx, struct si_descriptors *desc)
{
    if (desc->buffer) {
        struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
        unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
        uint64_t va = buffer->gpu_address + desc->buffer_offset;
        struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;

        if (!ib)
            ib = sctx->ce_ib;

        list_size = align(list_size, 32);

        radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
        radeon_emit(ib, va);
        radeon_emit(ib, va >> 32);
        radeon_emit(ib, list_size / 4);
        radeon_emit(ib, desc->ce_offset);

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                                  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    }
    desc->ce_ram_dirty = false;
}
void si_ce_enable_loads(struct radeon_winsys_cs *ib)
{
    radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
    radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | CONTEXT_CONTROL_LOAD_CE_RAM(1));
    radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
}
void evergreen_flush_vgt_streamout(struct r600_context *ctx)
{
    struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;

    r600_write_config_reg(cs, R_0084FC_CP_STRMOUT_CNTL, 0);

    cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
    cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);

    cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
    cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
    cs->buf[cs->cdw++] = R_0084FC_CP_STRMOUT_CNTL >> 2;  /* register */
    cs->buf[cs->cdw++] = 0;
    cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
    cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
    cs->buf[cs->cdw++] = 4;                              /* poll interval */
}
void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
    unsigned count;

    count = state->ndw - state->last_pm4 - 2;
    state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);

    assert(state->ndw <= SI_PM4_MAX_DW);
}
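/* The "ndw - last_pm4 - 2" above is the count-field encoding: the field stores
 * the number of dwords after the header minus one, so one is subtracted for the
 * header slot itself and one more for the bias.  A minimal, self-contained
 * sketch of that begin/emit/end bookkeeping; the names and the header layout
 * are illustrative assumptions, not the real si_pm4 API. */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DW 64

struct toy_pm4_state {
    uint32_t pm4[MAX_DW];
    unsigned ndw;          /* dwords written so far */
    unsigned last_pm4;     /* index of the current packet's header */
    unsigned last_opcode;
};

static uint32_t toy_pkt3(unsigned op, unsigned count, unsigned predicate)
{
    return (3u << 30) | ((count & 0x3fffu) << 16) | ((op & 0xffu) << 8) | (predicate & 1u);
}

static void toy_cmd_begin(struct toy_pm4_state *s, unsigned opcode)
{
    s->last_opcode = opcode;
    s->last_pm4 = s->ndw++;  /* reserve a slot for the header */
}

static void toy_cmd_add(struct toy_pm4_state *s, uint32_t dw)
{
    s->pm4[s->ndw++] = dw;
}

static void toy_cmd_end(struct toy_pm4_state *s, int predicate)
{
    /* body dwords = ndw - last_pm4 - 1; the header stores body - 1 */
    unsigned count = s->ndw - s->last_pm4 - 2;

    s->pm4[s->last_pm4] = toy_pkt3(s->last_opcode, count, predicate);
    assert(s->ndw <= MAX_DW);
}

int main(void)
{
    struct toy_pm4_state s = {0};

    toy_cmd_begin(&s, 0x69 /* assumed SET_CONTEXT_REG opcode */);
    toy_cmd_add(&s, 0x0);    /* register offset */
    toy_cmd_add(&s, 0x1234); /* register value */
    toy_cmd_end(&s, 0);

    printf("header=0x%08" PRIx32 " ndw=%u\n", s.pm4[0], s.ndw);
    return 0;
}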
static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) { unsigned list_size = desc->num_elements * desc->element_dw_size * 4; if (!desc->dirty_mask) return true; if (sctx->ce_ib) { uint32_t const* list = (uint32_t const*)desc->list; if (desc->ce_ram_dirty) si_reinitialize_ce_ram(sctx, desc); while(desc->dirty_mask) { int begin, count; u_bit_scan_consecutive_range(&desc->dirty_mask, &begin, &count); begin *= desc->element_dw_size; count *= desc->element_dw_size; radeon_emit(sctx->ce_ib, PKT3(PKT3_WRITE_CONST_RAM, count, 0)); radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); radeon_emit_array(sctx->ce_ib, list + begin, count); } if (!si_ce_upload(sctx, desc->ce_offset, list_size, &desc->buffer_offset, &desc->buffer)) return false; } else { void *ptr; u_upload_alloc(sctx->b.uploader, 0, list_size, 256, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, &ptr); if (!desc->buffer) return false; /* skip the draw call */ util_memcpy_cpu_to_le32(ptr, desc->list, list_size); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } desc->pointer_dirty = true; desc->dirty_mask = 0; if (atom) si_mark_atom_dirty(sctx, atom); return true; }
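/* The dirty-mask loop above relies on u_bit_scan_consecutive_range() to turn
 * each run of set bits into a single WRITE_CONST_RAM packet.  The sketch below
 * is a simplified stand-in for that helper (it uses GCC/Clang builtins and does
 * not handle a fully-set 32-bit mask the way the real util function does),
 * showing how consecutive dirty element ranges fall out of the mask. */
#include <stdio.h>

static void scan_consecutive_range(unsigned *mask, int *start, int *count)
{
    *start = __builtin_ctz(*mask);              /* index of the lowest set bit */
    *count = __builtin_ctz(~(*mask >> *start)); /* length of that run of 1s */
    *mask &= ~(((1u << *count) - 1) << *start); /* pop the run from the mask */
}

int main(void)
{
    unsigned dirty_mask = 0x0000f0c3; /* elements 0-1, 6-7 and 12-15 are dirty */

    while (dirty_mask) {
        int begin, count;

        scan_consecutive_range(&dirty_mask, &begin, &count);
        /* One packet would cover elements [begin, begin + count). */
        printf("upload elements %d..%d\n", begin, begin + count - 1);
    }
    return 0;
}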
static inline void evergreen_context_ps_partial_flush(struct r600_context *ctx)
{
    struct radeon_winsys_cs *cs = ctx->cs;

    if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
        return;

    cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
    cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

    ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
}
void r600_gfx_wait_fence(struct r600_common_context *ctx,
                         uint64_t va, uint32_t ref, uint32_t mask)
{
    struct radeon_winsys_cs *cs = ctx->gfx.cs;

    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
    radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
    radeon_emit(cs, va);
    radeon_emit(cs, va >> 32);
    radeon_emit(cs, ref);  /* reference value */
    radeon_emit(cs, mask); /* mask */
    radeon_emit(cs, 4);    /* poll interval */
}
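/* What this packet asks the CP to do is essentially a masked-compare poll loop.
 * A CPU-side illustration of the same WAIT_REG_MEM_EQUAL test, with an ordinary
 * volatile read standing in for the fence dword the GPU writes at 'va'. */
#include <stdint.h>
#include <stdio.h>

static void wait_fence_equal(const volatile uint32_t *fence, uint32_t ref, uint32_t mask)
{
    while ((*fence & mask) != ref)
        ; /* the CP re-reads the location every "poll interval" cycles */
}

int main(void)
{
    uint32_t fence_memory = 0x1; /* pretend the GPU already wrote the fence */

    wait_fence_equal(&fence_memory, 0x1, 0xffffffff);
    puts("fence reached the reference value");
    return 0;
}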
static void r600_emit_surface_sync(struct r600_context *rctx, struct r600_atom *atom)
{
    struct radeon_winsys_cs *cs = rctx->cs;
    struct r600_atom_surface_sync *a = (struct r600_atom_surface_sync*)atom;

    cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
    cs->buf[cs->cdw++] = a->flush_flags; /* CP_COHER_CNTL */
    cs->buf[cs->cdw++] = 0xffffffff;     /* CP_COHER_SIZE */
    cs->buf[cs->cdw++] = 0;              /* CP_COHER_BASE */
    cs->buf[cs->cdw++] = 0x0000000A;     /* POLL_INTERVAL */

    a->flush_flags = 0;
}
void si_trace_emit(struct si_context *sctx) { struct si_screen *sscreen = sctx->screen; struct radeon_winsys_cs *cs = sctx->cs; uint64_t va; va = r600_resource_va(&sscreen->screen, (void*)sscreen->b.trace_bo); r600_context_bo_reloc(sctx, sscreen->b.trace_bo, RADEON_USAGE_READWRITE); cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0); cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) | PKT3_WRITE_DATA_WR_CONFIRM | PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME); cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL; cs->buf[cs->cdw++] = cs->cdw; cs->buf[cs->cdw++] = sscreen->b.cs_count; }
/* Emit a CP DMA packet to do a copy from one buffer to another. * The size must fit in bits [20:0]. */ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; assert(size); assert((size & ((1<<21)-1)) == size); if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, sync_flag); /* CP_SYNC [31] */ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else {
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
                         unsigned *out_offset, struct r600_resource **out_buf)
{
    uint64_t va;

    u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
                         (struct pipe_resource**)out_buf);
    if (!*out_buf)
        return false;

    va = (*out_buf)->gpu_address + *out_offset;

    radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
    radeon_emit(sctx->ce_ib, ce_offset);
    radeon_emit(sctx->ce_ib, size / 4);
    radeon_emit(sctx->ce_ib, va);
    radeon_emit(sctx->ce_ib, va >> 32);

    radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);

    sctx->ce_need_synchronization = true;
    return true;
}
/**
 * Write an EOP event.
 *
 * \param event       EVENT_TYPE_*
 * \param event_flags Optional cache flush flags (TC)
 * \param data_sel    1 = fence, 3 = timestamp
 * \param buf         Buffer
 * \param va          GPU address
 * \param new_fence   Fence value to write for this event.
 * \param query_type  Query type (not used by this function).
 */
void r600_gfx_write_event_eop(struct r600_common_context *ctx,
                              unsigned event, unsigned event_flags,
                              unsigned data_sel,
                              struct r600_resource *buf, uint64_t va,
                              uint32_t new_fence, unsigned query_type)
{
    struct radeon_winsys_cs *cs = ctx->gfx.cs;
    unsigned op = EVENT_TYPE(event) |
                  EVENT_INDEX(5) |
                  event_flags;
    unsigned sel = EOP_DATA_SEL(data_sel);

    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
    radeon_emit(cs, op);
    radeon_emit(cs, va);
    radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
    radeon_emit(cs, new_fence); /* immediate data */
    radeon_emit(cs, 0);         /* unused */

    if (buf)
        r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
}
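/* Note how the 64-bit address is split across two body dwords, with the
 * data-select bits folded into the high half.  A small demonstration of that
 * packing and unpacking; the selector shift used here is an assumed
 * placeholder, not the real EOP_DATA_SEL() definition. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t va = 0x0000123456789abcULL; /* example GPU address */
    unsigned data_sel = 3;               /* 3 = timestamp, per the comment above */

    uint32_t addr_lo = (uint32_t)va;
    uint32_t addr_hi = (uint32_t)((va >> 32) & 0xffff) | (data_sel << 29);

    uint64_t unpacked = ((uint64_t)(addr_hi & 0xffff) << 32) | addr_lo;

    printf("lo=0x%08" PRIx32 " hi=0x%08" PRIx32 " va=0x%012" PRIx64 "\n",
           addr_lo, addr_hi, unpacked);
    return 0;
}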
/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit * clear value. */ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags, enum r600_coherency coher) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t header = 0, command = S_414_BYTE_COUNT(size); assert(size); assert(size <= CP_DMA_MAX_BYTE_COUNT); /* Sync flags. */ if (flags & CP_DMA_SYNC) header |= S_411_CP_SYNC(1); else command |= S_414_DISABLE_WR_CONFIRM(1); if (flags & CP_DMA_RAW_WAIT) command |= S_414_RAW_WAIT(1); /* Src and dst flags. */ if (flags & CP_DMA_USE_L2) header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2); if (flags & CP_DMA_CLEAR) header |= S_411_SRC_SEL(V_411_DATA); else if (flags & CP_DMA_USE_L2) header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, header); radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, command); } else {
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; for (int i = 0; i < state->nbo; ++i) { radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i], state->bo_usage[i], state->bo_priority[i]); } if (!state->indirect_buffer) { radeon_emit_array(cs, state->pm4, state->ndw); } else { struct r600_resource *ib = state->indirect_buffer; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); radeon_emit(cs, ib->gpu_address); radeon_emit(cs, ib->gpu_address >> 32); radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); } }
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; unsigned flush_flags = 0; int i; /* make sure that the gfx ring is only one active */ if (ctx->rings.dma.cs) { ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC); } /* Initialize all the compute-related registers. * * See evergreen_init_atom_start_compute_cs() in this file for the list * of registers initialized by the start_compute_cs_cmd atom. */ r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd); ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; r600_flush_emit(ctx); /* Emit colorbuffers. */ for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE); r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); r600_write_value(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ r600_write_value(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ r600_write_value(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ r600_write_value(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ r600_write_value(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ r600_write_value(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ r600_write_value(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ r600_write_value(cs, reloc); if (!ctx->keep_tiling_flags) { r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */ r600_write_value(cs, reloc); } r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ r600_write_value(cs, reloc); } /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK, ctx->compute_cb_target_mask); /* Emit vertex buffer state */ ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask); r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom); /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); /* Emit dispatch state and dispatch packet */ evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout); /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */ ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES; r600_flush_emit(ctx); #if 0 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw); for (i = 0; i < cs->cdw; i++) { COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, ctx->cs->buf[i]); } #endif flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE; if (ctx->keep_tiling_flags) { flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; } ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++); ctx->flags = 0; COMPUTE_DBG(ctx->screen, "shader started\n"); }
void r600_flush_emit(struct r600_context *rctx) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned cp_coher_cntl = 0; unsigned wait_until = 0; if (!rctx->b.flags) { return; } if (rctx->b.flags & R600_CONTEXT_WAIT_3D_IDLE) { wait_until |= S_008040_WAIT_3D_IDLE(1); } if (rctx->b.flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) { wait_until |= S_008040_WAIT_CP_DMA_IDLE(1); } if (wait_until) { /* Use of WAIT_UNTIL is deprecated on Cayman+ */ if (rctx->b.family >= CHIP_CAYMAN) { /* emit a PS partial flush on Cayman/TN */ rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH; } } if (rctx->b.flags & R600_CONTEXT_PS_PARTIAL_FLUSH) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); } if (rctx->b.chip_class >= R700 && (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0); } if (rctx->b.chip_class >= R700 && (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0); /* Set FULL_CACHE_ENA for DB META flushes on r7xx and later. * * This hack predates use of FLUSH_AND_INV_DB_META, so it's * unclear whether it's still needed or even whether it has * any effect. */ cp_coher_cntl |= S_0085F0_FULL_CACHE_ENA(1); } if (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV || (rctx->b.chip_class == R600 && rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0); } if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) { /* Direct constant addressing uses the shader cache. * Indirect contant addressing uses the vertex cache. */ cp_coher_cntl |= S_0085F0_SH_ACTION_ENA(1) | (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : S_0085F0_TC_ACTION_ENA(1)); } if (rctx->b.flags & R600_CONTEXT_INV_VERTEX_CACHE) { cp_coher_cntl |= rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : S_0085F0_TC_ACTION_ENA(1); } if (rctx->b.flags & R600_CONTEXT_INV_TEX_CACHE) { /* Textures use the texture cache. * Texture buffer objects use the vertex cache. */ cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) | (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : 0); } /* Don't use the DB CP COHER logic on r6xx. * There are hw bugs. */ if (rctx->b.chip_class >= R700 && (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB)) { cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) | S_0085F0_SMX_ACTION_ENA(1); } /* Don't use the CB CP COHER logic on r6xx. * There are hw bugs. 
*/ if (rctx->b.chip_class >= R700 && (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB)) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | S_0085F0_CB7_DEST_BASE_ENA(1) | S_0085F0_SMX_ACTION_ENA(1); if (rctx->b.chip_class >= EVERGREEN) cp_coher_cntl |= S_0085F0_CB8_DEST_BASE_ENA(1) | S_0085F0_CB9_DEST_BASE_ENA(1) | S_0085F0_CB10_DEST_BASE_ENA(1) | S_0085F0_CB11_DEST_BASE_ENA(1); } if (rctx->b.chip_class >= R700 && rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH) { cp_coher_cntl |= S_0085F0_SO0_DEST_BASE_ENA(1) | S_0085F0_SO1_DEST_BASE_ENA(1) | S_0085F0_SO2_DEST_BASE_ENA(1) | S_0085F0_SO3_DEST_BASE_ENA(1) | S_0085F0_SMX_ACTION_ENA(1); } /* Workaround for buggy flushing on some R6xx chipsets. */ if ((rctx->b.flags & (R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_STREAMOUT_FLUSH)) && (rctx->b.family == CHIP_RV670 || rctx->b.family == CHIP_RS780 || rctx->b.family == CHIP_RS880)) { cp_coher_cntl |= S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_DEST_BASE_0_ENA(1); } if (cp_coher_cntl) { cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0); cs->buf[cs->cdw++] = cp_coher_cntl; /* CP_COHER_CNTL */ cs->buf[cs->cdw++] = 0xffffffff; /* CP_COHER_SIZE */ cs->buf[cs->cdw++] = 0; /* CP_COHER_BASE */ cs->buf[cs->cdw++] = 0x0000000A; /* POLL_INTERVAL */ } if (wait_until) { /* Use of WAIT_UNTIL is deprecated on Cayman+ */ if (rctx->b.family < CHIP_CAYMAN) { /* wait for things to settle */ radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, wait_until); } } /* everything is properly flushed */ rctx->b.flags = 0; }
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE | R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_FLUSH_AND_INV_CB | R600_CONTEXT_FLUSH_AND_INV_DB | R600_CONTEXT_FLUSH_AND_INV_CB_META | R600_CONTEXT_FLUSH_AND_INV_DB_META | R600_CONTEXT_STREAMOUT_FLUSH | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. */ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; }
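/* Because the packet's BYTE_COUNT field is only 21 bits wide, large copies are
 * chunked and only the final chunk requests CP_SYNC.  The control flow,
 * separated from the actual packet emission, looks like this; emit_one_copy()
 * and the exact maximum byte count are illustrative assumptions. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DMA_BYTE_COUNT ((1u << 21) - 8) /* assumed limit below the 21-bit field */

static unsigned min2(unsigned a, unsigned b) { return a < b ? a : b; }

static void emit_one_copy(uint64_t dst, uint64_t src, unsigned bytes, bool sync)
{
    printf("copy %7u bytes  src=0x%010" PRIx64 "  dst=0x%010" PRIx64 "  sync=%d\n",
           bytes, src, dst, sync);
}

static void copy_buffer(uint64_t dst, uint64_t src, unsigned size)
{
    while (size) {
        unsigned byte_count = min2(size, MAX_DMA_BYTE_COUNT);
        bool last_chunk = (size == byte_count);

        /* Synchronize only after the last chunk, as in the loop above. */
        emit_one_copy(dst, src, byte_count, last_chunk);

        size -= byte_count;
        src += byte_count;
        dst += byte_count;
    }
}

int main(void)
{
    copy_buffer(0x200000000ULL, 0x100000000ULL, 5u << 20); /* 5 MiB copy */
    return 0;
}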
void si_init_config(struct radv_physical_device *physical_device, struct radv_cmd_buffer *cmd_buffer) { unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16); unsigned rb_mask = physical_device->rad_info.enabled_rb_mask; unsigned raster_config, raster_config_1; int i; struct radeon_winsys_cs *cs = cmd_buffer->cs; radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); /* FIXME calculate these values somehow ??? */ radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40); radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2); radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0); if (physical_device->rad_info.chip_class < CIK) radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); radeon_set_context_reg(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210); radeon_set_context_reg(cs, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98); radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); for (i = 0; i < 16; i++) { radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0); radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0)); } switch (physical_device->rad_info.family) { case CHIP_TAHITI: case CHIP_PITCAIRN: raster_config = 0x2a00126a; raster_config_1 = 0x00000000; break; case CHIP_VERDE: raster_config = 0x0000124a; raster_config_1 = 0x00000000; break; case CHIP_OLAND: raster_config = 0x00000082; raster_config_1 = 0x00000000; break; case CHIP_HAINAN: raster_config = 0x00000000; raster_config_1 = 0x00000000; break; case CHIP_BONAIRE: raster_config = 0x16000012; raster_config_1 = 0x00000000; break; case CHIP_HAWAII: raster_config = 0x3a00161a; raster_config_1 = 0x0000002e; break; case CHIP_FIJI: if (physical_device->rad_info.cik_macrotile_mode_array[0] == 0x000000e8) { /* old kernels with old tiling config */ raster_config = 0x16000012; raster_config_1 = 0x0000002a; } else { raster_config = 0x3a00161a; raster_config_1 = 0x0000002e; } break; case CHIP_POLARIS10: raster_config = 0x16000012; raster_config_1 = 0x0000002a; break; case CHIP_POLARIS11: raster_config = 0x16000012; raster_config_1 = 0x00000000; break; case CHIP_TONGA: raster_config = 0x16000012; raster_config_1 = 0x0000002a; break; case CHIP_ICELAND: if (num_rb == 1) raster_config = 0x00000000; else raster_config = 0x00000002; raster_config_1 = 0x00000000; break; case CHIP_CARRIZO: raster_config = 0x00000002; raster_config_1 = 0x00000000; break; case CHIP_KAVERI: /* KV should be 0x00000002, but that causes problems with radeon */ raster_config = 0x00000000; /* 0x00000002 */ raster_config_1 = 0x00000000; break; case CHIP_KABINI: case CHIP_MULLINS: case CHIP_STONEY: raster_config = 0x00000000; raster_config_1 = 0x00000000; break; default: fprintf(stderr, "radeonsi: Unknown GPU, using 0 for raster_config\n"); raster_config = 0x00000000; raster_config_1 = 0x00000000; break; } /* Always use the default config when all backends are enabled * (or when we failed to determine the enabled backends). 
*/ if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config); if (physical_device->rad_info.chip_class >= CIK) radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); } else { si_write_harvested_raster_configs(physical_device, cs, raster_config, raster_config_1); } radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR, S_028034_BR_X(16384) | S_028034_BR_Y(16384)); radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF); radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA); /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */ radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0); radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0); radeon_set_context_reg(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, fui(1.0)); radeon_set_context_reg(cs, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); radeon_set_context_reg(cs, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); radeon_set_context_reg(cs, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0); radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE, S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE)); radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0); radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0); radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0); if (physical_device->rad_info.chip_class >= CIK) { radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0); radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff)); radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff)); if (physical_device->rad_info.num_good_compute_units / (physical_device->rad_info.max_se * physical_device->rad_info.max_sh_per_se) <= 4) { /* Too few available compute units per SH. Disallowing * VS to run on CU0 could hurt us more than late VS * allocation would help. * * LATE_ALLOC_VS = 2 is the highest safe number. */ radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff)); radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2)); } else { /* Set LATE_ALLOC_VS == 31. It should be less than * the number of scratch waves. Limitations: * - VS can't execute on CU0. * - If HS writes outputs to LDS, LS can't execute on CU0. 
*/ radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffe)); radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe)); radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31)); } radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff)); } if (physical_device->rad_info.chip_class >= VI) { radeon_set_context_reg(cs, R_028424_CB_DCC_CONTROL, S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) | S_028424_OVERWRITE_COMBINER_WATERMARK(4)); radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30); radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION, S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) | S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16)); } else { radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); } if (physical_device->rad_info.family == CHIP_STONEY) radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0); si_init_compute(physical_device, cs); }
static void r600_emit_r6xx_flush_and_inv(struct r600_context *rctx, struct r600_atom *atom)
{
    struct radeon_winsys_cs *cs = rctx->cs;

    cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
    cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
}
/** * This function initializes all the compute specific registers that need to * be initialized for each compute command stream. Registers that are common * to both compute and 3D will be initialized at the beginning of each compute * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG * packet requires that the shader type bit be set, we must initialize all * context registers needed for compute in this function. The registers * intialized by the start_cs_cmd atom can be found in evereen_state.c in the * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending * on the GPU family. */ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx) { struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd; int num_threads; int num_stack_entries; /* since all required registers are initialised in the * start_compute_cs_cmd atom, we can EMIT_EARLY here. */ r600_init_command_buffer(cb, 256); cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; /* This must be first. */ r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); r600_store_value(cb, 0x80000000); r600_store_value(cb, 0x80000000); /* We're setting config registers here. */ r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); switch (ctx->family) { case CHIP_CEDAR: default: num_threads = 128; num_stack_entries = 256; break; case CHIP_REDWOOD: num_threads = 128; num_stack_entries = 256; break; case CHIP_JUNIPER: num_threads = 128; num_stack_entries = 512; break; case CHIP_CYPRESS: case CHIP_HEMLOCK: num_threads = 128; num_stack_entries = 512; break; case CHIP_PALM: num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO: num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO2: num_threads = 128; num_stack_entries = 512; break; case CHIP_BARTS: num_threads = 128; num_stack_entries = 512; break; case CHIP_TURKS: num_threads = 128; num_stack_entries = 256; break; case CHIP_CAICOS: num_threads = 128; num_stack_entries = 256; break; } /* Config Registers */ if (ctx->chip_class < CAYMAN) evergreen_init_common_regs(cb, ctx->chip_class, ctx->family, ctx->screen->info.drm_minor); else cayman_init_common_regs(cb, ctx->chip_class, ctx->family, ctx->screen->info.drm_minor); /* The primitive type always needs to be POINTLIST for compute. */ r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST); if (ctx->chip_class < CAYMAN) { /* These registers control which simds can be used by each stage. * The default for these registers is 0xffffffff, which means * all simds are available for each stage. It's possible we may * want to play around with these in the future, but for now * the default value is fine. * * R_008E20_SQ_STATIC_THREAD_MGMT1 * R_008E24_SQ_STATIC_THREAD_MGMT2 * R_008E28_SQ_STATIC_THREAD_MGMT3 */ /* XXX: We may need to adjust the thread and stack resouce * values for 3D/compute interop */ r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 * Set the number of threads used by the PS/VS/GS/ES stage to * 0. */ r600_store_value(cb, 0); /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 * Set the number of threads used by the CS (aka LS) stage to * the maximum number of threads and set the number of threads * for the HS stage to 0. 
         */
        r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

        /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
         * Set the Control Flow stack entries to 0 for PS/VS stages */
        r600_store_value(cb, 0);

        /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
         * Set the Control Flow stack entries to 0 for GS/ES stages */
        r600_store_value(cb, 0);

        /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
         * Set the Control Flow stack entries to 0 for the HS stage, and
         * set it to the maximum value for the CS (aka LS) stage. */
        r600_store_value(cb, S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
    }

    /* Context Registers */
    if (ctx->chip_class < CAYMAN) {
        /* workaround for hw issues with dyn gpr - must set all limits
         * to 240 instead of 0, 0x1e == 240 / 8 */
        r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
                               S_028838_PS_GPRS(0x1e) |
                               S_028838_VS_GPRS(0x1e) |
                               S_028838_GS_GPRS(0x1e) |
                               S_028838_ES_GPRS(0x1e) |
                               S_028838_HS_GPRS(0x1e) |
                               S_028838_LS_GPRS(0x1e));
    }

    /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
    r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
                           S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

    r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2 /* CS_ON */);

    r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
                           S_0286E8_TID_IN_GROUP_ENA |
                           S_0286E8_TGID_ENA |
                           S_0286E8_DISABLE_INDEX_PACK);

    /* The LOOP_CONST registers are an optimization for loops that allows you to
     * store the initial counter, increment value, and maximum counter value in
     * a register so that hardware can calculate the correct number of
     * iterations for the loop, so that you don't need to have the loop counter
     * in your shader code.  We don't currently use this optimization, so we
     * must keep track of the counter in the shader and use a break instruction
     * to exit loops.  However, the hardware will still use this register to
     * determine when to exit a loop, so we need to initialize the counter to 0,
     * set the increment value to 1 and the maximum counter value to 4095
     * (0xfff), which is the maximum value allowed.  This gives us a maximum of
     * 4096 iterations for our loops, but hopefully our break instruction will
     * execute some time before the 4096th iteration. */
    eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) { enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class; unsigned cp_coher_cntl = 0; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_SMEM_L1) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); if (chip_class >= VI) cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | S_0085F0_CB7_DEST_BASE_ENA(1); /* Necessary for DCC */ if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) | EVENT_INDEX(5)); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); radeon_emit(cmd_buffer->cs, 0); } } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); } if (!(cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB))) { if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } else if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } } if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } /* VGT state sync */ if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); } /* Make sure ME is idle (it executes most packets) before continuing. * This prevents read-after-write hazards between PFP and ME. */ if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cmd_buffer->cs, 0); } /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. * Therefore, it should be last. Done in PFP. 
*/ if (cp_coher_cntl) { /* ACQUIRE_MEM is only required on a compute ring. */ radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */ radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */ radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */ radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */ } cmd_buffer->state.flush_bits = 0; }
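/* Both this function and r600_flush_emit() follow the same shape: fold the
 * pending flush bits into one CP_COHER_CNTL word, emit the events that need
 * their own EVENT_WRITE packets, and finish with a single SURFACE_SYNC if the
 * word ended up non-zero.  A reduced sketch of the accumulation step, using
 * hypothetical flag and field names rather than the real register headers. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_INV_ICACHE        (1u << 0)
#define FLAG_INV_SMEM_L1       (1u << 1)
#define FLAG_INV_VMEM_L1       (1u << 2)
#define FLAG_FLUSH_AND_INV_CB  (1u << 3)

#define CNTL_SH_ICACHE_ACTION_ENA (1u << 0)
#define CNTL_SH_KCACHE_ACTION_ENA (1u << 1)
#define CNTL_TCL1_ACTION_ENA      (1u << 2)
#define CNTL_CB_ACTION_ENA        (1u << 3)

static uint32_t flush_bits_to_coher_cntl(uint32_t flush_bits)
{
    uint32_t cp_coher_cntl = 0;

    if (flush_bits & FLAG_INV_ICACHE)
        cp_coher_cntl |= CNTL_SH_ICACHE_ACTION_ENA;
    if (flush_bits & FLAG_INV_SMEM_L1)
        cp_coher_cntl |= CNTL_SH_KCACHE_ACTION_ENA;
    if (flush_bits & FLAG_INV_VMEM_L1)
        cp_coher_cntl |= CNTL_TCL1_ACTION_ENA;
    if (flush_bits & FLAG_FLUSH_AND_INV_CB)
        cp_coher_cntl |= CNTL_CB_ACTION_ENA;

    return cp_coher_cntl;
}

int main(void)
{
    uint32_t cntl = flush_bits_to_coher_cntl(FLAG_INV_ICACHE | FLAG_INV_VMEM_L1);

    /* A non-zero result means one trailing SURFACE_SYNC packet gets emitted. */
    printf("CP_COHER_CNTL = 0x%08" PRIx32 " -> %s\n", cntl,
           cntl ? "emit SURFACE_SYNC" : "nothing to flush");
    return 0;
}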
* (this is non-zero if any query is active) */ if (ctx->num_cs_dw_queries_suspend) { pm4 = &cs->buf[cs->cdw]; pm4[0] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); pm4[1] = (R_028004_DB_COUNT_CONTROL - SI_CONTEXT_REG_OFFSET) >> 2; pm4[2] = S_028004_PERFECT_ZPASS_COUNTS(1); pm4[3] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); pm4[4] = (R_02800C_DB_RENDER_OVERRIDE - SI_CONTEXT_REG_OFFSET) >> 2; pm4[5] = draw->db_render_override | S_02800C_NOOP_CULL_DISABLE(1); cs->cdw += 6; ndwords -= 6; } /* draw packet */ pm4 = &cs->buf[cs->cdw]; pm4[0] = PKT3(PKT3_INDEX_TYPE, 0, ctx->predicate_drawing); pm4[1] = draw->vgt_index_type; pm4[2] = PKT3(PKT3_NUM_INSTANCES, 0, ctx->predicate_drawing); pm4[3] = draw->vgt_num_instances; if (draw->indices) { va = r600_resource_va(&ctx->screen->screen, (void*)draw->indices); va += draw->indices_bo_offset; pm4[4] = PKT3(PKT3_DRAW_INDEX_2, 4, ctx->predicate_drawing); pm4[5] = (draw->indices->b.b.width0 - draw->indices_bo_offset) / ctx->index_buffer.index_size; pm4[6] = va; pm4[7] = (va >> 32UL) & 0xFF; pm4[8] = draw->vgt_num_indices; pm4[9] = draw->vgt_draw_initiator; pm4[10] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing); pm4[11] = r600_context_bo_reloc(ctx, draw->indices, RADEON_USAGE_READ);
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. */ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */ if (rctx->b.chip_class == R600) radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_CP_DMA_IDLE(1)); /* CP DMA is executed in ME, but index buffers are read by PFP. * This ensures that ME (CP DMA) is idle before PFP starts fetching * indices. If we wanted to execute CP DMA in PFP, this packet * should precede it. */ r600_emit_pfp_sync_me(rctx); }
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; unsigned i; /* make sure that the gfx ring is only one active */ if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) { ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /* Initialize all the compute-related registers. * * See evergreen_init_atom_start_compute_cs() in this file for the list * of registers initialized by the start_compute_cs_cmd atom. */ r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd); /* emit config state */ if (ctx->b.chip_class == EVERGREEN) r600_emit_atom(ctx, &ctx->config_state.atom); ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; r600_flush_emit(ctx); /* Emit colorbuffers. */ /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, reloc); if (!ctx->keep_tiling_flags) { radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, reloc); } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, reloc); } if (ctx->keep_tiling_flags) { for (; i < 8 ; i++) { radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } for (; i < 12; i++) { radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } } /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, ctx->compute_cb_target_mask); /* Emit vertex buffer state */ ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask); r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom); /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); /* Emit sampler state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom); /* Emit sampler view (texture resource) state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom); /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); /* Emit dispatch state and dispatch packet */ evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout); /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */ ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; r600_flush_emit(ctx); ctx->b.flags = 0; if (ctx->b.chip_class >= CAYMAN) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = 
EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4); /* DEALLOC_STATE prevents the GPU from hanging when a * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. */ cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0); cs->buf[cs->cdw++] = 0; } #if 0 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw); for (i = 0; i < cs->cdw; i++) { COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); } #endif }