static bool r600_resource_commit(struct pipe_context *pctx,
                                 struct pipe_resource *resource,
                                 unsigned level, struct pipe_box *box,
                                 bool commit)
{
    struct r600_common_context *ctx = (struct r600_common_context *)pctx;
    struct r600_resource *res = r600_resource(resource);

    /*
     * Since buffer commitment changes cannot be pipelined, we need to
     * (a) flush any pending commands that refer to the buffer we're about
     *     to change, and
     * (b) wait for threaded submit to finish, including those that were
     *     triggered by some other, earlier operation.
     */
    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
                                         res->buf, RADEON_USAGE_READWRITE)) {
        ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
    }
    if (radeon_emitted(ctx->dma.cs, 0) &&
        ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
                                         res->buf, RADEON_USAGE_READWRITE)) {
        ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
    }

    ctx->ws->cs_sync_flush(ctx->dma.cs);
    ctx->ws->cs_sync_flush(ctx->gfx.cs);

    assert(resource->target == PIPE_BUFFER);

    return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
}
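/* For reference, a minimal sketch of what radeon_emitted() is assumed to do
 * in the functions here: report whether a CS holds more dwords than a given
 * baseline. The gfx ring passes ctx->initial_gfx_cs_size so the state
 * preamble written by begin_new_cs does not count as pending work; the DMA
 * ring has no preamble and passes 0. The exact radeon_winsys_cs layout and
 * chained-IB bookkeeping differ between driver versions, so treat this as an
 * approximation rather than the verbatim helper.
 */
static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
{
    return cs && cs->current.cdw > num_dw; /* assumes the cs->current layout */
}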
void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
                                      struct r600_resource *resource,
                                      unsigned usage)
{
    enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
    bool busy = false;

    if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
        return ctx->ws->buffer_map(resource->buf, NULL, usage);
    }

    if (!(usage & PIPE_TRANSFER_WRITE)) {
        /* Mapping for read: we only have to wait for the last write. */
        rusage = RADEON_USAGE_WRITE;
    }

    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
                                         resource->buf, rusage)) {
        if (usage & PIPE_TRANSFER_DONTBLOCK) {
            ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
            return NULL;
        } else {
            ctx->gfx.flush(ctx, 0, NULL);
            busy = true;
        }
    }
    if (radeon_emitted(ctx->dma.cs, 0) &&
        ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
                                         resource->buf, rusage)) {
        if (usage & PIPE_TRANSFER_DONTBLOCK) {
            ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
            return NULL;
        } else {
            ctx->dma.flush(ctx, 0, NULL);
            busy = true;
        }
    }

    if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) {
        if (usage & PIPE_TRANSFER_DONTBLOCK) {
            return NULL;
        } else {
            /* We will be waiting for the GPU. Wait for any offloaded
             * CS flush to complete to avoid busy-waiting in the winsys. */
            ctx->ws->cs_sync_flush(ctx->gfx.cs);
            if (ctx->dma.cs)
                ctx->ws->cs_sync_flush(ctx->dma.cs);
        }
    }

    /* Setting the CS to NULL will prevent doing checks we have done already. */
    return ctx->ws->buffer_map(resource->buf, NULL, usage);
}
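/* Hypothetical caller sketch (not verbatim driver code): a simplified buffer
 * transfer_map path that relies on r600_buffer_map_sync_with_rings to flush
 * and wait as needed, and honors PIPE_TRANSFER_DONTBLOCK by failing the map
 * instead of stalling.
 */
static void *example_buffer_map(struct r600_common_context *rctx,
                                struct r600_resource *rbuffer,
                                const struct pipe_box *box, unsigned usage)
{
    uint8_t *data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
    if (!data)
        return NULL; /* DONTBLOCK requested and the buffer is still busy */

    return data + box->x; /* the map is relative to the start of the buffer */
}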
void si_need_cs_space(struct si_context *ctx)
{
    struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
    struct radeon_winsys_cs *ce_ib = ctx->ce_ib;
    struct radeon_winsys_cs *dma = ctx->b.dma.cs;

    /* Flush the DMA IB if it's not empty. */
    if (radeon_emitted(dma, 0))
        ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

    /* There are two memory usage counters in the winsys for all buffers
     * that have been added (cs_add_buffer) and two counters in the pipe
     * driver for those that haven't been added yet.
     */
    if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
                                                   ctx->b.vram, ctx->b.gtt))) {
        ctx->b.gtt = 0;
        ctx->b.vram = 0;
        ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
        return;
    }
    ctx->b.gtt = 0;
    ctx->b.vram = 0;

    /* If the CS is sufficiently large, don't count the space needed
     * and just flush if there is not enough space left.
     */
    if (unlikely(cs->cdw > cs->max_dw - 2048 ||
                 (ce_ib && ce_ib->max_dw - ce_ib->cdw <
                  si_ce_needed_cs_space())))
        ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
}
static void r600_flush_dma_ring(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence)
{
    struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    struct radeon_winsys_cs *cs = rctx->dma.cs;
    struct radeon_saved_cs saved;
    bool check_vm =
        (rctx->screen->debug_flags & DBG_CHECK_VM) &&
        rctx->check_vm_faults;

    if (!radeon_emitted(cs, 0)) {
        if (fence)
            rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
        return;
    }

    if (check_vm)
        radeon_save_cs(rctx->ws, cs, &saved, true);

    rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
    if (fence)
        rctx->ws->fence_reference(fence, rctx->last_sdma_fence);

    if (check_vm) {
        /* Use a conservative timeout of 800 ms, after which we won't wait
         * any longer and assume the GPU is hung. */
        rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);

        rctx->check_vm_faults(rctx, &saved, RING_DMA);
        radeon_clear_saved_cs(&saved);
    }
}
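/* Assumed shape of the CS snapshot that radeon_save_cs() fills in and the
 * check_vm_faults callback consumes: a copy of the IB dwords plus the list of
 * buffers it references. The field names are an approximation inferred from
 * the usage above, not the authoritative definition.
 */
struct radeon_saved_cs {
    uint32_t *ib;                        /* copy of the command dwords */
    unsigned num_dw;                     /* number of dwords copied */
    struct radeon_bo_list_item *bo_list; /* buffers referenced by the IB */
    unsigned bo_count;
};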
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                        boolean count_draw_in, unsigned num_atomics)
{
    /* Flush the DMA IB if it's not empty. */
    if (radeon_emitted(ctx->b.dma.cs, 0))
        ctx->b.dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

    if (!radeon_cs_memory_below_limit(ctx->b.screen, ctx->b.gfx.cs,
                                      ctx->b.vram, ctx->b.gtt)) {
        ctx->b.gtt = 0;
        ctx->b.vram = 0;
        ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
        return;
    }
    /* All of it will be accounted for once the relocations are emitted. */
    ctx->b.gtt = 0;
    ctx->b.vram = 0;

    /* Check available space in CS. */
    if (count_draw_in) {
        uint64_t mask;

        /* The number of dwords all the dirty states would take. */
        mask = ctx->dirty_atoms;
        while (mask != 0)
            num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw;

        /* The upper-bound of how much space a draw command would take. */
        num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
    }

    /* Count in atomic counters: 8 dwords pre + 8 dwords post per counter,
     * plus 16 dwords post if there are any counters at all. */
    num_dw += (num_atomics * 16) + (num_atomics ? 16 : 0);

    /* Count in r600_suspend_queries. */
    num_dw += ctx->b.num_cs_dw_queries_suspend;

    /* Count in streamout_end at the end of CS. */
    if (ctx->b.streamout.begin_emitted) {
        num_dw += ctx->b.streamout.num_dw_for_end;
    }

    /* SX_MISC */
    if (ctx->b.chip_class == R600) {
        num_dw += 3;
    }

    /* Count in framebuffer cache flushes at the end of CS. */
    num_dw += R600_MAX_FLUSH_CS_DWORDS;

    /* The fence at the end of CS. */
    num_dw += 10;

    /* Flush if there's not enough space. */
    if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw, false)) {
        ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
    }
}
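/* Hypothetical call-site sketch: a draw path reserves space up front so the
 * packets emitted afterwards cannot overflow the CS. With count_draw_in set,
 * the function itself adds the dirty-atom and worst-case draw dword counts,
 * so the caller only passes dwords it will emit directly (here: an assumed
 * 5-dword index-buffer setup when user indices are present).
 */
static void example_draw_prolog(struct r600_context *rctx, bool has_user_indices)
{
    r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE, 0);
    /* ... emit atoms and the draw packet ... */
}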
bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
                                     struct pb_buffer *buf,
                                     enum radeon_bo_usage usage)
{
    if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
        return true;
    }
    if (radeon_emitted(ctx->dma.cs, 0) &&
        ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
        return true;
    }
    return false;
}
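/* Hypothetical usage sketch: together with a zero-timeout buffer_wait(),
 * this answers "would touching this buffer stall?", which callers can use to
 * decide between waiting and reallocating idle storage instead.
 */
static bool example_buffer_is_busy(struct r600_common_context *rctx,
                                   struct r600_resource *rbuffer)
{
    return r600_rings_is_buffer_referenced(rctx, rbuffer->buf,
                                           RADEON_USAGE_READWRITE) ||
           !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE);
}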
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
    uint64_t vram = 0, gtt = 0;

    if (dst) {
        if (dst->domains & RADEON_DOMAIN_VRAM)
            vram += dst->buf->size;
        else if (dst->domains & RADEON_DOMAIN_GTT)
            gtt += dst->buf->size;
    }
    if (src) {
        if (src->domains & RADEON_DOMAIN_VRAM)
            vram += src->buf->size;
        else if (src->domains & RADEON_DOMAIN_GTT)
            gtt += src->buf->size;
    }

    /* Flush the GFX IB if DMA depends on it. */
    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ((dst &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
                                           RADEON_USAGE_READWRITE)) ||
         (src &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
                                           RADEON_USAGE_WRITE))))
        ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

    /* Flush if there's not enough space, or if the memory usage per IB
     * is too large.
     */
    if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
        !ctx->ws->cs_memory_below_limit(ctx->dma.cs, vram, gtt)) {
        ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
        assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
    }

    /* If GPUVM is not supported, the CS checker needs 2 entries
     * in the buffer list per packet, which has to be done manually.
     */
    if (ctx->screen->info.has_virtual_memory) {
        if (dst)
            radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                      RADEON_USAGE_WRITE,
                                      RADEON_PRIO_SDMA_BUFFER);
        if (src)
            radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                      RADEON_USAGE_READ,
                                      RADEON_PRIO_SDMA_BUFFER);
    }
}
void r600_context_gfx_flush(void *context, unsigned flags,
                            struct pipe_fence_handle **fence)
{
    struct r600_context *ctx = context;
    struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
    struct radeon_winsys *ws = ctx->b.ws;

    if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size) &&
        (!fence || ctx->b.last_gfx_fence)) {
        if (fence)
            ws->fence_reference(fence, ctx->b.last_gfx_fence);
        if (!(flags & RADEON_FLUSH_ASYNC))
            ws->cs_sync_flush(cs);
        return;
    }

    r600_preflush_suspend_features(&ctx->b);

    /* flush the framebuffer cache */
    ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV |
                    R600_CONTEXT_FLUSH_AND_INV_CB |
                    R600_CONTEXT_FLUSH_AND_INV_DB |
                    R600_CONTEXT_FLUSH_AND_INV_CB_META |
                    R600_CONTEXT_FLUSH_AND_INV_DB_META |
                    R600_CONTEXT_WAIT_3D_IDLE |
                    R600_CONTEXT_WAIT_CP_DMA_IDLE;

    r600_flush_emit(ctx);

    /* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
    if (ctx->b.chip_class == R600) {
        radeon_set_context_reg(cs, R_028350_SX_MISC, 0);
    }

    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->b.last_gfx_fence);
    if (fence)
        ws->fence_reference(fence, ctx->b.last_gfx_fence);
    ctx->b.num_gfx_cs_flushes++;

    r600_begin_new_cs(ctx);
}
/* This is required to prevent read-after-write hazards. */
void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
    struct radeon_winsys_cs *cs = rctx->dma.cs;

    /* done at the end of DMA calls, so increment this. */
    rctx->num_dma_calls++;

    /* IBs using too little memory are limited by the IB submission overhead.
     * IBs using too much memory are limited by the kernel/TTM overhead.
     * Too long IBs create CPU-GPU pipeline bubbles and add latency.
     *
     * This heuristic makes sure that DMA requests are executed
     * very soon after the call is made and lowers memory usage.
     * It improves texture upload performance by keeping the DMA
     * engine busy while uploads are being submitted.
     */
    if (rctx->ws->cs_query_memory_usage(rctx->dma.cs) > 64 * 1024 * 1024) {
        rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
        return;
    }

    r600_need_dma_space(rctx, 1, NULL, NULL);

    if (!radeon_emitted(cs, 0)) /* empty queue */
        return;

    /* NOP waits for idle on Evergreen and later. */
    if (rctx->chip_class >= CIK)
        radeon_emit(cs, 0x00000000); /* NOP */
    else if (rctx->chip_class >= EVERGREEN)
        radeon_emit(cs, 0xf0000000); /* NOP */
    else {
        /* TODO: R600-R700 should use the FENCE packet.
         * CS checker support is required. */
    }
}
void si_context_gfx_flush(void *context, unsigned flags,
                          struct pipe_fence_handle **fence)
{
    struct si_context *ctx = context;
    struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
    struct radeon_winsys *ws = ctx->b.ws;

    if (ctx->gfx_flush_in_progress)
        return;

    ctx->gfx_flush_in_progress = true;

    if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size) &&
        (!fence || ctx->last_gfx_fence)) {
        if (fence)
            ws->fence_reference(fence, ctx->last_gfx_fence);
        if (!(flags & RADEON_FLUSH_ASYNC))
            ws->cs_sync_flush(cs);
        ctx->gfx_flush_in_progress = false;
        return;
    }

    r600_preflush_suspend_features(&ctx->b);

    ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                    SI_CONTEXT_PS_PARTIAL_FLUSH;

    /* DRM 3.1.0 doesn't flush TC for VI correctly. */
    if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1)
        ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 |
                        SI_CONTEXT_INV_VMEM_L1;

    si_emit_cache_flush(ctx, NULL);

    /* force to keep tiling flags */
    flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

    if (ctx->trace_buf)
        si_trace_emit(ctx);

    if (ctx->is_debug) {
        unsigned i;

        /* Save the IB for debug contexts. */
        free(ctx->last_ib);
        ctx->last_ib_dw_size = cs->cdw;
        ctx->last_ib = malloc(cs->cdw * 4);
        memcpy(ctx->last_ib, cs->buf, cs->cdw * 4);
        r600_resource_reference(&ctx->last_trace_buf, ctx->trace_buf);
        r600_resource_reference(&ctx->trace_buf, NULL);

        /* Save the buffer list. */
        if (ctx->last_bo_list) {
            for (i = 0; i < ctx->last_bo_count; i++)
                pb_reference(&ctx->last_bo_list[i].buf, NULL);
            free(ctx->last_bo_list);
        }
        ctx->last_bo_count = ws->cs_get_buffer_list(cs, NULL);
        ctx->last_bo_list = calloc(ctx->last_bo_count,
                                   sizeof(ctx->last_bo_list[0]));
        ws->cs_get_buffer_list(cs, ctx->last_bo_list);
    }

    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
    if (fence)
        ws->fence_reference(fence, ctx->last_gfx_fence);

    /* Check VM faults if needed. */
    if (ctx->screen->b.debug_flags & DBG_CHECK_VM)
        si_check_vm_faults(ctx);

    si_begin_new_cs(ctx);
    ctx->gfx_flush_in_progress = false;
}
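/* Assumed shape of a buffer-list entry returned by cs_get_buffer_list(),
 * inferred from the debug-save code above (each entry holds a referenced
 * pb_buffer that must be released with pb_reference(..., NULL) before the
 * list is freed). The remaining fields are an approximation.
 */
struct radeon_bo_list_item {
    struct pb_buffer *buf;   /* referenced buffer */
    uint64_t vm_address;
    uint64_t priority_usage;
};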
static void r600_flush_from_st(struct pipe_context *ctx,
                               struct pipe_fence_handle **fence,
                               unsigned flags)
{
    struct pipe_screen *screen = ctx->screen;
    struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    struct radeon_winsys *ws = rctx->ws;
    struct pipe_fence_handle *gfx_fence = NULL;
    struct pipe_fence_handle *sdma_fence = NULL;
    bool deferred_fence = false;
    unsigned rflags = RADEON_FLUSH_ASYNC;

    if (flags & PIPE_FLUSH_END_OF_FRAME)
        rflags |= RADEON_FLUSH_END_OF_FRAME;

    /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
    if (rctx->dma.cs)
        rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);

    if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
        if (fence)
            ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
        if (!(flags & PIPE_FLUSH_DEFERRED))
            ws->cs_sync_flush(rctx->gfx.cs);
    } else {
        /* Instead of flushing, create a deferred fence. Constraints:
         * - The state tracker must allow a deferred flush.
         * - The state tracker must request a fence.
         * Thread safety in fence_finish must be ensured by the state tracker.
         */
        if (flags & PIPE_FLUSH_DEFERRED && fence) {
            gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
            deferred_fence = true;
        } else {
            rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
        }
    }

    /* Both engines can signal out of order, so we need to keep both fences. */
    if (fence) {
        struct r600_multi_fence *multi_fence =
            CALLOC_STRUCT(r600_multi_fence);
        if (!multi_fence) {
            ws->fence_reference(&sdma_fence, NULL);
            ws->fence_reference(&gfx_fence, NULL);
            goto finish;
        }

        multi_fence->reference.count = 1;
        /* If both fences are NULL, fence_finish will always return true. */
        multi_fence->gfx = gfx_fence;
        multi_fence->sdma = sdma_fence;

        if (deferred_fence) {
            multi_fence->gfx_unflushed.ctx = rctx;
            multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
        }

        screen->fence_reference(screen, fence, NULL);
        *fence = (struct pipe_fence_handle *)multi_fence;
    }
finish:
    if (!(flags & PIPE_FLUSH_DEFERRED)) {
        if (rctx->dma.cs)
            ws->cs_sync_flush(rctx->dma.cs);
        ws->cs_sync_flush(rctx->gfx.cs);
    }
}
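/* Assumed layout of the multi fence created above, inferred from how its
 * fields are written: one fence per engine, plus bookkeeping for the deferred
 * (unflushed) gfx case so fence_finish can flush the right IB on demand
 * before waiting. Treat this as a sketch, not the authoritative definition.
 */
struct r600_multi_fence {
    struct pipe_reference reference;
    struct pipe_fence_handle *gfx;
    struct pipe_fence_handle *sdma;

    /* If the gfx IB has not been flushed yet, this identifies the context
     * and the IB so the flush can be triggered later. */
    struct {
        struct r600_common_context *ctx;
        unsigned ib_index;
    } gfx_unflushed;
};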
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src)
{
    uint64_t vram = ctx->dma.cs->used_vram;
    uint64_t gtt = ctx->dma.cs->used_gart;

    if (dst) {
        vram += dst->vram_usage;
        gtt += dst->gart_usage;
    }
    if (src) {
        vram += src->vram_usage;
        gtt += src->gart_usage;
    }

    /* Flush the GFX IB if DMA depends on it. */
    if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
        ((dst &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
                                           RADEON_USAGE_READWRITE)) ||
         (src &&
          ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
                                           RADEON_USAGE_WRITE))))
        ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

    /* Flush if there's not enough space, or if the memory usage per IB
     * is too large.
     *
     * IBs using too little memory are limited by the IB submission overhead.
     * IBs using too much memory are limited by the kernel/TTM overhead.
     * Too long IBs create CPU-GPU pipeline bubbles and add latency.
     *
     * This heuristic makes sure that DMA requests are executed
     * very soon after the call is made and lowers memory usage.
     * It improves texture upload performance by keeping the DMA
     * engine busy while uploads are being submitted.
     */
    num_dw++; /* for emit_wait_idle below */
    if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
        ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
        !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
        ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
        assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
    }

    /* Wait for idle if either buffer has been used in the IB before to
     * prevent read-after-write hazards.
     */
    if ((dst &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
                                          RADEON_USAGE_READWRITE)) ||
        (src &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
                                          RADEON_USAGE_WRITE)))
        r600_dma_emit_wait_idle(ctx);

    /* If GPUVM is not supported, the CS checker needs 2 entries
     * in the buffer list per packet, which has to be done manually.
     */
    if (ctx->screen->info.has_virtual_memory) {
        if (dst)
            radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                      RADEON_USAGE_WRITE,
                                      RADEON_PRIO_SDMA_BUFFER);
        if (src)
            radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                      RADEON_USAGE_READ,
                                      RADEON_PRIO_SDMA_BUFFER);
    }

    /* this function is called before all DMA calls, so increment this. */
    ctx->num_dma_calls++;
}
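/* Hypothetical caller sketch: a buffer-to-buffer SDMA copy reserves space for
 * all of its copy packets before emitting them. The 5-dword packet size and
 * the per-packet size limit used here are illustrative assumptions, not
 * hardware facts.
 */
static void example_dma_copy(struct r600_common_context *rctx,
                             struct r600_resource *rdst,
                             struct r600_resource *rsrc,
                             uint64_t size_dw)
{
    const uint64_t max_dw_per_packet = 0xFFFF; /* assumed per-packet limit */
    unsigned ncopy = DIV_ROUND_UP(size_dw, max_dw_per_packet);

    /* Reserve space; this may flush the gfx IB, flush the DMA IB, or emit a
     * wait-idle NOP, as shown above. */
    r600_need_dma_space(rctx, ncopy * 5, rdst, rsrc);

    /* ... emit ncopy copy packets ... */
}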
static void compute_emit_cs(struct r600_context *rctx,
                            const struct pipe_grid_info *info)
{
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    unsigned i;

    /* make sure the gfx ring is the only one active */
    if (radeon_emitted(rctx->b.dma.cs, 0)) {
        rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
    }

    /* Initialize all the compute-related registers.
     *
     * See evergreen_init_atom_start_compute_cs() in this file for the list
     * of registers initialized by the start_compute_cs_cmd atom.
     */
    r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

    /* emit config state */
    if (rctx->b.chip_class == EVERGREEN)
        r600_emit_atom(rctx, &rctx->config_state.atom);

    rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
    r600_flush_emit(rctx);

    /* Emit colorbuffers. */
    /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
    for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
        struct r600_surface *cb = (struct r600_surface *)rctx->framebuffer.state.cbufs[i];
        unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                                   (struct r600_resource *)cb->base.texture,
                                                   RADEON_USAGE_READWRITE,
                                                   RADEON_PRIO_SHADER_RW_BUFFER);

        radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
        radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
        radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
        radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
        radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
        radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, reloc);

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, reloc);
    }
    for (; i < 8; i++)
        radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
                                       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
    for (; i < 12; i++)
        radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
                                       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

    /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
    radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
                                   rctx->compute_cb_target_mask);

    /* Emit vertex buffer state */
    rctx->cs_vertex_buffer_state.atom.num_dw =
        12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
    r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

    /* Emit constant buffer state */
    r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

    /* Emit sampler state */
    r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

    /* Emit sampler view (texture resource) state */
    r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

    /* Emit compute shader state */
    r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

    /* Emit dispatch state and dispatch packet */
    evergreen_emit_dispatch(rctx, info);

    /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
    rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                     R600_CONTEXT_INV_VERTEX_CACHE |
                     R600_CONTEXT_INV_TEX_CACHE;
    r600_flush_emit(rctx);
    rctx->b.flags = 0;

    if (rctx->b.chip_class >= CAYMAN) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        /* DEALLOC_STATE prevents the GPU from hanging when a
         * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
         * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
         */
        radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
        radeon_emit(cs, 0);
    }

#if 0
    COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
    for (i = 0; i < cs->cdw; i++) {
        COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
    }
#endif
}
void r600_context_gfx_flush(void *context, unsigned flags,
                            struct pipe_fence_handle **fence)
{
    struct r600_context *ctx = context;
    struct radeon_cmdbuf *cs = ctx->b.gfx.cs;
    struct radeon_winsys *ws = ctx->b.ws;

    if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size))
        return;

    if (r600_check_device_reset(&ctx->b))
        return;

    r600_preflush_suspend_features(&ctx->b);

    /* flush the framebuffer cache */
    ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV |
                    R600_CONTEXT_FLUSH_AND_INV_CB |
                    R600_CONTEXT_FLUSH_AND_INV_DB |
                    R600_CONTEXT_FLUSH_AND_INV_CB_META |
                    R600_CONTEXT_FLUSH_AND_INV_DB_META |
                    R600_CONTEXT_WAIT_3D_IDLE |
                    R600_CONTEXT_WAIT_CP_DMA_IDLE;

    r600_flush_emit(ctx);

    if (ctx->trace_buf)
        eg_trace_emit(ctx);

    /* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
    if (ctx->b.chip_class == R600) {
        radeon_set_context_reg(cs, R_028350_SX_MISC, 0);
    }

    if (ctx->is_debug) {
        /* Save the IB for debug contexts. */
        radeon_clear_saved_cs(&ctx->last_gfx);
        radeon_save_cs(ws, cs, &ctx->last_gfx, true);
        r600_resource_reference(&ctx->last_trace_buf, ctx->trace_buf);
        r600_resource_reference(&ctx->trace_buf, NULL);
    }

    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->b.last_gfx_fence);
    if (fence)
        ws->fence_reference(fence, ctx->b.last_gfx_fence);

    ctx->b.num_gfx_cs_flushes++;

    if (ctx->is_debug) {
        if (!ws->fence_wait(ws, ctx->b.last_gfx_fence, 10000000)) {
            const char *fname = getenv("R600_TRACE");
            if (!fname)
                exit(-1);

            FILE *fl = fopen(fname, "w+");
            if (fl) {
                eg_dump_debug_state(&ctx->b.b, fl, 0);
                fclose(fl);
            } else
                perror(fname);

            exit(-1);
        }
    }

    r600_begin_new_cs(ctx);
}