/** * Emit function for r600_cs_shader_state atom */ void evergreen_emit_cs_shader(struct r600_context *rctx, struct r600_atom *atom) { struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; struct r600_resource *code_bo; unsigned ngpr, nstack; code_bo = shader->code_bo; va = shader->code_bo->gpu_address + state->pc; ngpr = shader->bc.ngpr; nstack = shader->bc.nstack; radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ S_0288D4_NUM_GPRS(ngpr) | S_0288D4_STACK_SIZE(nstack)); radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, code_bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); }
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; unsigned i; /* make sure that the gfx ring is only one active */ if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) { ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /* Initialize all the compute-related registers. * * See evergreen_init_atom_start_compute_cs() in this file for the list * of registers initialized by the start_compute_cs_cmd atom. */ r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd); /* emit config state */ if (ctx->b.chip_class == EVERGREEN) r600_emit_atom(ctx, &ctx->config_state.atom); ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; r600_flush_emit(ctx); /* Emit colorbuffers. */ /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, reloc); if (!ctx->keep_tiling_flags) { radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, reloc); } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ radeon_emit(cs, reloc); } if (ctx->keep_tiling_flags) { for (; i < 8 ; i++) { radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } for (; i < 12; i++) { radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); } } /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, ctx->compute_cb_target_mask); /* Emit vertex buffer state */ ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask); r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom); /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); /* Emit sampler state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom); /* Emit sampler view (texture resource) state */ r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom); /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); /* Emit dispatch state and dispatch packet */ evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout); /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */ ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; r600_flush_emit(ctx); ctx->b.flags = 0; if (ctx->b.chip_class >= CAYMAN) { cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4); /* DEALLOC_STATE prevents the GPU from hanging when a * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. */ cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0); cs->buf[cs->cdw++] = 0; } #if 0 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw); for (i = 0; i < cs->cdw; i++) { COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); } #endif }
static void evergreen_emit_direct_dispatch( struct r600_context *rctx, const uint *block_layout, const uint *grid_layout) { int i; struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_pipes; unsigned wave_divisor = (16 * num_pipes); int group_size = 1; int grid_size = 1; unsigned lds_size = shader->local_size / 4 + #if HAVE_LLVM < 0x0306 shader->active_kernel->bc.nlds_dw; #else shader->bc.nlds_dw; #endif /* Calculate group_size/grid_size */ for (i = 0; i < 3; i++) { group_size *= block_layout[i]; } for (i = 0; i < 3; i++) { grid_size *= grid_layout[i]; } /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + wave_divisor - 1) / wave_divisor; COMPUTE_DBG(rctx->screen, "Using %u pipes, " "%u wavefronts per thread block, " "allocating %u dwords lds.\n", num_pipes, num_waves, lds_size); radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size); radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ if (rctx->b.chip_class < CAYMAN) { assert(lds_size <= 8192); } else { /* Cayman appears to have a slightly smaller limit, see the * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ assert(lds_size <= 8160); } radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, lds_size | (num_waves << 14)); /* Dispatch packet */ radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); radeon_emit(cs, grid_layout[0]); radeon_emit(cs, grid_layout[1]); radeon_emit(cs, grid_layout[2]); /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ radeon_emit(cs, 1); }