static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	ctx->flags |= R600_CONTEXT_CB_FLUSH;
	r600_flush_emit(ctx);

	/* Emit cb_state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				/* Special case for textures */
				if (resources[i].do_reloc[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->flags |= R600_CONTEXT_CB_FLUSH;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;
}
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocate the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords)
	 */
	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}
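/* Illustrative note, not part of the driver: the num_waves expression above
 * is the standard integer ceiling-division idiom for the formula in the
 * comment, rounding up because a partially filled wavefront still occupies a
 * full wave slot. A minimal standalone sketch, assuming 16 threads per pipe
 * per wave as the divisor (names here are hypothetical, for illustration):
 */
#include <assert.h>

static unsigned example_waves_per_block(unsigned x, unsigned y, unsigned z,
					unsigned num_pipes)
{
	unsigned wave_divisor = 16 * num_pipes;

	/* Equivalent to ceil((x * y * z) / wave_divisor) in integer math. */
	return (x * y * z + wave_divisor - 1) / wave_divisor;
}

/* Example: on 4 pipes the divisor is 64, so an 8x8x1 block needs exactly one
 * wavefront, while 9x8x1 (72 threads) rounds up to two. */
static void example_waves_per_block_selftest(void)
{
	assert(example_waves_per_block(8, 8, 1, 4) == 1);
	assert(example_waves_per_block(9, 8, 1, 4) == 2);
}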
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;

	/* Make sure only the gfx ring is active. */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++);

	ctx->flags = 0;

	COMPUTE_DBG(ctx->screen, "shader started\n");
}
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}
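/* Illustrative note, not part of the driver: the SQ_LDS_ALLOC write above
 * packs two values into one dword: the LDS allocation in dwords in the low
 * bits and the per-block wavefront count starting at bit 14. The lds_size
 * value itself comes from the kernel's declared local memory (bytes, hence
 * the divide by 4) plus the dwords the compiler reserved. A minimal sketch of
 * that computation and packing, under the assumptions just stated (the helper
 * name and parameters are hypothetical):
 */
static unsigned example_pack_sq_lds_alloc(unsigned local_size_bytes,
					  unsigned compiler_lds_dw,
					  unsigned num_waves)
{
	/* Convert the declared local memory from bytes to dwords and add
	 * the compiler-reserved LDS dwords. */
	unsigned lds_dwords = local_size_bytes / 4 + compiler_lds_dw;

	/* Low bits: LDS size in dwords; bit 14 and up: wavefront count,
	 * matching lds_size | (num_waves << 14) in the function above. */
	return lds_dwords | (num_waves << 14);
}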
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
	int i;

	/* Make sure only the gfx ring is active. */
	if (ctx->b.rings.dma.cs) {
		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple
	 * of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			R600_CONTEXT_INV_VERTEX_CACHE |
			R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;
	struct r600_resource *onebo = NULL;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* Make sure only the gfx ring is active. */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				/* Special case for textures */
				if (resources[i].do_reloc[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;
}
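/* Illustrative usage, not from the source: every version of compute_emit_cs()
 * above takes the block and grid layouts as three-element arrays, one entry
 * per X/Y/Z dimension, matching the three SPI_COMPUTE_NUM_THREAD_* register
 * writes and the three DISPATCH_DIRECT dwords. A hypothetical caller
 * dispatching a 16x16x1 thread block over a 64x64x1 grid, assuming a valid
 * context obtained elsewhere, might look like this:
 */
static void example_dispatch(struct r600_context *ctx)
{
	const uint block_layout[3] = { 16, 16, 1 };	/* threads per block */
	const uint grid_layout[3] = { 64, 64, 1 };	/* blocks per grid */

	compute_emit_cs(ctx, block_layout, grid_layout);
}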