static void si_dma_copy_buffer(struct si_context *ctx,
			       struct pipe_resource *dst,
			       struct pipe_resource *src,
			       uint64_t dst_offset,
			       uint64_t src_offset,
			       uint64_t size)
{
	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource_va(&ctx->screen->b.b, dst);
	src_offset += r600_resource_va(&ctx->screen->b.b, src);

	/* see if we use dword or byte copy */
	if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
		size >>= 2;
		sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
		shift = 2;
		max_csize = SI_DMA_COPY_MAX_SIZE_DW;
	} else {
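/* Illustrative sketch only, not part of the excerpt above (which is cut off
 * after the dword-aligned branch): once sub_cmd/shift/max_csize have been
 * chosen, such DMA copies are typically split into max_csize-sized chunks
 * with one copy packet emitted per chunk.  The SI_DMA_PACKET() and
 * SI_DMA_PACKET_COPY names below are assumptions used for illustration.
 */
static void example_dma_copy_chunked(struct radeon_winsys_cs *cs,
				     uint64_t dst_offset, uint64_t src_offset,
				     uint64_t size, unsigned max_csize,
				     unsigned sub_cmd, unsigned shift)
{
	unsigned ncopy = (size / max_csize) + !!(size % max_csize);
	unsigned i, csize;

	for (i = 0; i < ncopy; i++) {
		csize = size < max_csize ? size : max_csize;
		/* 5-dword copy packet per chunk: header, low dst/src
		 * addresses, then the high address bits. */
		cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize);
		cs->buf[cs->cdw++] = dst_offset;
		cs->buf[cs->cdw++] = src_offset;
		cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
		cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
		dst_offset += csize << shift;
		src_offset += csize << shift;
		size -= csize;
	}
}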
bool r600_init_resource(struct r600_common_screen *rscreen,
			struct r600_resource *res,
			unsigned size, unsigned alignment,
			bool use_reusable_pool, unsigned usage)
{
	uint32_t initial_domain, domains;

	switch(usage) {
	case PIPE_USAGE_STAGING:
		/* Staging resources participate in transfers, i.e. are used
		 * for uploads and downloads from regular resources.
		 * We generate them internally for some transfers. */
		initial_domain = RADEON_DOMAIN_GTT;
		domains = RADEON_DOMAIN_GTT;
		break;
	case PIPE_USAGE_DYNAMIC:
	case PIPE_USAGE_STREAM:
		/* Default to GTT, but allow the memory manager to move it to VRAM. */
		initial_domain = RADEON_DOMAIN_GTT;
		domains = RADEON_DOMAIN_GTT | RADEON_DOMAIN_VRAM;
		break;
	case PIPE_USAGE_DEFAULT:
	case PIPE_USAGE_STATIC:
	case PIPE_USAGE_IMMUTABLE:
	default:
		/* Don't list GTT here, because the memory manager would put some
		 * resources to GTT no matter what the initial domain is.
		 * Not listing GTT in the domains improves performance a lot. */
		initial_domain = RADEON_DOMAIN_VRAM;
		domains = RADEON_DOMAIN_VRAM;
		break;
	}

	res->buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
					      use_reusable_pool, initial_domain);
	if (!res->buf) {
		return false;
	}

	res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf);
	res->domains = domains;
	util_range_set_empty(&res->valid_buffer_range);

	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
		fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n",
			r600_resource_va(&rscreen->b, &res->b.b),
			r600_resource_va(&rscreen->b, &res->b.b) + res->buf->size,
			res->buf->size);
	}
	return true;
}
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->screen, &shader->shader_code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(shader->bc.ngpr)
			| S_0288D4_STACK_SIZE(shader->bc.nstack));
	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
	r600_write_value(cs, r600_context_bo_reloc(rctx, shader->shader_code_bo,
							RADEON_USAGE_READ));

	rctx->flags |= R600_CONTEXT_SHADERCONST_FLUSH;
}
bool r600_init_resource(struct r600_common_screen *rscreen,
			struct r600_resource *res,
			unsigned size, unsigned alignment,
			bool use_reusable_pool)
{
	struct r600_texture *rtex = (struct r600_texture*)res;

	switch (res->b.b.usage) {
	case PIPE_USAGE_STAGING:
	case PIPE_USAGE_DYNAMIC:
	case PIPE_USAGE_STREAM:
		/* Transfers are likely to occur more often with these resources. */
		res->domains = RADEON_DOMAIN_GTT;
		break;
	case PIPE_USAGE_DEFAULT:
	case PIPE_USAGE_IMMUTABLE:
	default:
		/* Not listing GTT here improves performance in some apps. */
		res->domains = RADEON_DOMAIN_VRAM;
		break;
	}

	/* Tiled textures are unmappable. Always put them in VRAM. */
	if (res->b.b.target != PIPE_BUFFER &&
	    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
		res->domains = RADEON_DOMAIN_VRAM;
	}

	/* Allocate the resource. */
	res->buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
					      use_reusable_pool,
					      res->domains);
	if (!res->buf) {
		return false;
	}

	res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf);
	util_range_set_empty(&res->valid_buffer_range);

	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
		fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n",
			r600_resource_va(&rscreen->b, &res->b.b),
			r600_resource_va(&rscreen->b, &res->b.b) + res->buf->size,
			res->buf->size);
	}
	return true;
}
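/* Illustrative caller sketch (an assumption, not taken from the excerpt
 * above): how a buffer resource is typically created on top of
 * r600_init_resource().  example_buffer_create() and its struct setup are
 * hypothetical and omit vtbl/destroy hookup; CALLOC_STRUCT, FREE,
 * pipe_reference_init and util_range_init are standard gallium utilities.
 */
static struct pipe_resource *
example_buffer_create(struct pipe_screen *screen,
		      const struct pipe_resource *templ,
		      unsigned alignment)
{
	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
	struct r600_resource *rbuffer = CALLOC_STRUCT(r600_resource);

	if (!rbuffer)
		return NULL;

	/* Copy the template and take the initial reference. */
	rbuffer->b.b = *templ;
	pipe_reference_init(&rbuffer->b.b.reference, 1);
	rbuffer->b.b.screen = screen;
	util_range_init(&rbuffer->valid_buffer_range);

	if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, true)) {
		FREE(rbuffer);
		return NULL;
	}
	return &rbuffer->b.b;
}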
void evergreen_dma_copy(struct r600_context *rctx,
		struct pipe_resource *dst,
		struct pipe_resource *src,
		uint64_t dst_offset,
		uint64_t src_offset,
		uint64_t size)
{
	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
	unsigned i, ncopy, csize, sub_cmd, shift;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* flush the gfx ring so that the DMA ring is the only one active */
	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);

	dst_offset += r600_resource_va(&rctx->screen->screen, dst);
	src_offset += r600_resource_va(&rctx->screen->screen, src);

	/* see if we use dword or byte copy */
	if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
		size >>= 2;
		sub_cmd = 0x00;
		shift = 2;
	} else {
void si_trace_emit(struct si_context *sctx)
{
	struct si_screen *sscreen = sctx->screen;
	struct radeon_winsys_cs *cs = sctx->cs;
	uint64_t va;

	va = r600_resource_va(&sscreen->screen, (void*)sscreen->b.trace_bo);
	r600_context_bo_reloc(sctx, sscreen->b.trace_bo, RADEON_USAGE_READWRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0);
	cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
				PKT3_WRITE_DATA_WR_CONFIRM |
				PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = cs->cdw;
	cs->buf[cs->cdw++] = sscreen->b.cs_count;
}
static void si_pipe_shader_es(struct pipe_context *ctx, struct si_pipe_shader *shader)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_pm4_state *pm4;
	unsigned num_sgprs, num_user_sgprs;
	unsigned vgpr_comp_cnt;
	uint64_t va;

	si_pm4_delete_state(sctx, es, shader->pm4);
	pm4 = shader->pm4 = si_pm4_alloc_state(sctx);

	if (pm4 == NULL)
		return;

	va = r600_resource_va(ctx->screen, (void *)shader->bo);
	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ);

	vgpr_comp_cnt = shader->shader.uses_instanceid ? 3 : 0;

	num_user_sgprs = SI_VS_NUM_USER_SGPR;
	num_sgprs = shader->num_sgprs;
	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
	if ((num_user_sgprs + 1) > num_sgprs) {
		/* Last 2 reserved SGPRs are used for VCC */
		num_sgprs = num_user_sgprs + 1 + 2;
	}
	assert(num_sgprs <= 104);

	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
		       S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
		       S_00B328_SGPRS((num_sgprs - 1) / 8) |
		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt));
	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
		       S_00B32C_USER_SGPR(num_user_sgprs));

	sctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
}
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
					      kernel->code_bo, RADEON_USAGE_READ));
}
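/* Illustrative sketch (an assumption, not from the excerpt above): an emit
 * callback like evergreen_emit_cs_shader() is normally wired into an
 * r600_atom so the driver can flag it dirty and re-emit it on the next
 * dispatch.  The field names and the num_dw value below are hypothetical.
 */
static void example_init_cs_shader_atom(struct r600_context *rctx)
{
	rctx->cs_shader_state.atom.emit = evergreen_emit_cs_shader;
	rctx->cs_shader_state.atom.num_dw = 16;	/* worst-case dword count */
	rctx->cs_shader_state.atom.dirty = false;
}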
	for (i = 0; i < rctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		t[i]->stride_in_dw = stride_in_dw[i];

		if (rctx->chip_class >= SI) {
			/* SI binds streamout buffers as shader resources.
			 * VGT only counts primitives and tells the shader
			 * through SGPRs what to do. */
			r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
			radeon_emit(cs, (t[i]->b.buffer_offset +
					 t[i]->b.buffer_size) >> 2);	/* BUFFER_SIZE (in DW) */
			radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */
		} else {
			uint64_t va = r600_resource_va(rctx->b.screen,
						       (void*)t[i]->b.buffer);

			update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);

			r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3);
			radeon_emit(cs, (t[i]->b.buffer_offset +
					 t[i]->b.buffer_size) >> 2);	/* BUFFER_SIZE (in DW) */
			radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */
			radeon_emit(cs, va >> 8);			/* BUFFER_BASE */

			r600_emit_reloc(rctx, &rctx->rings.gfx,
					r600_resource(t[i]->b.buffer),
					RADEON_USAGE_WRITE);

			/* R7xx requires this packet after updating BUFFER_BASE.
			 * Without this, R7xx locks up. */
			if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) {
static void si_pipe_shader_gs(struct pipe_context *ctx, struct si_pipe_shader *shader)
{
	struct si_context *sctx = (struct si_context *)ctx;
	unsigned gs_vert_itemsize = shader->shader.noutput * (16 >> 2);
	unsigned gs_max_vert_out = shader->shader.gs_max_out_vertices;
	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
	unsigned cut_mode;
	struct si_pm4_state *pm4;
	unsigned num_sgprs, num_user_sgprs;
	uint64_t va;

	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
	assert(gsvs_itemsize < (1 << 15));

	si_pm4_delete_state(sctx, gs, shader->pm4);
	pm4 = shader->pm4 = si_pm4_alloc_state(sctx);

	if (pm4 == NULL)
		return;

	if (gs_max_vert_out <= 128) {
		cut_mode = V_028A40_GS_CUT_128;
	} else if (gs_max_vert_out <= 256) {
		cut_mode = V_028A40_GS_CUT_256;
	} else if (gs_max_vert_out <= 512) {
		cut_mode = V_028A40_GS_CUT_512;
	} else {
		assert(gs_max_vert_out <= 1024);
		cut_mode = V_028A40_GS_CUT_1024;
	}

	si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
		       S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
		       S_028A40_CUT_MODE(cut_mode) |
		       S_028A40_ES_WRITE_OPTIMIZE(1) |
		       S_028A40_GS_WRITE_OPTIMIZE(1));

	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize);
	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);

	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
		       shader->shader.nparam * (16 >> 2));
	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);

	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);

	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);

	va = r600_resource_va(ctx->screen, (void *)shader->bo);
	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ);
	si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);

	num_user_sgprs = SI_GS_NUM_USER_SGPR;
	num_sgprs = shader->num_sgprs;
	/* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
	if ((num_user_sgprs + 2) > num_sgprs) {
		/* Last 2 reserved SGPRs are used for VCC */
		num_sgprs = num_user_sgprs + 2 + 2;
	}
	assert(num_sgprs <= 104);

	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
		       S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
		       S_00B228_SGPRS((num_sgprs - 1) / 8));
	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
		       S_00B22C_USER_SGPR(num_user_sgprs));

	sctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
}
static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_pm4_state *pm4;
	unsigned i, exports_ps, spi_ps_in_control, db_shader_control;
	unsigned num_sgprs, num_user_sgprs;
	unsigned spi_baryc_cntl = 0, spi_ps_input_ena, spi_shader_z_format;
	uint64_t va;

	si_pm4_delete_state(sctx, ps, shader->pm4);
	pm4 = shader->pm4 = si_pm4_alloc_state(sctx);

	if (pm4 == NULL)
		return;

	db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
			    S_02880C_ALPHA_TO_MASK_DISABLE(sctx->fb_cb0_is_integer);

	for (i = 0; i < shader->shader.ninput; i++) {
		switch (shader->shader.input[i].name) {
		case TGSI_SEMANTIC_POSITION:
			if (shader->shader.input[i].centroid) {
				/* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
				 * Possible values:
				 * 0 -> Position = pixel center (default)
				 * 1 -> Position = pixel centroid
				 * 2 -> Position = iterated sample number XXX:
				 *                 What does this mean?
				 */
				spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(1);
			}
			/* Fall through */
		case TGSI_SEMANTIC_FACE:
			continue;
		}
	}

	for (i = 0; i < shader->shader.noutput; i++) {
		if (shader->shader.output[i].name == TGSI_SEMANTIC_POSITION)
			db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
		if (shader->shader.output[i].name == TGSI_SEMANTIC_STENCIL)
			db_shader_control |= S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
	}
	if (shader->shader.uses_kill || shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
		db_shader_control |= S_02880C_KILL_ENABLE(1);

	exports_ps = 0;
	for (i = 0; i < shader->shader.noutput; i++) {
		if (shader->shader.output[i].name == TGSI_SEMANTIC_POSITION ||
		    shader->shader.output[i].name == TGSI_SEMANTIC_STENCIL)
			exports_ps |= 1;
	}
	if (!exports_ps) {
		/* always at least export 1 component per pixel */
		exports_ps = 2;
	}

	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->shader.nparam) |
			    S_0286D8_BC_OPTIMIZE_DISABLE(1);

	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
	spi_ps_input_ena = shader->spi_ps_input_ena;
	/* we need to enable at least one of them, otherwise we hang the GPU */
	assert(G_0286CC_PERSP_SAMPLE_ENA(spi_ps_input_ena) ||
	       G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) ||
	       G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) ||
	       G_0286CC_PERSP_PULL_MODEL_ENA(spi_ps_input_ena) ||
	       G_0286CC_LINEAR_SAMPLE_ENA(spi_ps_input_ena) ||
	       G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena) ||
	       G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena) ||
	       G_0286CC_LINE_STIPPLE_TEX_ENA(spi_ps_input_ena));

	si_pm4_set_reg(pm4, R_0286CC_SPI_PS_INPUT_ENA, spi_ps_input_ena);
	si_pm4_set_reg(pm4, R_0286D0_SPI_PS_INPUT_ADDR, spi_ps_input_ena);
	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);

	if (G_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(db_shader_control))
		spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
	else if (G_02880C_Z_EXPORT_ENABLE(db_shader_control))
		spi_shader_z_format = V_028710_SPI_SHADER_32_R;
	else
		spi_shader_z_format = 0;

	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, spi_shader_z_format);
	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
		       shader->spi_shader_col_format);
	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);

	va = r600_resource_va(ctx->screen, (void *)shader->bo);
	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ);
	si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
	si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);

	num_user_sgprs = SI_PS_NUM_USER_SGPR;
	num_sgprs = shader->num_sgprs;
	/* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
	if ((num_user_sgprs + 1) > num_sgprs) {
		/* Last 2 reserved SGPRs are used for VCC */
		num_sgprs = num_user_sgprs + 1 + 2;
	}
	assert(num_sgprs <= 104);

	si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
		       S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
		       S_00B028_SGPRS((num_sgprs - 1) / 8));
	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
		       S_00B02C_USER_SGPR(num_user_sgprs));

	si_pm4_set_reg(pm4, R_02880C_DB_SHADER_CONTROL, db_shader_control);

	shader->cb0_is_integer = sctx->fb_cb0_is_integer;
	shader->sprite_coord_enable = sctx->sprite_coord_enable;
	sctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
}
static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_pm4_state *pm4;
	unsigned num_sgprs, num_user_sgprs;
	unsigned nparams, i, vgpr_comp_cnt;
	uint64_t va;

	si_pm4_delete_state(sctx, vs, shader->pm4);
	pm4 = shader->pm4 = si_pm4_alloc_state(sctx);

	if (pm4 == NULL)
		return;

	va = r600_resource_va(ctx->screen, (void *)shader->bo);
	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ);

	vgpr_comp_cnt = shader->shader.uses_instanceid ? 3 : 0;

	num_user_sgprs = SI_VS_NUM_USER_SGPR;
	num_sgprs = shader->num_sgprs;
	if (num_user_sgprs > num_sgprs) {
		/* Last 2 reserved SGPRs are used for VCC */
		num_sgprs = num_user_sgprs + 2;
	}
	assert(num_sgprs <= 104);

	/* Certain attributes (position, psize, etc.) don't count as params.
	 * VS is required to export at least one param and r600_shader_from_tgsi()
	 * takes care of adding a dummy export.
	 */
	for (nparams = 0, i = 0 ; i < shader->shader.noutput; i++) {
		switch (shader->shader.output[i].name) {
		case TGSI_SEMANTIC_CLIPVERTEX:
		case TGSI_SEMANTIC_POSITION:
		case TGSI_SEMANTIC_PSIZE:
			break;
		default:
			nparams++;
		}
	}
	if (nparams < 1)
		nparams = 1;

	si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
		       S_0286C4_VS_EXPORT_COUNT(nparams - 1));

	si_pm4_set_reg(pm4, R_02870C_SPI_SHADER_POS_FORMAT,
		       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
		       S_02870C_POS1_EXPORT_FORMAT(shader->shader.nr_pos_exports > 1 ?
						   V_02870C_SPI_SHADER_4COMP :
						   V_02870C_SPI_SHADER_NONE) |
		       S_02870C_POS2_EXPORT_FORMAT(shader->shader.nr_pos_exports > 2 ?
						   V_02870C_SPI_SHADER_4COMP :
						   V_02870C_SPI_SHADER_NONE) |
		       S_02870C_POS3_EXPORT_FORMAT(shader->shader.nr_pos_exports > 3 ?
						   V_02870C_SPI_SHADER_4COMP :
						   V_02870C_SPI_SHADER_NONE));

	si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
	si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
	si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
		       S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt));
	si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
		       S_00B12C_USER_SGPR(num_user_sgprs) |
		       S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
		       S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs));

	sctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
}
bool r600_init_resource(struct r600_common_screen *rscreen,
			struct r600_resource *res,
			unsigned size, unsigned alignment,
			bool use_reusable_pool)
{
	struct r600_texture *rtex = (struct r600_texture*)res;
	struct pb_buffer *old_buf, *new_buf;

	switch (res->b.b.usage) {
	case PIPE_USAGE_STAGING:
	case PIPE_USAGE_DYNAMIC:
	case PIPE_USAGE_STREAM:
		/* Transfers are likely to occur more often with these resources. */
		res->domains = RADEON_DOMAIN_GTT;
		break;
	case PIPE_USAGE_DEFAULT:
	case PIPE_USAGE_IMMUTABLE:
	default:
		/* Not listing GTT here improves performance in some apps. */
		res->domains = RADEON_DOMAIN_VRAM;
		break;
	}

	/* Use GTT for all persistent mappings, because they are
	 * always cached and coherent. */
	if (res->b.b.target == PIPE_BUFFER &&
	    res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
			      PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
		res->domains = RADEON_DOMAIN_GTT;
	}

	/* Tiled textures are unmappable. Always put them in VRAM. */
	if (res->b.b.target != PIPE_BUFFER &&
	    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
		res->domains = RADEON_DOMAIN_VRAM;
	}

	/* Allocate a new resource. */
	new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
					     use_reusable_pool,
					     res->domains);
	if (!new_buf) {
		return false;
	}

	/* Replace the pointer such that if res->buf wasn't NULL, it won't be
	 * NULL. This should prevent crashes with multiple contexts using
	 * the same buffer where one of the contexts invalidates it while
	 * the others are using it. */
	old_buf = res->buf;
	res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */
	res->buf = new_buf; /* should be atomic */
	pb_reference(&old_buf, NULL);

	util_range_set_empty(&res->valid_buffer_range);

	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
		fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n",
			r600_resource_va(&rscreen->b, &res->b.b),
			r600_resource_va(&rscreen->b, &res->b.b) + res->buf->size,
			res->buf->size);
	}
	return true;
}
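/* Illustrative sketch (an assumption, not from the excerpt above): the
 * old_buf/new_buf swap is what makes it possible to "invalidate" a buffer by
 * reallocating its storage in place, e.g. on a whole-resource discard.
 * example_invalidate_buffer() is hypothetical; a real caller would also have
 * to rebind the new cs_buf wherever the buffer is currently bound.
 */
static void example_invalidate_buffer(struct r600_common_screen *rscreen,
				      struct r600_resource *rbuffer,
				      unsigned alignment)
{
	/* Reallocate storage within the same pipe_resource.  Because
	 * r600_init_resource() swaps res->buf/res->cs_buf before releasing
	 * the old buffer, concurrent readers of those pointers see either
	 * the old or the new buffer, never NULL. */
	r600_init_resource(rscreen, rbuffer, rbuffer->b.b.width0,
			   alignment, true);
}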