void Mask_SetAll(Buffer_t *buffer, event_t *event)
{
    int index = EVENT_INDEX(buffer, event);

    buffer->Masks[index] = ALL_BITS_SET;
}
void Mask_Set(Buffer_t *buffer, event_t *event, int mask_id)
{
    int index = EVENT_INDEX(buffer, event);

    buffer->Masks[index] |= mask_id;
}
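/* A minimal usage sketch (hypothetical caller; the Buffer_t/event_t setup is
 * elided and the mask bit values are illustrative, not defined anywhere in
 * this file):
 *
 *     Mask_Set(buffer, event, 0x1);    // OR in bit 0 for this event
 *     Mask_Set(buffer, event, 0x4);    // OR in bit 2; bit 0 stays set
 *     Mask_SetAll(buffer, event);      // saturate: every bit set
 *
 * Note that Mask_Set ORs mask_id into the existing mask, so callers pass a
 * bit value (1 << n), not a bit index n.
 */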
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
    struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
    int num_threads;
    int num_stack_entries;

    /* since all required registers are initialized in the
     * start_compute_cs_cmd atom, we can EMIT_EARLY here.
     */
    r600_init_command_buffer(cb, 256);
    cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

    /* This must be first. */
    r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
    r600_store_value(cb, 0x80000000);
    r600_store_value(cb, 0x80000000);

    /* We're setting config registers here. */
    r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
    r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

    switch (ctx->b.family) {
    case CHIP_CEDAR:
    default:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    case CHIP_REDWOOD:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    case CHIP_JUNIPER:
        num_threads = 128;
        num_stack_entries = 512;
        break;
    case CHIP_CYPRESS:
    case CHIP_HEMLOCK:
        num_threads = 128;
        num_stack_entries = 512;
        break;
    case CHIP_PALM:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    case CHIP_SUMO:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    case CHIP_SUMO2:
        num_threads = 128;
        num_stack_entries = 512;
        break;
    case CHIP_BARTS:
        num_threads = 128;
        num_stack_entries = 512;
        break;
    case CHIP_TURKS:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    case CHIP_CAICOS:
        num_threads = 128;
        num_stack_entries = 256;
        break;
    }

    /* Config Registers */
    if (ctx->b.chip_class < CAYMAN)
        evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
                                   ctx->screen->b.info.drm_minor);
    else
        cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
                                ctx->screen->b.info.drm_minor);

    /* The primitive type always needs to be POINTLIST for compute. */
    r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
                          V_008958_DI_PT_POINTLIST);

    if (ctx->b.chip_class < CAYMAN) {
        /* These registers control which simds can be used by each stage.
         * The default for these registers is 0xffffffff, which means
         * all simds are available for each stage. It's possible we may
         * want to play around with these in the future, but for now
         * the default value is fine.
         *
         * R_008E20_SQ_STATIC_THREAD_MGMT1
         * R_008E24_SQ_STATIC_THREAD_MGMT2
         * R_008E28_SQ_STATIC_THREAD_MGMT3
         */

        /* XXX: We may need to adjust the thread and stack resource
         * values for 3D/compute interop */
        r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

        /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
         * Set the number of threads used by the PS/VS/GS/ES stage to
         * 0.
         */
        r600_store_value(cb, 0);

        /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
         * Set the number of threads used by the CS (aka LS) stage to
         * the maximum number of threads and set the number of threads
         * for the HS stage to 0.
         */
        r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

        /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
         * Set the Control Flow stack entries to 0 for PS/VS stages */
        r600_store_value(cb, 0);

        /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
         * Set the Control Flow stack entries to 0 for GS/ES stages */
        r600_store_value(cb, 0);

        /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
         * Set the Control Flow stack entries to 0 for the HS stage, and
         * set it to the maximum value for the CS (aka LS) stage. */
        r600_store_value(cb, S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
    }

    /* Give the compute shader all the available LDS space.
     * NOTE: This only sets the maximum number of dwords that a compute
     * shader can allocate. When a shader is executed, we still need to
     * allocate the appropriate amount of LDS dwords using the
     * CM_R_0288E8_SQ_LDS_ALLOC register.
     */
    if (ctx->b.chip_class < CAYMAN) {
        r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
                              S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
    } else {
        r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
                               S_0286FC_NUM_PS_LDS(0) |
                               S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
    }

    /* Context Registers */
    if (ctx->b.chip_class < CAYMAN) {
        /* workaround for hw issues with dyn gpr - must set all limits
         * to 240 instead of 0, 0x1e == 240 / 8
         */
        r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
                               S_028838_PS_GPRS(0x1e) |
                               S_028838_VS_GPRS(0x1e) |
                               S_028838_GS_GPRS(0x1e) |
                               S_028838_ES_GPRS(0x1e) |
                               S_028838_HS_GPRS(0x1e) |
                               S_028838_LS_GPRS(0x1e));
    }

    /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
    r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
                           S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

    r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

    r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
                           S_0286E8_TID_IN_GROUP_ENA |
                           S_0286E8_TGID_ENA |
                           S_0286E8_DISABLE_INDEX_PACK);

    /* The LOOP_CONST registers are an optimization for loops that allows
     * you to store the initial counter, increment value, and maximum
     * counter value in a register so that hardware can calculate the
     * correct number of iterations for the loop, so that you don't need
     * to have the loop counter in your shader code. We don't currently use
     * this optimization, so we must keep track of the counter in the
     * shader and use a break instruction to exit loops. However, the
     * hardware will still use this register to determine when to exit a
     * loop, so we need to initialize the counter to 0, set the increment
     * value to 1 and the maximum counter value to 4095 (0xfff), which
     * is the maximum value allowed. This gives us a maximum of 4096
     * iterations for our loops, but hopefully our break instruction will
     * execute some time before the 4096th iteration.
     */
    eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
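/* Worked packing for the 0x1000FFF loop constant above, assuming a
 * SQ_LOOP_CONST field layout of increment in bits 31:24, initial value in
 * bits 23:12, and max count in bits 11:0. The exact layout is an inference
 * from the comment and the value, not taken from register documentation:
 *
 *     increment = 1    -> 0x01  << 24 = 0x01000000
 *     initial   = 0    -> 0x000 << 12 = 0x00000000
 *     max count = 4095 -> 0xFFF << 0  = 0x00000FFF
 *                                       ----------
 *                                       0x01000FFF  (== 0x1000FFF)
 */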
void r600_flush_emit(struct r600_context *rctx)
{
    struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    unsigned cp_coher_cntl = 0;
    unsigned wait_until = 0;

    if (!rctx->b.flags) {
        return;
    }

    if (rctx->b.flags & R600_CONTEXT_WAIT_3D_IDLE) {
        wait_until |= S_008040_WAIT_3D_IDLE(1);
    }
    if (rctx->b.flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) {
        wait_until |= S_008040_WAIT_CP_DMA_IDLE(1);
    }

    if (wait_until) {
        /* Use of WAIT_UNTIL is deprecated on Cayman+ */
        if (rctx->b.family >= CHIP_CAYMAN) {
            /* emit a PS partial flush on Cayman/TN */
            rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
        }
    }

    if (rctx->b.flags & R600_CONTEXT_PS_PARTIAL_FLUSH) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
    }

    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0);
    }

    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0);

        /* Set FULL_CACHE_ENA for DB META flushes on r7xx and later.
         *
         * This hack predates use of FLUSH_AND_INV_DB_META, so it's
         * unclear whether it's still needed or even whether it has
         * any effect.
         */
        cp_coher_cntl |= S_0085F0_FULL_CACHE_ENA(1);
    }

    if (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV ||
        (rctx->b.chip_class == R600 &&
         rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
    }

    if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) {
        /* Direct constant addressing uses the shader cache.
         * Indirect constant addressing uses the vertex cache. */
        cp_coher_cntl |= S_0085F0_SH_ACTION_ENA(1) |
                         (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
                                                 : S_0085F0_TC_ACTION_ENA(1));
    }
    if (rctx->b.flags & R600_CONTEXT_INV_VERTEX_CACHE) {
        cp_coher_cntl |= rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
                                                : S_0085F0_TC_ACTION_ENA(1);
    }
    if (rctx->b.flags & R600_CONTEXT_INV_TEX_CACHE) {
        /* Textures use the texture cache.
         * Texture buffer objects use the vertex cache. */
        cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) |
                         (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : 0);
    }

    /* Don't use the DB CP COHER logic on r6xx.
     * There are hw bugs.
     */
    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB)) {
        cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                         S_0085F0_DB_DEST_BASE_ENA(1) |
                         S_0085F0_SMX_ACTION_ENA(1);
    }

    /* Don't use the CB CP COHER logic on r6xx.
     * There are hw bugs.
     */
    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB)) {
        cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
                         S_0085F0_CB0_DEST_BASE_ENA(1) |
                         S_0085F0_CB1_DEST_BASE_ENA(1) |
                         S_0085F0_CB2_DEST_BASE_ENA(1) |
                         S_0085F0_CB3_DEST_BASE_ENA(1) |
                         S_0085F0_CB4_DEST_BASE_ENA(1) |
                         S_0085F0_CB5_DEST_BASE_ENA(1) |
                         S_0085F0_CB6_DEST_BASE_ENA(1) |
                         S_0085F0_CB7_DEST_BASE_ENA(1) |
                         S_0085F0_SMX_ACTION_ENA(1);
        if (rctx->b.chip_class >= EVERGREEN)
            cp_coher_cntl |= S_0085F0_CB8_DEST_BASE_ENA(1) |
                             S_0085F0_CB9_DEST_BASE_ENA(1) |
                             S_0085F0_CB10_DEST_BASE_ENA(1) |
                             S_0085F0_CB11_DEST_BASE_ENA(1);
    }

    if (rctx->b.chip_class >= R700 &&
        rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH) {
        cp_coher_cntl |= S_0085F0_SO0_DEST_BASE_ENA(1) |
                         S_0085F0_SO1_DEST_BASE_ENA(1) |
                         S_0085F0_SO2_DEST_BASE_ENA(1) |
                         S_0085F0_SO3_DEST_BASE_ENA(1) |
                         S_0085F0_SMX_ACTION_ENA(1);
    }

    /* Workaround for buggy flushing on some R6xx chipsets. */
    if ((rctx->b.flags & (R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_STREAMOUT_FLUSH)) &&
        (rctx->b.family == CHIP_RV670 ||
         rctx->b.family == CHIP_RS780 ||
         rctx->b.family == CHIP_RS880)) {
        cp_coher_cntl |= S_0085F0_CB1_DEST_BASE_ENA(1) |
                         S_0085F0_DEST_BASE_0_ENA(1);
    }

    if (cp_coher_cntl) {
        cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
        cs->buf[cs->cdw++] = cp_coher_cntl; /* CP_COHER_CNTL */
        cs->buf[cs->cdw++] = 0xffffffff;    /* CP_COHER_SIZE */
        cs->buf[cs->cdw++] = 0;             /* CP_COHER_BASE */
        cs->buf[cs->cdw++] = 0x0000000A;    /* POLL_INTERVAL */
    }

    if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) |
                        EVENT_INDEX(0));
    } else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) |
                        EVENT_INDEX(0));
    }

    if (wait_until) {
        /* Use of WAIT_UNTIL is deprecated on Cayman+ */
        if (rctx->b.family < CHIP_CAYMAN) {
            /* wait for things to settle */
            radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, wait_until);
        }
    }

    /* everything is properly flushed */
    rctx->b.flags = 0;
}
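/* Minimal calling pattern for r600_flush_emit() (compute_emit_cs() below is
 * a real caller): set the relevant R600_CONTEXT_* bits in rctx->b.flags,
 * then call the function, which turns the flags into packets and clears
 * them:
 *
 *     rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE |
 *                      R600_CONTEXT_FLUSH_AND_INV;
 *     r600_flush_emit(rctx);
 */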
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
                            const uint *grid_layout)
{
    struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
    int i;

    /* make sure that the gfx ring is the only one active */
    if (ctx->b.rings.dma.cs) {
        ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
    }

    /* Initialize all the compute-related registers.
     *
     * See evergreen_init_atom_start_compute_cs() in this file for the list
     * of registers initialized by the start_compute_cs_cmd atom.
     */
    r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

    ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
    r600_flush_emit(ctx);

    /* Emit colorbuffers. */
    /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
    for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
        struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
        unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
                                               (struct r600_resource*)cb->base.texture,
                                               RADEON_USAGE_READWRITE);

        r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
        radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
        radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
        radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
        radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
        radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
        radeon_emit(cs, reloc);

        if (!ctx->keep_tiling_flags) {
            radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
            radeon_emit(cs, reloc);
        }

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
        radeon_emit(cs, reloc);
    }
    if (ctx->keep_tiling_flags) {
        for (; i < 8; i++) {
            r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
                                           S_028C70_FORMAT(V_028C70_COLOR_INVALID));
        }
        for (; i < 12; i++) {
            r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
                                           S_028C70_FORMAT(V_028C70_COLOR_INVALID));
        }
    }

    /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
    r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
                                   ctx->compute_cb_target_mask);

    /* Emit vertex buffer state */
    ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
    r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

    /* Emit constant buffer state */
    r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

    /* Emit compute shader state */
    r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

    /* Emit dispatch state and dispatch packet */
    evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

    /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
    ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                    R600_CONTEXT_INV_VERTEX_CACHE |
                    R600_CONTEXT_INV_TEX_CACHE;
    r600_flush_emit(ctx);
    ctx->b.flags = 0;

    if (ctx->b.chip_class >= CAYMAN) {
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
        /* DEALLOC_STATE prevents the GPU from hanging when a
         * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
         * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
         */
        cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
        cs->buf[cs->cdw++] = 0;
    }

#if 0
    COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
    for (i = 0; i < cs->cdw; i++) {
        COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
    }
#endif
}
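/* The "#if 0" block at the end of compute_emit_cs() is a debug aid: flip it
 * to "#if 1" to dump the finished command stream dword by dword through
 * COMPUTE_DBG (whose output is presumably gated by the screen's compute
 * debug flag). */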