static void evergreen_emit_direct_dispatch( struct r600_context *rctx, const uint *block_layout, const uint *grid_layout) { int i; struct radeon_winsys_cs *cs = rctx->cs; unsigned num_waves; unsigned num_pipes = rctx->screen->info.r600_max_pipes; unsigned wave_divisor = (16 * num_pipes); int group_size = 1; int grid_size = 1; /* XXX: Enable lds and get size from cs_shader_state */ unsigned lds_size = 0; /* Calculate group_size/grid_size */ for (i = 0; i < 3; i++) { group_size *= block_layout[i]; } for (i = 0; i < 3; i++) { grid_size *= grid_layout[i]; } /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + wave_divisor - 1) / wave_divisor; COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n", num_pipes, num_waves); /* XXX: Partition the LDS between PS/CS. By default half (4096 dwords * on Evergreen) oes to Pixel Shaders and half goes to Compute Shaders. * We may need to allocat the entire LDS space for Compute Shaders. * * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords) * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords) */ r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size); r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC, lds_size | (num_waves << 14)); /* Dispatch packet */ r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); r600_write_value(cs, grid_layout[0]); r600_write_value(cs, grid_layout[1]); r600_write_value(cs, grid_layout[2]); /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ r600_write_value(cs, 1); }
static void evergreen_emit_direct_dispatch( struct r600_context *rctx, const uint *block_layout, const uint *grid_layout) { int i; struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_pipes; unsigned wave_divisor = (16 * num_pipes); int group_size = 1; int grid_size = 1; unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw; /* Calculate group_size/grid_size */ for (i = 0; i < 3; i++) { group_size *= block_layout[i]; } for (i = 0; i < 3; i++) { grid_size *= grid_layout[i]; } /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + wave_divisor - 1) / wave_divisor; COMPUTE_DBG(rctx->screen, "Using %u pipes, " "%u wavefronts per thread block, " "allocating %u dwords lds.\n", num_pipes, num_waves, lds_size); r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size); r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ if (rctx->b.chip_class < CAYMAN) { assert(lds_size <= 8192); } else { /* Cayman appears to have a slightly smaller limit, see the * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ assert(lds_size <= 8160); } r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC, lds_size | (num_waves << 14)); /* Dispatch packet */ radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); radeon_emit(cs, grid_layout[0]); radeon_emit(cs, grid_layout[1]); radeon_emit(cs, grid_layout[2]); /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ radeon_emit(cs, 1); }