static bool brw_codegen_cs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_compute_program *cp, struct brw_cs_prog_key *key) { struct gl_context *ctx = &brw->ctx; const GLuint *program; void *mem_ctx = ralloc_context(NULL); GLuint program_size; struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; struct brw_shader *cs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; assert (cs); memset(&prog_data, 0, sizeof(prog_data)); if (prog->Comp.SharedSize > 64 * 1024) { prog->LinkStatus = false; const char *error_str = "Compute shader used more than 64KB of shared variables"; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } else { prog_data.base.total_shared = prog->Comp.SharedSize; } assign_cs_binding_table_offsets(brw->intelScreen->devinfo, prog, &cp->program.Base, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = cp->program.Base.nir->num_uniforms; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; prog_data.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.image_param = rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages); prog_data.base.nr_params = param_count; prog_data.base.nr_image_params = cs->base.NumImages; brw_nir_setup_glsl_uniforms(cp->program.Base.nir, prog, &cp->program.Base, &prog_data.base, true); if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) brw_dump_ir("compute", prog, &cs->base, &cp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS); char *error_str; program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, cp->program.Base.nir, st_index, &program_size, &error_str); if (program == NULL) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && cs) { if (cs->compiled_once) { _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); } cs->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("CS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } if (prog_data.base.total_scratch) { brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo, prog_data.base.total_scratch * brw->max_cs_threads); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) fprintf(stderr, "\n"); brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, key, sizeof(*key), program, program_size, &prog_data, sizeof(prog_data), &brw->cs.base.prog_offset, &brw->cs.prog_data); ralloc_free(mem_ctx); return true; }
static bool brw_codegen_cs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_program *cp, struct brw_cs_prog_key *key) { const struct gen_device_info *devinfo = &brw->screen->devinfo; struct gl_context *ctx = &brw->ctx; const GLuint *program; void *mem_ctx = ralloc_context(NULL); GLuint program_size; struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; struct brw_shader *cs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; assert (cs); memset(&prog_data, 0, sizeof(prog_data)); if (prog->Comp.SharedSize > 64 * 1024) { prog->LinkStatus = false; const char *error_str = "Compute shader used more than 64KB of shared variables"; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } else { prog_data.base.total_shared = prog->Comp.SharedSize; } assign_cs_binding_table_offsets(devinfo, prog, &cp->program, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = cp->program.nir->num_uniforms / 4; /* The backend also sometimes add a param for the thread local id. */ prog_data.thread_local_id_index = param_count++; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; prog_data.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.image_param = rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages); prog_data.base.nr_params = param_count; prog_data.base.nr_image_params = cs->base.NumImages; brw_nir_setup_glsl_uniforms(cp->program.nir, prog, &cp->program, &prog_data.base, true); if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) brw_dump_ir("compute", prog, &cs->base, &cp->program); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program, ST_CS); char *error_str; program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key, &prog_data, cp->program.nir, st_index, &program_size, &error_str); if (program == NULL) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && cs) { if (cs->compiled_once) { _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); } cs->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("CS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } const unsigned subslices = MAX2(brw->screen->subslice_total, 1); /* WaCSScratchSize:hsw * * Haswell's scratch space address calculation appears to be sparse * rather than tightly packed. The Thread ID has bits indicating * which subslice, EU within a subslice, and thread within an EU * it is. There's a maximum of two slices and two subslices, so these * can be stored with a single bit. Even though there are only 10 EUs * per subslice, this is stored in 4 bits, so there's an effective * maximum value of 16 EUs. Similarly, although there are only 7 * threads per EU, this is stored in a 3 bit number, giving an effective * maximum value of 8 threads per EU. * * This means that we need to use 16 * 8 instead of 10 * 7 for the * number of threads per subslice. */ const unsigned scratch_ids_per_subslice = brw->is_haswell ? 16 * 8 : devinfo->max_cs_threads; brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch, scratch_ids_per_subslice * subslices); if (unlikely(INTEL_DEBUG & DEBUG_CS)) fprintf(stderr, "\n"); brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, key, sizeof(*key), program, program_size, &prog_data, sizeof(prog_data), &brw->cs.base.prog_offset, &brw->cs.base.prog_data); ralloc_free(mem_ctx); return true; }