Beispiel #1
0
/* Compile the fragment program described by |key| and upload the result to
 * the program cache (BRW_CACHE_FS_PROG), updating brw->wm.base's
 * prog_offset/prog_data on success.
 *
 * Returns true on success.  On failure, false is returned and (for GLSL
 * programs) the link status and info log are updated.  All temporary
 * allocations live on a local ralloc context that is freed on every return
 * path.
 */
static bool
brw_codegen_wm_prog(struct brw_context *brw,
                    struct brw_program *fp,
                    struct brw_wm_prog_key *key,
                    struct brw_vue_map *vue_map)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   void *mem_ctx = ralloc_context(NULL);
   struct brw_wm_prog_data prog_data;
   const GLuint *program;
   bool start_busy = false;
   double start_time = 0;

   /* Compile from a clone so lowering passes don't mutate the GL program's
    * NIR, which may be recompiled later with a different key.
    */
   nir_shader *nir = nir_shader_clone(mem_ctx, fp->program.nir);

   memset(&prog_data, 0, sizeof(prog_data));

   /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
   if (fp->program.is_arb_asm)
      prog_data.base.use_alt_mode = true;

   assign_fs_binding_table_offsets(devinfo, &fp->program, key, &prog_data);

   /* GLSL and ARB assembly programs declare uniforms differently, so they
    * take separate setup paths.
    */
   if (!fp->program.is_arb_asm) {
      brw_nir_setup_glsl_uniforms(mem_ctx, nir, &fp->program,
                                  &prog_data.base, true);
      brw_nir_analyze_ubo_ranges(brw->screen->compiler, nir,
                                 NULL, prog_data.base.ubo_ranges);
   } else {
      brw_nir_setup_arb_uniforms(mem_ctx, nir, &fp->program, &prog_data.base);

      if (unlikely(INTEL_DEBUG & DEBUG_WM))
         brw_dump_arb_asm("fragment", &fp->program);
   }

   /* Snapshot GPU-busy state and wall time so we can report compiles that
    * stalled rendering (checked again after the compile below).
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    brw_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   /* Shader-time indices for the SIMD8/16/32 variants; -1 means disabled. */
   int st_index8 = -1, st_index16 = -1, st_index32 = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      st_index8 = brw_get_shader_time_index(brw, &fp->program, ST_FS8,
                                            !fp->program.is_arb_asm);
      st_index16 = brw_get_shader_time_index(brw, &fp->program, ST_FS16,
                                             !fp->program.is_arb_asm);
      st_index32 = brw_get_shader_time_index(brw, &fp->program, ST_FS32,
                                             !fp->program.is_arb_asm);
   }

   char *error_str = NULL;
   /* NOTE(review): the literal true/false arguments are presumably
    * allow_spilling / use_rep_send — confirm against the brw_compile_fs()
    * prototype.
    */
   program = brw_compile_fs(brw->screen->compiler, brw, mem_ctx,
                            key, &prog_data, nir,
                            &fp->program, st_index8, st_index16, st_index32,
                            true, false, vue_map,
                            &error_str);

   if (program == NULL) {
      /* ARB assembly programs have no GLSL link status/info log to update. */
      if (!fp->program.is_arb_asm) {
         fp->program.sh.data->LinkStatus = LINKING_FAILURE;
         ralloc_strcat(&fp->program.sh.data->InfoLog, error_str);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug)) {
      /* A recompile after the first successful compile usually indicates a
       * non-ideal prog key; report why.
       */
      if (fp->compiled_once) {
         brw_debug_recompile(brw, MESA_SHADER_FRAGMENT, fp->program.Id,
                             key->program_string_id, key);
      }
      fp->compiled_once = true;

      /* If the GPU was busy before the compile but idle now, the compile
       * likely stalled the pipeline.
       */
      if (start_busy && !brw_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   /* Scratch space is used for register spilling. */
   brw_alloc_stage_scratch(brw, &brw->wm.base, prog_data.base.total_scratch);

   if (unlikely((INTEL_DEBUG & DEBUG_WM) && fp->program.is_arb_asm))
      fprintf(stderr, "\n");

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.param);
   ralloc_steal(NULL, prog_data.base.pull_param);
   brw_upload_cache(&brw->cache, BRW_CACHE_FS_PROG,
                    key, sizeof(struct brw_wm_prog_key),
                    program, prog_data.base.program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->wm.base.prog_offset, &brw->wm.base.prog_data);

   ralloc_free(mem_ctx);

   return true;
}
Beispiel #2
0
/* Compile a tessellation control shader (HS on Intel hardware).
 *
 * Clones the source NIR (the caller's shader is never modified), lowers the
 * VUE inputs/outputs, sizes the URB entry, and then generates code with
 * either the scalar (SIMD8) or vec4 backend depending on
 * compiler->scalar_stage[MESA_SHADER_TESS_CTRL].
 *
 * Returns the assembly (allocated on mem_ctx) or NULL on failure.  On every
 * failure path *error_str (if non-NULL) is set to a mem_ctx-allocated
 * message; callers print/concat it, so it must never be left unset.
 */
extern "C" const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                void *log_data,
                void *mem_ctx,
                const struct brw_tcs_prog_key *key,
                struct brw_tcs_prog_data *prog_data,
                const nir_shader *src_shader,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str)
{
   const struct gen_device_info *devinfo = compiler->devinfo;
   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];

   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
   /* The key carries the outputs actually consumed by the TES; override the
    * shader's own masks so dead outputs can be eliminated.
    */
   nir->info->outputs_written = key->outputs_written;
   nir->info->patch_outputs_written = key->patch_outputs_written;

   struct brw_vue_map input_vue_map;
   brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read,
                       nir->info->separate_shader);
   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                            nir->info->outputs_written,
                            nir->info->patch_outputs_written);

   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
   if (key->quads_workaround)
      brw_nir_apply_tcs_quads_workaround(nir);

   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);

   /* Scalar mode processes 8 patch vertices per instance; vec4 mode 2. */
   if (is_scalar)
      prog_data->instances = DIV_ROUND_UP(nir->info->tcs.vertices_out, 8);
   else
      prog_data->instances = DIV_ROUND_UP(nir->info->tcs.vertices_out, 2);

   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     32 bytes for the patch header (tessellation factors)
    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
    *              gl_MaxTessPatchComponents = 120)
    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
    *              gl_MaxPatchVertices = 32 and
    *              gl_MaxTessControlOutputComponents = 128)
    *
    *  15808 bytes left for varying packing overhead
    */
   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
   unsigned output_size_bytes = 0;
   /* Note that the patch header is counted in num_per_patch_slots. */
   output_size_bytes += num_per_patch_slots * 16;
   output_size_bytes += nir->info->tcs.vertices_out * num_per_vertex_slots * 16;

   assert(output_size_bytes >= 1);
   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) {
      /* Fix: previously this returned NULL without writing *error_str,
       * leaving callers to read an uninitialized pointer when reporting the
       * failure.  Report the reason explicitly, like the other bail-outs.
       */
      if (error_str)
         *error_str = ralloc_strdup(mem_ctx, "TCS output size exceeds the "
                                    "maximum URB entry size");
      return NULL;
   }

   /* URB entry sizes are stored as a multiple of 64 bytes. */
   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

   /* HS does not use the usual payload pushing from URB to GRFs,
    * because we don't have enough registers for a full-size payload, and
    * the hardware is broken on Haswell anyway.
    */
   vue_prog_data->urb_read_length = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
      fprintf(stderr, "TCS Input ");
      brw_print_vue_map(stderr, &input_vue_map);
      fprintf(stderr, "TCS Output ");
      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
   }

   if (is_scalar) {
      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
                   &prog_data->base.base, NULL, nir, 8,
                   shader_time_index, &input_vue_map);
      if (!v.run_tcs_single_patch()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
         return NULL;
      }

      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants, false,
                     MESA_SHADER_TESS_CTRL);
      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
         g.enable_debug(ralloc_asprintf(mem_ctx,
                                        "%s tessellation control shader %s",
                                        nir->info->label ? nir->info->label
                                                        : "unnamed",
                                        nir->info->name));
      }

      g.generate_code(v.cfg, 8);

      return g.get_assembly(final_assembly_size);
   } else {
      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
                         nir, mem_ctx, shader_time_index, &input_vue_map);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
         return NULL;
      }

      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
         v.dump_instructions();


      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
                                        &prog_data->base, v.cfg,
                                        final_assembly_size);
   }
}
Beispiel #3
0
/* Compile the tessellation evaluation program described by |key| and upload
 * the result to the program cache (BRW_CACHE_TES_PROG), updating
 * brw->tes.base's prog_offset/prog_data on success.
 *
 * Returns true on success; on failure, false is returned and the link
 * status/info log are updated.  All temporary allocations live on a local
 * ralloc context that is freed on every return path.
 */
static bool
brw_codegen_tes_prog(struct brw_context *brw,
                     struct brw_program *tep,
                     struct brw_tes_prog_key *key)
{
   const struct brw_compiler *compiler = brw->screen->compiler;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->tes.base;
   struct brw_tes_prog_data prog_data;
   bool start_busy = false;
   double start_time = 0;

   memset(&prog_data, 0, sizeof(prog_data));

   void *mem_ctx = ralloc_context(NULL);

   /* Compile from a clone so lowering passes don't mutate the GL program's
    * NIR, which may be recompiled later with a different key.
    */
   nir_shader *nir = nir_shader_clone(mem_ctx, tep->program.nir);

   brw_assign_common_binding_table_offsets(devinfo, &tep->program,
                                           &prog_data.base.base, 0);

   brw_nir_setup_glsl_uniforms(mem_ctx, nir, &tep->program,
                               &prog_data.base.base,
                               compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
   brw_nir_analyze_ubo_ranges(compiler, nir, NULL,
                              prog_data.base.base.ubo_ranges);

   int st_index = -1;
   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
      st_index = brw_get_shader_time_index(brw, &tep->program, ST_TES, true);

   /* Snapshot GPU-busy state and wall time so we can report compiles that
    * stalled rendering (checked again after the compile below).
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = brw->batch.last_bo && brw_bo_busy(brw->batch.last_bo);
      start_time = get_time();
   }

   struct brw_vue_map input_vue_map;
   brw_compute_tess_vue_map(&input_vue_map, key->inputs_read,
                            key->patch_inputs_read);

   /* Fix: initialize to NULL.  brw_compile_tes() can return NULL without
    * writing *error_str (e.g. when the URB entry size limit is exceeded),
    * in which case the error path below would otherwise read and print an
    * uninitialized pointer.  This matches brw_codegen_wm_prog().
    */
   char *error_str = NULL;
   const unsigned *program =
      brw_compile_tes(compiler, brw, mem_ctx, key, &input_vue_map, &prog_data,
                      nir, &tep->program, st_index, &error_str);
   if (program == NULL) {
      tep->program.sh.data->LinkStatus = LINKING_FAILURE;
      ralloc_strcat(&tep->program.sh.data->InfoLog, error_str);

      _mesa_problem(NULL, "Failed to compile tessellation evaluation shader: "
                    "%s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug)) {
      /* A recompile after the first successful compile usually indicates a
       * non-ideal prog key; report why.
       */
      if (tep->compiled_once) {
         brw_debug_recompile(brw, MESA_SHADER_TESS_EVAL, tep->program.Id,
                             key->program_string_id, key);
      }
      /* If the GPU was busy before the compile but idle now, the compile
       * likely stalled the pipeline.
       */
      if (start_busy && !brw_bo_busy(brw->batch.last_bo)) {
         perf_debug("TES compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      tep->compiled_once = true;
   }

   /* Scratch space is used for register spilling */
   brw_alloc_stage_scratch(brw, stage_state,
                           prog_data.base.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.base.param);
   ralloc_steal(NULL, prog_data.base.base.pull_param);
   brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG,
                    key, sizeof(*key),
                    program, prog_data.base.base.program_size,
                    &prog_data, sizeof(prog_data),
                    &stage_state->prog_offset, &brw->tes.base.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
Beispiel #4
0
/* Translate the program's IR (TGSI or NIR) into hardware code via
 * nv50_ir_generate_code() and populate the relevant fields of *prog
 * (code, fixups, register/TLS usage, and per-stage state).
 *
 * Returns true on success.  The CALLOC'd codegen info struct is freed on
 * every return path; a NIR clone made for codegen is freed at "out".
 */
bool
nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
                       struct pipe_debug_callback *debug)
{
   struct nv50_ir_prog_info *info;
   int i, ret;
   /* Sentinel for "slot not assigned"; the valid-slot range differs between
    * vertex and other stages.
    */
   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;

   info = CALLOC_STRUCT(nv50_ir_prog_info);
   if (!info)
      return false;

   info->type = prog->type;
   info->target = chipset;

   info->bin.sourceRep = prog->pipe.type;
   switch (prog->pipe.type) {
   case PIPE_SHADER_IR_TGSI:
      info->bin.source = (void *)prog->pipe.tokens;
      break;
   case PIPE_SHADER_IR_NIR:
      /* Clone so codegen lowering doesn't mutate the stored shader; the
       * clone is ralloc_free'd at "out".
       */
      info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);
      break;
   default:
      assert(!"unsupported IR!");
      /* Fix: this early return previously leaked |info| (the assert
       * compiles out in release builds, so this path is reachable).
       */
      FREE(info);
      return false;
   }

   info->bin.smemSize = prog->cp.smem_size;
   info->io.auxCBSlot = 15;
   info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
   info->io.genUserClip = prog->vp.clpd_nr;
   if (prog->fp.alphatest)
      info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;

   info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
   info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
   info->io.msInfoCBSlot = 15;
   info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;

   info->assignSlots = nv50_program_assign_varying_slots;

   /* Mark all optional outputs as unused until codegen assigns them. */
   prog->vp.bfc[0] = 0xff;
   prog->vp.bfc[1] = 0xff;
   prog->vp.edgeflag = 0xff;
   prog->vp.clpd[0] = map_undef;
   prog->vp.clpd[1] = map_undef;
   prog->vp.psiz = map_undef;
   prog->gp.has_layer = 0;
   prog->gp.has_viewport = 0;

   if (prog->type == PIPE_SHADER_COMPUTE)
      info->prop.cp.inputOffset = 0x10;

   info->driverPriv = prog;

#ifdef DEBUG
   /* Debug builds allow overriding codegen behavior via environment. */
   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
   info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
#else
   info->optLevel = 3;
#endif

   ret = nv50_ir_generate_code(info);
   if (ret) {
      NOUVEAU_ERR("shader translation failed: %i\n", ret);
      goto out;
   }

   prog->code = info->bin.code;
   prog->code_size = info->bin.codeSize;
   prog->fixups = info->bin.relocData;
   prog->interps = info->bin.fixupData;
   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
   prog->tls_space = info->bin.tlsSpace;
   prog->cp.smem_size = info->bin.smemSize;
   prog->mul_zero_wins = info->io.mul_zero_wins;
   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;

   /* Clip distances occupy the low bits, cull distances the bits above. */
   prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
   prog->vp.cull_enable =
      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
   prog->vp.clip_mode = 0;
   for (i = 0; i < info->io.cullDistances; ++i)
      prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);

   if (prog->type == PIPE_SHADER_FRAGMENT) {
      if (info->prop.fp.writesDepth) {
         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
         prog->fp.flags[1] = 0x11;
      }
      if (info->prop.fp.usesDiscard)
         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
   } else
   if (prog->type == PIPE_SHADER_GEOMETRY) {
      switch (info->prop.gp.outputPrim) {
      case PIPE_PRIM_LINE_STRIP:
         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
         break;
      case PIPE_PRIM_TRIANGLE_STRIP:
         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
         break;
      case PIPE_PRIM_POINTS:
      default:
         assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
         break;
      }
      prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024);
   }

   if (prog->type == PIPE_SHADER_COMPUTE) {
      /* Compute programs keep ownership of the symbol table. */
      prog->cp.syms = info->bin.syms;
      prog->cp.num_syms = info->bin.numSyms;
   } else {
      FREE(info->bin.syms);
   }

   if (prog->pipe.stream_output.num_outputs)
      prog->so = nv50_program_create_strmout_state(info,
                                                   &prog->pipe.stream_output);

   pipe_debug_message(debug, SHADER_INFO,
                      "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
                      prog->type, info->bin.tlsSpace, info->bin.smemSize,
                      prog->max_gpr, info->bin.instructions,
                      info->bin.codeSize);

out:
   /* Only the NIR path allocated info->bin.source (the clone above). */
   if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
      ralloc_free((void *)info->bin.source);
   FREE(info);
   return !ret;
}
Beispiel #5
0
/* Compile a geometry shader.
 *
 * Clones the source NIR (the caller's shader is not modified), computes the
 * input/output VUE maps and control-data layout, sizes the output vertex and
 * URB entry, then tries backends in order: scalar SIMD8 (if enabled for the
 * GS stage), vec4 DUAL_OBJECT (gen7+, single invocation only), and finally
 * vec4 DUAL_INSTANCE/SINGLE.
 *
 * Returns the assembly (allocated on mem_ctx) or NULL on failure.
 * NOTE(review): the URB-size bail-out below returns NULL without setting
 * *error_str — callers must not assume it is always written.
 */
extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_gs_prog_key *key,
               struct brw_gs_prog_data *prog_data,
               const nir_shader *src_shader,
               struct gl_shader_program *shader_prog,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
    struct brw_gs_compile c;
    memset(&c, 0, sizeof(c));
    c.key = *key;

    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);

    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
     * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
     * geometry shader support.  So we can safely ignore that.
     *
     * For SSO pipelines, we use a fixed VUE map layout based on variable
     * locations, so we can rely on rendezvous-by-location making this work.
     */
    GLbitfield64 inputs_read = shader->info->inputs_read;
    brw_compute_vue_map(compiler->devinfo,
                        &c.input_vue_map, inputs_read,
                        shader->info->separate_shader);

    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       is_scalar);
    brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
    brw_nir_lower_vue_outputs(shader, is_scalar);
    shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);

    /* Clip distances occupy the low mask bits, cull distances the bits
     * immediately above them.
     */
    prog_data->base.clip_distance_mask =
        ((1 << shader->info->clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
        ((1 << shader->info->cull_distance_array_size) - 1) <<
        shader->info->clip_distance_array_size;

    prog_data->include_primitive_id =
        (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;

    prog_data->invocations = shader->info->gs.invocations;

    if (compiler->devinfo->gen >= 8)
        prog_data->static_vertex_count = nir_gs_count_vertices(shader);

    if (compiler->devinfo->gen >= 7) {
        if (shader->info->gs.output_primitive == GL_POINTS) {
            /* When the output type is points, the geometry shader may output data
             * to multiple streams, and EndPrimitive() has no effect.  So we
             * configure the hardware to interpret the control data as stream ID.
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

            /* We only have to emit control bits if we are using streams */
            if (shader_prog && shader_prog->Geom.UsesStreams)
                c.control_data_bits_per_vertex = 2;
            else
                c.control_data_bits_per_vertex = 0;
        } else {
            /* When the output type is triangle_strip or line_strip, EndPrimitive()
             * may be used to terminate the current strip and start a new one
             * (similar to primitive restart), and outputting data to multiple
             * streams is not supported.  So we configure the hardware to interpret
             * the control data as EndPrimitive information (a.k.a. "cut bits").
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

            /* We only need to output control data if the shader actually calls
             * EndPrimitive().
             */
            c.control_data_bits_per_vertex =
                shader->info->gs.uses_end_primitive ? 1 : 0;
        }
    } else {
        /* There are no control data bits in gen6. */
        c.control_data_bits_per_vertex = 0;

        /* If it is using transform feedback, enable it */
        if (shader->info->has_transform_feedback_varyings)
            prog_data->gen6_xfb_enabled = true;
        else
            prog_data->gen6_xfb_enabled = false;
    }
    c.control_data_header_size_bits =
        shader->info->gs.vertices_out * c.control_data_bits_per_vertex;

    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data->control_data_header_size_hwords =
        ALIGN(c.control_data_header_size_bits, 256) / 256;

    /* Compute the output vertex size.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
     * Size (p168):
     *
     *     [0,62] indicating [1,63] 16B units
     *
     *     Specifies the size of each vertex stored in the GS output entry
     *     (following any Control Header data) as a number of 128-bit units
     *     (minus one).
     *
     *     Programming Restrictions: The vertex size must be programmed as a
     *     multiple of 32B units with the following exception: Rendering is
     *     disabled (as per SOL stage state) and the vertex size output by the
     *     GS thread is 16B.
     *
     *     If rendering is enabled (as per SOL state) the vertex size must be
     *     programmed as a multiple of 32B units. In other words, the only time
     *     software can program a vertex size with an odd number of 16B units
     *     is when rendering is disabled.
     *
     * Note: B=bytes in the above text.
     *
     * It doesn't seem worth the extra trouble to optimize the case where the
     * vertex size is 16B (especially since this would require special-casing
     * the GEN assembly that writes to the URB).  So we just set the vertex
     * size to a multiple of 32B (2 vec4's) in all cases.
     *
     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
     * budget that as follows:
     *
     *   512 bytes for varyings (a varying component is 4 bytes and
     *             gl_MaxGeometryOutputComponents = 128)
     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *             bytes)
     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *             even if it's not used)
     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *             whenever clip planes are enabled, even if the shader doesn't
     *             write to gl_ClipDistance)
     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
     *             (see below)--this causes up to 1 VUE slot to be wasted
     *   400 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
     * per interpolation type, so this is plenty.
     *
     */
    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
    assert(compiler->devinfo->gen == 6 ||
           output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
    prog_data->output_vertex_size_hwords =
        ALIGN(output_vertex_size_bytes, 32) / 32;

    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
     *     64 bytes for the control data header (cut indices or StreamID bits)
     *   4096 bytes for varyings (a varying component is 4 bytes and
     *              gl_MaxGeometryTotalOutputComponents = 1024)
     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *              even if it's not used)
     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *              whenever clip planes are enabled, even if the shader doesn't
     *              write to gl_ClipDistance)
     *   4096 bytes overhead since the VUE size must be a multiple of 32
     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
     *   8128 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot per
     * interpolation type, which works out to 3072 bytes, so this would allow
     * us to accommodate 2 interpolation types without any danger of running
     * out of URB space.
     *
     * In practice, the risk of running out of URB space is very small, since
     * the above figures are all worst-case, and most of them scale with the
     * number of output vertices.  So we'll just calculate the amount of space
     * we need, and if it's too large, fail to compile.
     *
     * The above is for gen7+ where we have a single URB entry that will hold
     * all the output. In gen6, we will have to allocate URB entries for every
     * vertex we emit, so our URB entries only need to be large enough to hold
     * a single vertex. Also, gen6 does not have a control data header.
     */
    unsigned output_size_bytes;
    if (compiler->devinfo->gen >= 7) {
        output_size_bytes =
            prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
        output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
    } else {
        output_size_bytes = prog_data->output_vertex_size_hwords * 32;
    }

    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
     * which comes before the control header.
     */
    if (compiler->devinfo->gen >= 8)
        output_size_bytes += 32;

    /* Shaders can technically set max_vertices = 0, at which point we
     * may have a URB size of 0 bytes.  Nothing good can come from that,
     * so enforce a minimum size.
     */
    if (output_size_bytes == 0)
        output_size_bytes = 1;

    unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (compiler->devinfo->gen == 6)
        max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (output_size_bytes > max_output_size_bytes)
        return NULL;   /* note: *error_str is not set on this path */


    /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
     * a multiple of 128 bytes in gen6.
     */
    if (compiler->devinfo->gen >= 7)
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
    else
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;

    prog_data->output_topology =
        get_hw_prim_for_gl_prim(shader->info->gs.output_primitive);

    prog_data->vertices_in = shader->info->gs.vertices_in;

    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
     */
    prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;

    /* Now that prog_data setup is done, we are ready to actually compile the
     * program.
     */
    if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
        fprintf(stderr, "GS Input ");
        brw_print_vue_map(stderr, &c.input_vue_map);
        fprintf(stderr, "GS Output ");
        brw_print_vue_map(stderr, &prog_data->base.vue_map);
    }

    /* Try the scalar (SIMD8) backend first.  If it fails we silently fall
     * through to the vec4 paths below rather than reporting an error.
     */
    if (is_scalar) {
        fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
                     shader_time_index);
        if (v.run_gs()) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
            prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;

            fs_generator g(compiler, log_data, mem_ctx, &c.key,
                           &prog_data->base.base, v.promoted_constants,
                           false, MESA_SHADER_GEOMETRY);
            if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
                const char *label =
                    shader->info->label ? shader->info->label : "unnamed";
                char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
                                             label, shader->info->name);
                g.enable_debug(name);
            }
            g.generate_code(v.cfg, 8);
            return g.get_assembly(final_assembly_size);
        }
    }

    if (compiler->devinfo->gen >= 7) {
        /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
         * so without spilling. If the GS invocations count > 1, then we can't use
         * dual object mode.
         */
        if (prog_data->invocations <= 1 &&
                likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

            vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                              mem_ctx, true /* no_spills */, shader_time_index);
            if (v.run()) {
                return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                                  shader, &prog_data->base, v.cfg,
                                                  final_assembly_size);
            }
        }
    }

    /* Either we failed to compile in DUAL_OBJECT mode (probably because it
     * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
     * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
     *
     * FIXME: Single dispatch mode requires that the driver can handle
     * interleaving of input registers, but this is already supported (dual
     * instance mode has the same requirement). However, to take full advantage
     * of single dispatch mode to reduce register pressure we would also need to
     * do interleaved outputs, but currently, the vec4 visitor and generator
     * classes do not support this, so at the moment register pressure in
     * single and dual instance modes is the same.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
     * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
     * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
     * is also supported. When InstanceCount=1 (one instance per object) software
     * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
     * the best choice for performance, followed by SINGLE mode."
     *
     * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
    if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;

    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;

    /* Gen6 uses a dedicated visitor that also handles transform feedback. */
    if (compiler->devinfo->gen >= 7)
        gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);
    else
        gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, shader_prog,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);

    if (!gs->run()) {
        if (error_str)
            *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
        ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
                                         &prog_data->base, gs->cfg,
                                         final_assembly_size);
    }

    delete gs;
    return ret;
}