Example 1
/* When the primitive changes, set a state bit and re-validate.  This is
 * not the nicest approach; ideally every program would be immune to the
 * active primitive (i.e. cope with all possibilities), but that may not
 * be realistic.
 */
static void
brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);

   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));

   /* Slight optimization to avoid the GS program when not needed:
    */
   if (prim->mode == GL_QUAD_STRIP &&
       ctx->Light.ShadeModel != GL_FLAT &&
       ctx->Polygon.FrontMode == GL_FILL &&
       ctx->Polygon.BackMode == GL_FILL)
      hw_prim = _3DPRIM_TRISTRIP;

   if (prim->mode == GL_QUADS && prim->count == 4 &&
       ctx->Light.ShadeModel != GL_FLAT &&
       ctx->Polygon.FrontMode == GL_FILL &&
       ctx->Polygon.BackMode == GL_FILL) {
      hw_prim = _3DPRIM_TRIFAN;
   }

   if (hw_prim != brw->primitive) {
      brw->primitive = hw_prim;
      brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;

      if (reduced_prim[prim->mode] != brw->reduced_primitive) {
         brw->reduced_primitive = reduced_prim[prim->mode];
         brw->ctx.NewDriverState |= BRW_NEW_REDUCED_PRIMITIVE;
      }
   }
}
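
The reduced_prim[] table indexed above is not shown in this example. Below is a minimal sketch of what such a table could look like, assuming it maps each legacy GL primitive mode onto the reduced class (points, lines, or triangles) that the rest of the state tracking cares about; the actual table in the driver may differ.

/* Hypothetical sketch: map each GL primitive mode to its reduced class.
 * Indexed directly by the GLenum value (GL_POINTS == 0 ... GL_POLYGON == 9).
 */
static const GLenum reduced_prim[GL_POLYGON + 1] = {
   [GL_POINTS]         = GL_POINTS,
   [GL_LINES]          = GL_LINES,
   [GL_LINE_LOOP]      = GL_LINES,
   [GL_LINE_STRIP]     = GL_LINES,
   [GL_TRIANGLES]      = GL_TRIANGLES,
   [GL_TRIANGLE_STRIP] = GL_TRIANGLES,
   [GL_TRIANGLE_FAN]   = GL_TRIANGLES,
   [GL_QUADS]          = GL_TRIANGLES,
   [GL_QUAD_STRIP]     = GL_TRIANGLES,
   [GL_POLYGON]        = GL_TRIANGLES,
};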
Example 2
static void
gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
{
   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));

   const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
   if (hw_prim != brw->primitive) {
      brw->primitive = hw_prim;
      brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;
   }
}
Example 3
static void gen6_set_prim(struct brw_context *brw,
                          const struct _mesa_prim *prim)
{
   uint32_t hw_prim;

   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));

   hw_prim = get_hw_prim_for_gl_prim(prim->mode);

   if (hw_prim != brw->primitive) {
      brw->primitive = hw_prim;
      brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
   }
}
Example 4
static void
gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
{
   const struct gl_context *ctx = &brw->ctx;
   uint32_t hw_prim;

   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));

   if (prim->mode == GL_PATCHES) {
      hw_prim = _3DPRIM_PATCHLIST(ctx->TessCtrlProgram.patch_vertices);
   } else {
      hw_prim = get_hw_prim_for_gl_prim(prim->mode);
   }

   if (hw_prim != brw->primitive) {
      brw->primitive = hw_prim;
      brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;
      if (prim->mode == GL_PATCHES)
         brw->ctx.NewDriverState |= BRW_NEW_PATCH_PRIMITIVE;
   }
}
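
The _3DPRIM_PATCHLIST() macro used above is parameterized by the patch vertex count. A minimal sketch of how such a macro could be defined, assuming the hardware numbers the PATCHLIST_1..PATCHLIST_32 topologies consecutively from a base encoding; the base value 0x20 here is an assumption for illustration, not necessarily the real encoding.

/* Hypothetical sketch: patch-list topologies are a family of hardware
 * primitive codes parameterized by the patch vertex count.
 */
#define _3DPRIM_PATCHLIST_BASE 0x20   /* assumed base encoding */
#define _3DPRIM_PATCHLIST(n)   (_3DPRIM_PATCHLIST_BASE + ((n) - 1))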
Example 5
bool
brw_codegen_gs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_geometry_program *gp,
                    struct brw_gs_prog_key *key)
{
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_gs_compile c;
   memset(&c, 0, sizeof(c));
   c.key = *key;
   c.gp = gp;

   c.prog_data.include_primitive_id =
      (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;

   c.prog_data.invocations = gp->program.Invocations;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    *
    * Note: param_count needs to be num_uniform_components * 4, since we add
    * padding around uniform values below vec4 size, so the worst case is that
    * every uniform is a float which gets padded to the size of a vec4.
    */
   struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
   int param_count = gs->num_uniform_components * 4;

   /* We also upload clip plane data as uniforms */
   param_count += MAX_CLIP_PLANES * 4;
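
   /* Worked example (illustrative, assuming MAX_CLIP_PLANES is 8): a GS with
    * 10 scalar float uniforms has num_uniform_components = 10, so
    * param_count = 10 * 4 + 8 * 4 = 72 pointer slots.
    */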

   c.prog_data.base.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   c.prog_data.base.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   c.prog_data.base.base.nr_params = param_count;

   if (brw->gen >= 7) {
      if (gp->program.OutputType == GL_POINTS) {
         /* When the output type is points, the geometry shader may output data
          * to multiple streams, and EndPrimitive() has no effect.  So we
          * configure the hardware to interpret the control data as stream ID.
          */
         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

         /* We only have to emit control bits if we are using streams */
         if (prog->Geom.UsesStreams)
            c.control_data_bits_per_vertex = 2;
         else
            c.control_data_bits_per_vertex = 0;
      } else {
         /* When the output type is triangle_strip or line_strip, EndPrimitive()
          * may be used to terminate the current strip and start a new one
          * (similar to primitive restart), and outputting data to multiple
          * streams is not supported.  So we configure the hardware to interpret
          * the control data as EndPrimitive information (a.k.a. "cut bits").
          */
         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

         /* We only need to output control data if the shader actually calls
          * EndPrimitive().
          */
         c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
      }
   } else {
      /* There are no control data bits in gen6. */
      c.control_data_bits_per_vertex = 0;

      /* If the program uses transform feedback, enable it */
      if (prog->TransformFeedback.NumVarying)
         c.prog_data.gen6_xfb_enabled = true;
      else
         c.prog_data.gen6_xfb_enabled = false;
   }
   c.control_data_header_size_bits =
      gp->program.VerticesOut * c.control_data_bits_per_vertex;

   /* 1 HWORD = 32 bytes = 256 bits */
   c.prog_data.control_data_header_size_hwords =
      ALIGN(c.control_data_header_size_bits, 256) / 256;
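
   /* Worked example (illustrative): a gen7 GS with VerticesOut = 100 that
    * calls EndPrimitive() uses 1 control bit per vertex, so the header is
    * 100 bits and ALIGN(100, 256) / 256 rounds it up to 1 HWORD (32 bytes).
    */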

   GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the vertex
    * shader doesn't write to gl_ClipDistance.
    */
   if (c.key.base.userclip_active) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.prog_data.base.vue_map, outputs_written);

   /* Compute the output vertex size.
    *
    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
    * Size (p168):
    *
    *     [0,62] indicating [1,63] 16B units
    *
    *     Specifies the size of each vertex stored in the GS output entry
    *     (following any Control Header data) as a number of 128-bit units
    *     (minus one).
    *
    *     Programming Restrictions: The vertex size must be programmed as a
    *     multiple of 32B units with the following exception: Rendering is
    *     disabled (as per SOL stage state) and the vertex size output by the
    *     GS thread is 16B.
    *
    *     If rendering is enabled (as per SOL state) the vertex size must be
    *     programmed as a multiple of 32B units. In other words, the only time
    *     software can program a vertex size with an odd number of 16B units
    *     is when rendering is disabled.
    *
    * Note: B=bytes in the above text.
    *
    * It doesn't seem worth the extra trouble to optimize the case where the
    * vertex size is 16B (especially since this would require special-casing
    * the GEN assembly that writes to the URB).  So we just set the vertex
    * size to a multiple of 32B (2 vec4's) in all cases.
    *
    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
    * budget that as follows:
    *
    *   512 bytes for varyings (a varying component is 4 bytes and
    *             gl_MaxGeometryOutputComponents = 128)
    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *             bytes)
    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *             even if it's not used)
    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *             whenever clip planes are enabled, even if the shader doesn't
    *             write to gl_ClipDistance)
    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
    *             (see below)--this causes up to 1 VUE slot to be wasted
    *   400 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
    * per interpolation type, so this is plenty.
    *
    */
   unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
   assert(brw->gen == 6 ||
          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
   c.prog_data.output_vertex_size_hwords =
      ALIGN(output_vertex_size_bytes, 32) / 32;
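
   /* Worked example (illustrative): a GS whose VUE map holds gl_Position,
    * gl_PointSize and six generic varyings has 8 slots, i.e. 128 bytes per
    * vertex, and ALIGN(128, 32) / 32 gives 4 hwords.
    */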

   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     64 bytes for the control data header (cut indices or StreamID bits)
    *   4096 bytes for varyings (a varying component is 4 bytes and
    *              gl_MaxGeometryTotalOutputComponents = 1024)
    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *              even if it's not used)
    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *              whenever clip planes are enabled, even if the shader doesn't
    *              write to gl_ClipDistance)
    *   4096 bytes overhead since the VUE size must be a multiple of 32
    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
    *   8128 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot per
    * interpolation type, which works out to 3072 bytes, so this would allow
    * us to accommodate 2 interpolation types without any danger of running
    * out of URB space.
    *
    * In practice, the risk of running out of URB space is very small, since
    * the above figures are all worst-case, and most of them scale with the
    * number of output vertices.  So we'll just calculate the amount of space
    * we need, and if it's too large, fail to compile.
    *
    * The above is for gen7+ where we have a single URB entry that will hold
    * all the output. In gen6, we will have to allocate URB entries for every
    * vertex we emit, so our URB entries only need to be large enough to hold
    * a single vertex. Also, gen6 does not have a control data header.
    */
   unsigned output_size_bytes;
   if (brw->gen >= 7) {
      output_size_bytes =
         c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
      output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
   } else {
      output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
   }

   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
    * which comes before the control header.
    */
   if (brw->gen >= 8)
      output_size_bytes += 32;

   assert(output_size_bytes >= 1);
   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (brw->gen == 6)
      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (output_size_bytes > max_output_size_bytes)
      return false;


   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
    * a multiple of 128 bytes in gen6.
    */
   if (brw->gen >= 7)
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
   else
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
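
   /* Worked example (illustrative): with a 4-hword (128-byte) output vertex,
    * VerticesOut = 100 and a 1-hword control header, gen7 needs
    * 100 * 128 + 32 = 12832 bytes, i.e. ALIGN(12832, 64) / 64 = 201 units of
    * 64 bytes; gen6 stores a single 128-byte vertex, i.e. 1 unit of 128.
    */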

   c.prog_data.output_topology =
      get_hw_prim_for_gl_prim(gp->program.OutputType);

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.input_vue_map, c.key.input_varyings);

   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
    * need to program a URB read length of ceiling(num_slots / 2).
    */
   c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
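
   /* Worked example (illustrative): 9 input VUE slots need a read length of
    * (9 + 1) / 2 = 5, i.e. five 256-bit reads covering 10 slot positions.
    */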

   void *mem_ctx = ralloc_context(NULL);
   unsigned program_size;
   const unsigned *program =
      brw_gs_emit(brw, prog, &c, mem_ctx, &program_size);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* Scratch space is used for register spilling */
   if (c.base.last_scratch) {
      perf_debug("Geometry shader triggered register spilling.  "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      c.prog_data.base.base.total_scratch =
         brw_get_scratch_size(c.base.last_scratch * REG_SIZE);

      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
                         c.prog_data.base.base.total_scratch *
                         brw->max_gs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &c.prog_data, sizeof(c.prog_data),
                    &stage_state->prog_offset, &brw->gs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
Example 6
extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_gs_prog_key *key,
               struct brw_gs_prog_data *prog_data,
               const nir_shader *src_shader,
               struct gl_shader_program *shader_prog,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
    struct brw_gs_compile c;
    memset(&c, 0, sizeof(c));
    c.key = *key;

    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);

    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
     * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
     * geometry shader support.  So we can safely ignore that.
     *
     * For SSO pipelines, we use a fixed VUE map layout based on variable
     * locations, so we can rely on rendezvous-by-location making this work.
     */
    GLbitfield64 inputs_read = shader->info->inputs_read;
    brw_compute_vue_map(compiler->devinfo,
                        &c.input_vue_map, inputs_read,
                        shader->info->separate_shader);

    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       is_scalar);
    brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
    brw_nir_lower_vue_outputs(shader, is_scalar);
    shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);

    prog_data->base.clip_distance_mask =
        ((1 << shader->info->clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
        ((1 << shader->info->cull_distance_array_size) - 1) <<
        shader->info->clip_distance_array_size;
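
    /* Worked example (illustrative): with 4 clip distances and 2 cull
     * distances, clip_distance_mask = 0xf and cull_distance_mask =
     * ((1 << 2) - 1) << 4 = 0x30, so the cull bits sit above the clip bits.
     */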

    prog_data->include_primitive_id =
        (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;

    prog_data->invocations = shader->info->gs.invocations;

    if (compiler->devinfo->gen >= 8)
        prog_data->static_vertex_count = nir_gs_count_vertices(shader);

    if (compiler->devinfo->gen >= 7) {
        if (shader->info->gs.output_primitive == GL_POINTS) {
            /* When the output type is points, the geometry shader may output data
             * to multiple streams, and EndPrimitive() has no effect.  So we
             * configure the hardware to interpret the control data as stream ID.
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

            /* We only have to emit control bits if we are using streams */
            if (shader_prog && shader_prog->Geom.UsesStreams)
                c.control_data_bits_per_vertex = 2;
            else
                c.control_data_bits_per_vertex = 0;
        } else {
            /* When the output type is triangle_strip or line_strip, EndPrimitive()
             * may be used to terminate the current strip and start a new one
             * (similar to primitive restart), and outputting data to multiple
             * streams is not supported.  So we configure the hardware to interpret
             * the control data as EndPrimitive information (a.k.a. "cut bits").
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

            /* We only need to output control data if the shader actually calls
             * EndPrimitive().
             */
            c.control_data_bits_per_vertex =
                shader->info->gs.uses_end_primitive ? 1 : 0;
        }
    } else {
        /* There are no control data bits in gen6. */
        c.control_data_bits_per_vertex = 0;

        /* If the program uses transform feedback, enable it */
        if (shader->info->has_transform_feedback_varyings)
            prog_data->gen6_xfb_enabled = true;
        else
            prog_data->gen6_xfb_enabled = false;
    }
    c.control_data_header_size_bits =
        shader->info->gs.vertices_out * c.control_data_bits_per_vertex;

    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data->control_data_header_size_hwords =
        ALIGN(c.control_data_header_size_bits, 256) / 256;

    /* Compute the output vertex size.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
     * Size (p168):
     *
     *     [0,62] indicating [1,63] 16B units
     *
     *     Specifies the size of each vertex stored in the GS output entry
     *     (following any Control Header data) as a number of 128-bit units
     *     (minus one).
     *
     *     Programming Restrictions: The vertex size must be programmed as a
     *     multiple of 32B units with the following exception: Rendering is
     *     disabled (as per SOL stage state) and the vertex size output by the
     *     GS thread is 16B.
     *
     *     If rendering is enabled (as per SOL state) the vertex size must be
     *     programmed as a multiple of 32B units. In other words, the only time
     *     software can program a vertex size with an odd number of 16B units
     *     is when rendering is disabled.
     *
     * Note: B=bytes in the above text.
     *
     * It doesn't seem worth the extra trouble to optimize the case where the
     * vertex size is 16B (especially since this would require special-casing
     * the GEN assembly that writes to the URB).  So we just set the vertex
     * size to a multiple of 32B (2 vec4's) in all cases.
     *
     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
     * budget that as follows:
     *
     *   512 bytes for varyings (a varying component is 4 bytes and
     *             gl_MaxGeometryOutputComponents = 128)
     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *             bytes)
     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *             even if it's not used)
     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *             whenever clip planes are enabled, even if the shader doesn't
     *             write to gl_ClipDistance)
     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
     *             (see below)--this causes up to 1 VUE slot to be wasted
     *   400 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
     * per interpolation type, so this is plenty.
     *
     */
    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
    assert(compiler->devinfo->gen == 6 ||
           output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
    prog_data->output_vertex_size_hwords =
        ALIGN(output_vertex_size_bytes, 32) / 32;

    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
     *     64 bytes for the control data header (cut indices or StreamID bits)
     *   4096 bytes for varyings (a varying component is 4 bytes and
     *              gl_MaxGeometryTotalOutputComponents = 1024)
     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *              even if it's not used)
     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *              whenever clip planes are enabled, even if the shader doesn't
     *              write to gl_ClipDistance)
     *   4096 bytes overhead since the VUE size must be a multiple of 32
     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
     *   8128 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot per
     * interpolation type, which works out to 3072 bytes, so this would allow
     * us to accommodate 2 interpolation types without any danger of running
     * out of URB space.
     *
     * In practice, the risk of running out of URB space is very small, since
     * the above figures are all worst-case, and most of them scale with the
     * number of output vertices.  So we'll just calculate the amount of space
     * we need, and if it's too large, fail to compile.
     *
     * The above is for gen7+ where we have a single URB entry that will hold
     * all the output. In gen6, we will have to allocate URB entries for every
     * vertex we emit, so our URB entries only need to be large enough to hold
     * a single vertex. Also, gen6 does not have a control data header.
     */
    unsigned output_size_bytes;
    if (compiler->devinfo->gen >= 7) {
        output_size_bytes =
            prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
        output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
    } else {
        output_size_bytes = prog_data->output_vertex_size_hwords * 32;
    }

    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
     * which comes before the control header.
     */
    if (compiler->devinfo->gen >= 8)
        output_size_bytes += 32;

    /* Shaders can technically set max_vertices = 0, at which point we
     * may have a URB size of 0 bytes.  Nothing good can come from that,
     * so enforce a minimum size.
     */
    if (output_size_bytes == 0)
        output_size_bytes = 1;

    unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (compiler->devinfo->gen == 6)
        max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (output_size_bytes > max_output_size_bytes)
        return NULL;


    /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
     * a multiple of 128 bytes in gen6.
     */
    if (compiler->devinfo->gen >= 7)
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
    else
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;

    prog_data->output_topology =
        get_hw_prim_for_gl_prim(shader->info->gs.output_primitive);

    prog_data->vertices_in = shader->info->gs.vertices_in;

    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
     */
    prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;

    /* Now that prog_data setup is done, we are ready to actually compile the
     * program.
     */
    if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
        fprintf(stderr, "GS Input ");
        brw_print_vue_map(stderr, &c.input_vue_map);
        fprintf(stderr, "GS Output ");
        brw_print_vue_map(stderr, &prog_data->base.vue_map);
    }

    if (is_scalar) {
        fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
                     shader_time_index);
        if (v.run_gs()) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
            prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;

            fs_generator g(compiler, log_data, mem_ctx, &c.key,
                           &prog_data->base.base, v.promoted_constants,
                           false, MESA_SHADER_GEOMETRY);
            if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
                const char *label =
                    shader->info->label ? shader->info->label : "unnamed";
                char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
                                             label, shader->info->name);
                g.enable_debug(name);
            }
            g.generate_code(v.cfg, 8);
            return g.get_assembly(final_assembly_size);
        }
    }

    if (compiler->devinfo->gen >= 7) {
        /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
         * so without spilling. If the GS invocations count > 1, then we can't use
         * dual object mode.
         */
        if (prog_data->invocations <= 1 &&
                likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

            vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                              mem_ctx, true /* no_spills */, shader_time_index);
            if (v.run()) {
                return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                                  shader, &prog_data->base, v.cfg,
                                                  final_assembly_size);
            }
        }
    }

    /* Either we failed to compile in DUAL_OBJECT mode (probably because it
     * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
     * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
     *
     * FIXME: Single dispatch mode requires that the driver can handle
     * interleaving of input registers, but this is already supported (dual
     * instance mode has the same requirement). However, to take full advantage
     * of single dispatch mode to reduce register pressure we would also need to
     * do interleaved outputs, but currently, the vec4 visitor and generator
     * classes do not support this, so at the moment register pressure in
     * single and dual instance modes is the same.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
     * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
     * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
     * is also supported. When InstanceCount=1 (one instance per object) software
     * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
     * the best choice for performance, followed by SINGLE mode."
     *
     * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
    if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
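
    /* Worked example (illustrative): a gen7+ GS with invocations == 4 lands
     * in DUAL_INSTANCE here; the same shader on gen6, or a single-invocation
     * GS whose no-spill DUAL_OBJECT compile above failed, falls back to
     * SINGLE.
     */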

    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;

    if (compiler->devinfo->gen >= 7)
        gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);
    else
        gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, shader_prog,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);

    if (!gs->run()) {
        if (error_str)
            *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
        ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
                                         &prog_data->base, gs->cfg,
                                         final_assembly_size);
    }

    delete gs;
    return ret;
}