Example #1
0
/**
 * Compile a tessellation control shader.
 *
 * The function works on a clone of \p src_shader allocated out of
 * \p mem_ctx; the clone's outputs_written / patch_outputs_written are
 * overridden with the values from \p key.  It fills in \p prog_data (output
 * VUE map, instance count, URB entry size, URB read length) and then runs
 * either the scalar or the vec4 backend, selected by
 * compiler->scalar_stage[MESA_SHADER_TESS_CTRL].
 *
 * \param error_str  Optional.  When non-NULL it receives a message allocated
 *                   out of \p mem_ctx on every failure path of this function.
 * \param final_assembly_size  Handed through to the code generator
 *                   (presumably receives the size of the returned program --
 *                   confirm against the generator).
 *
 * \return the value produced by the code generator, or NULL on failure.
 */
extern "C" const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                void *log_data,
                void *mem_ctx,
                const struct brw_tcs_prog_key *key,
                struct brw_tcs_prog_data *prog_data,
                const nir_shader *src_shader,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str)
{
   const struct gen_device_info *devinfo = compiler->devinfo;
   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];

   /* Never modify the caller's shader: all lowering happens on a clone. */
   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
   nir->info->outputs_written = key->outputs_written;
   nir->info->patch_outputs_written = key->patch_outputs_written;

   /* The input layout is derived from what the shader reads, the output
    * layout from the (key-supplied) set of written outputs.
    */
   struct brw_vue_map input_vue_map;
   brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read,
                       nir->info->separate_shader);
   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                            nir->info->outputs_written,
                            nir->info->patch_outputs_written);

   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
   if (key->quads_workaround)
      brw_nir_apply_tcs_quads_workaround(nir);

   nir = brw_postprocess_nir(nir, devinfo, is_scalar);

   /* One instance per 8 output vertices in scalar mode, one per 2 output
    * vertices in vec4 mode (rounded up).
    */
   if (is_scalar)
      prog_data->instances = DIV_ROUND_UP(nir->info->tcs.vertices_out, 8);
   else
      prog_data->instances = DIV_ROUND_UP(nir->info->tcs.vertices_out, 2);

   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     32 bytes for the patch header (tessellation factors)
    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
    *              gl_MaxTessPatchComponents = 120)
    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
    *              gl_MaxPatchVertices = 32 and
    *              gl_MaxTessControlOutputComponents = 128)
    *
    *  15808 bytes left for varying packing overhead
    */
   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
   unsigned output_size_bytes = 0;
   /* Note that the patch header is counted in num_per_patch_slots. */
   output_size_bytes += num_per_patch_slots * 16;
   output_size_bytes += nir->info->tcs.vertices_out * num_per_vertex_slots * 16;

   assert(output_size_bytes >= 1);
   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) {
      /* Like every other failure path in this function, tell the caller
       * why we are bailing instead of returning NULL with *error_str
       * left untouched.
       */
      if (error_str) {
         *error_str =
            ralloc_asprintf(mem_ctx,
                            "TCS URB entry size (%u bytes) exceeds the "
                            "maximum of %u bytes",
                            output_size_bytes,
                            (unsigned) GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES);
      }
      return NULL;
   }

   /* URB entry sizes are stored as a multiple of 64 bytes. */
   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

   /* HS does not use the usual payload pushing from URB to GRFs,
    * because we don't have enough registers for a full-size payload, and
    * the hardware is broken on Haswell anyway.
    */
   vue_prog_data->urb_read_length = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
      fprintf(stderr, "TCS Input ");
      brw_print_vue_map(stderr, &input_vue_map);
      fprintf(stderr, "TCS Output ");
      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
   }

   if (is_scalar) {
      /* Scalar backend: visitor with a dispatch width of 8, then the FS
       * code generator.
       */
      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
                   &prog_data->base.base, NULL, nir, 8,
                   shader_time_index, &input_vue_map);
      if (!v.run_tcs_single_patch()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
         return NULL;
      }

      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants, false,
                     MESA_SHADER_TESS_CTRL);
      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
         g.enable_debug(ralloc_asprintf(mem_ctx,
                                        "%s tessellation control shader %s",
                                        nir->info->label ? nir->info->label
                                                        : "unnamed",
                                        nir->info->name));
      }

      g.generate_code(v.cfg, 8);

      return g.get_assembly(final_assembly_size);
   } else {
      /* vec4 backend. */
      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
                         nir, mem_ctx, shader_time_index, &input_vue_map);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
         return NULL;
      }

      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
         v.dump_instructions();


      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
                                        &prog_data->base, v.cfg,
                                        final_assembly_size);
   }
}
Example #2
0
extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_gs_prog_key *key,
               struct brw_gs_prog_data *prog_data,
               const nir_shader *src_shader,
               struct gl_shader_program *shader_prog,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
    struct brw_gs_compile c;
    memset(&c, 0, sizeof(c));
    c.key = *key;

    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);

    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
     * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
     * geometry shader support.  So we can safely ignore that.
     *
     * For SSO pipelines, we use a fixed VUE map layout based on variable
     * locations, so we can rely on rendezvous-by-location making this work.
     */
    GLbitfield64 inputs_read = shader->info->inputs_read;
    brw_compute_vue_map(compiler->devinfo,
                        &c.input_vue_map, inputs_read,
                        shader->info->separate_shader);

    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       is_scalar);
    brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
    brw_nir_lower_vue_outputs(shader, is_scalar);
    shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);

    prog_data->base.clip_distance_mask =
        ((1 << shader->info->clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
        ((1 << shader->info->cull_distance_array_size) - 1) <<
        shader->info->clip_distance_array_size;

    prog_data->include_primitive_id =
        (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;

    prog_data->invocations = shader->info->gs.invocations;

    if (compiler->devinfo->gen >= 8)
        prog_data->static_vertex_count = nir_gs_count_vertices(shader);

    if (compiler->devinfo->gen >= 7) {
        if (shader->info->gs.output_primitive == GL_POINTS) {
            /* When the output type is points, the geometry shader may output data
             * to multiple streams, and EndPrimitive() has no effect.  So we
             * configure the hardware to interpret the control data as stream ID.
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

            /* We only have to emit control bits if we are using streams */
            if (shader_prog && shader_prog->Geom.UsesStreams)
                c.control_data_bits_per_vertex = 2;
            else
                c.control_data_bits_per_vertex = 0;
        } else {
            /* When the output type is triangle_strip or line_strip, EndPrimitive()
             * may be used to terminate the current strip and start a new one
             * (similar to primitive restart), and outputting data to multiple
             * streams is not supported.  So we configure the hardware to interpret
             * the control data as EndPrimitive information (a.k.a. "cut bits").
             */
            prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

            /* We only need to output control data if the shader actually calls
             * EndPrimitive().
             */
            c.control_data_bits_per_vertex =
                shader->info->gs.uses_end_primitive ? 1 : 0;
        }
    } else {
        /* There are no control data bits in gen6. */
        c.control_data_bits_per_vertex = 0;

        /* If it is using transform feedback, enable it */
        if (shader->info->has_transform_feedback_varyings)
            prog_data->gen6_xfb_enabled = true;
        else
            prog_data->gen6_xfb_enabled = false;
    }
    c.control_data_header_size_bits =
        shader->info->gs.vertices_out * c.control_data_bits_per_vertex;

    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data->control_data_header_size_hwords =
        ALIGN(c.control_data_header_size_bits, 256) / 256;

    /* Compute the output vertex size.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
     * Size (p168):
     *
     *     [0,62] indicating [1,63] 16B units
     *
     *     Specifies the size of each vertex stored in the GS output entry
     *     (following any Control Header data) as a number of 128-bit units
     *     (minus one).
     *
     *     Programming Restrictions: The vertex size must be programmed as a
     *     multiple of 32B units with the following exception: Rendering is
     *     disabled (as per SOL stage state) and the vertex size output by the
     *     GS thread is 16B.
     *
     *     If rendering is enabled (as per SOL state) the vertex size must be
     *     programmed as a multiple of 32B units. In other words, the only time
     *     software can program a vertex size with an odd number of 16B units
     *     is when rendering is disabled.
     *
     * Note: B=bytes in the above text.
     *
     * It doesn't seem worth the extra trouble to optimize the case where the
     * vertex size is 16B (especially since this would require special-casing
     * the GEN assembly that writes to the URB).  So we just set the vertex
     * size to a multiple of 32B (2 vec4's) in all cases.
     *
     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
     * budget that as follows:
     *
     *   512 bytes for varyings (a varying component is 4 bytes and
     *             gl_MaxGeometryOutputComponents = 128)
     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *             bytes)
     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *             even if it's not used)
     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *             whenever clip planes are enabled, even if the shader doesn't
     *             write to gl_ClipDistance)
     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
     *             (see below)--this causes up to 1 VUE slot to be wasted
     *   400 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
     * per interpolation type, so this is plenty.
     *
     */
    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
    assert(compiler->devinfo->gen == 6 ||
           output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
    prog_data->output_vertex_size_hwords =
        ALIGN(output_vertex_size_bytes, 32) / 32;

    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
     *     64 bytes for the control data header (cut indices or StreamID bits)
     *   4096 bytes for varyings (a varying component is 4 bytes and
     *              gl_MaxGeometryTotalOutputComponents = 1024)
     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *              even if it's not used)
     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *              whenever clip planes are enabled, even if the shader doesn't
     *              write to gl_ClipDistance)
     *   4096 bytes overhead since the VUE size must be a multiple of 32
     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
     *   8128 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot per
     * interpolation type, which works out to 3072 bytes, so this would allow
     * us to accommodate 2 interpolation types without any danger of running
     * out of URB space.
     *
     * In practice, the risk of running out of URB space is very small, since
     * the above figures are all worst-case, and most of them scale with the
     * number of output vertices.  So we'll just calculate the amount of space
     * we need, and if it's too large, fail to compile.
     *
     * The above is for gen7+ where we have a single URB entry that will hold
     * all the output. In gen6, we will have to allocate URB entries for every
     * vertex we emit, so our URB entries only need to be large enough to hold
     * a single vertex. Also, gen6 does not have a control data header.
     */
    unsigned output_size_bytes;
    if (compiler->devinfo->gen >= 7) {
        output_size_bytes =
            prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
        output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
    } else {
        output_size_bytes = prog_data->output_vertex_size_hwords * 32;
    }

    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
     * which comes before the control header.
     */
    if (compiler->devinfo->gen >= 8)
        output_size_bytes += 32;

    /* Shaders can technically set max_vertices = 0, at which point we
     * may have a URB size of 0 bytes.  Nothing good can come from that,
     * so enforce a minimum size.
     */
    if (output_size_bytes == 0)
        output_size_bytes = 1;

    unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (compiler->devinfo->gen == 6)
        max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (output_size_bytes > max_output_size_bytes)
        return NULL;


    /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
     * a multiple of 128 bytes in gen6.
     */
    if (compiler->devinfo->gen >= 7)
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
    else
        prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;

    prog_data->output_topology =
        get_hw_prim_for_gl_prim(shader->info->gs.output_primitive);

    prog_data->vertices_in = shader->info->gs.vertices_in;

    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
     */
    prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;

    /* Now that prog_data setup is done, we are ready to actually compile the
     * program.
     */
    if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
        fprintf(stderr, "GS Input ");
        brw_print_vue_map(stderr, &c.input_vue_map);
        fprintf(stderr, "GS Output ");
        brw_print_vue_map(stderr, &prog_data->base.vue_map);
    }

    if (is_scalar) {
        fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
                     shader_time_index);
        if (v.run_gs()) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
            prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;

            fs_generator g(compiler, log_data, mem_ctx, &c.key,
                           &prog_data->base.base, v.promoted_constants,
                           false, MESA_SHADER_GEOMETRY);
            if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
                const char *label =
                    shader->info->label ? shader->info->label : "unnamed";
                char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
                                             label, shader->info->name);
                g.enable_debug(name);
            }
            g.generate_code(v.cfg, 8);
            return g.get_assembly(final_assembly_size);
        }
    }

    if (compiler->devinfo->gen >= 7) {
        /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
         * so without spilling. If the GS invocations count > 1, then we can't use
         * dual object mode.
         */
        if (prog_data->invocations <= 1 &&
                likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
            prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

            vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                              mem_ctx, true /* no_spills */, shader_time_index);
            if (v.run()) {
                return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                                  shader, &prog_data->base, v.cfg,
                                                  final_assembly_size);
            }
        }
    }

    /* Either we failed to compile in DUAL_OBJECT mode (probably because it
     * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
     * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
     *
     * FIXME: Single dispatch mode requires that the driver can handle
     * interleaving of input registers, but this is already supported (dual
     * instance mode has the same requirement). However, to take full advantage
     * of single dispatch mode to reduce register pressure we would also need to
     * do interleaved outputs, but currently, the vec4 visitor and generator
     * classes do not support this, so at the moment register pressure in
     * single and dual instance modes is the same.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
     * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
     * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
     * is also supported. When InstanceCount=1 (one instance per object) software
     * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
     * the best choice for performance, followed by SINGLE mode."
     *
     * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
    if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
        prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;

    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;

    if (compiler->devinfo->gen >= 7)
        gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);
    else
        gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, shader_prog,
                                 shader, mem_ctx, false /* no_spills */,
                                 shader_time_index);

    if (!gs->run()) {
        if (error_str)
            *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
        ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
                                         &prog_data->base, gs->cfg,
                                         final_assembly_size);
    }

    delete gs;
    return ret;
}