예제 #1
0
void
gen6_gs_visitor::visit(ir_end_primitive *)
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
   if (c->gp->program.OutputType == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive and we need to set its PrimEnd flag, so do this
    * unless we haven't emitted that vertex at all (vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1 in the comparison
    * below).
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
            BRW_CONDITIONAL_L));
   vec4_instruction *inst = emit(CMP(dst_null_d(),
                                     this->vertex_count, 0u,
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));

      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));

      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));

      /* Set the first vertex flag to indicate that the next vertex will start
       * a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
   }
   emit(BRW_OPCODE_ENDIF);
}
예제 #2
0
void
vec4_tcs_visitor::emit_thread_end()
{
   vec4_instruction *inst;
   current_annotation = "thread end";

   if (nir->info->tcs.vertices_out % 2) {
      emit(BRW_OPCODE_ENDIF);
   }

   if (devinfo->gen == 7) {
      struct brw_tcs_prog_data *tcs_prog_data =
         (struct brw_tcs_prog_data *) prog_data;

      current_annotation = "release input vertices";

      /* Synchronize all threads, so we know that no one is still
       * using the input URB handles.
       */
      if (tcs_prog_data->instances > 1) {
         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      }

      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
       * We want to compare the bottom half of invocation_id with 0, but
       * use that truth value for the top half as well.  Unfortunately,
       * we don't have stride in the vec4 world, nor UV immediates in
       * align16, so we need an opcode to get invocation_id<0,4,0>.
       */
      set_condmod(BRW_CONDITIONAL_Z,
                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
                       invocation_id));
      emit(IF(BRW_PREDICATE_NORMAL));
      for (unsigned i = 0; i < key->input_vertices; i += 2) {
         /* If we have an odd number of input vertices, the last will be
          * unpaired.  We don't want to use an interleaved URB write in
          * that case.
          */
         const bool is_unpaired = i == key->input_vertices - 1;

         dst_reg header(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
              brw_imm_ud(is_unpaired));
      }
      emit(BRW_OPCODE_ENDIF);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
      emit_shader_time_end();

   inst = emit(TCS_OPCODE_THREAD_END);
   inst->base_mrf = 14;
   inst->mlen = 2;
}
예제 #3
0
void
vec4_tcs_visitor::emit_prolog()
{
   invocation_id = src_reg(this, glsl_type::uint_type);
   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));

   /* HS threads are dispatched with the dispatch mask set to 0xFF.
    * If there are an odd number of output vertices, then the final
    * HS instance dispatched will only have its bottom half doing real
    * work, and so we need to disable the upper half:
    */
   if (nir->info->tcs.vertices_out % 2) {
      emit(CMP(dst_null_d(), invocation_id,
               brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L));

      /* Matching ENDIF is in emit_thread_end() */
      emit(IF(BRW_PREDICATE_NORMAL));
   }
}
예제 #4
0
void
vec4_gs_visitor::visit(ir_emit_vertex *ir)
{
   this->current_annotation = "emit vertex: safety check";

   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form "if
    * (vertex_count < MAX)"
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we're outputting 32 control data bits or less, then we can wait
       * until the shader is over to output them all.  Otherwise we need to
       * output them as we go.  Now is the time to do it, since we're about to
       * output the vertex_count'th vertex, so it's guaranteed that the
       * control data bits associated with the (vertex_count - 1)th vertex are
       * correct.
       */
      if (c->control_data_header_size_bits > 32) {
         this->current_annotation = "emit vertex: emit control data bits";
         /* Only emit control data bits if we've finished accumulating a batch
          * of 32 bits.  This is the case when:
          *
          *     (vertex_count * bits_per_vertex) % 32 == 0
          *
          * (in other words, when the last 5 bits of vertex_count *
          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
          * integer n (which is always the case, since bits_per_vertex is
          * always 1 or 2), this is equivalent to requiring that the last 5-n
          * bits of vertex_count are 0:
          *
          *     vertex_count & (2^(5-n) - 1) == 0
          *
          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
          * equivalent to:
          *
          *     vertex_count & (32 / bits_per_vertex - 1) == 0
          */
         vec4_instruction *inst =
            emit(AND(dst_null_d(), this->vertex_count,
                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         emit(IF(BRW_PREDICATE_NORMAL));
         {
            emit_control_data_bits();

            /* Reset control_data_bits to 0 so we can start accumulating a new
             * batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes the
             * effect of any call to EndPrimitive() that the shader may have
             * made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
            inst->force_writemask_all = true;
         }
         emit(BRW_OPCODE_ENDIF);
      }

      this->current_annotation = "emit vertex: vertex data";
      emit_vertex();

      /* In stream mode we have to set control data bits for all vertices
       * unless we have disabled control data bits completely (which we do
       * do for GL_POINTS outputs that don't use streams).
       */
      if (c->control_data_header_size_bits > 0 &&
          c->prog_data.control_data_format ==
             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
          this->current_annotation = "emit vertex: Stream control data bits";
          set_stream_control_data_bits(ir->stream_id());
      }

      this->current_annotation = "emit vertex: increment vertex count";
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
               src_reg(1u)));
   }
   emit(BRW_OPCODE_ENDIF);

   this->current_annotation = NULL;
}
예제 #5
0
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits.  The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
   assert(c->control_data_bits_per_vertex != 0);

   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
    * granularity, we need to use two tricks to ensure that the batch of 32
    * control data bits is written to the appropriate DWORD in the URB.  To
    * select which vec4 we are writing to, we use the "slot {0,1} offset"
    * fields of the message header.  To select which DWORD in the vec4 we are
    * writing to, we use the channel mask fields of the message header.  To
    * avoid penalizing geometry shaders that emit a small number of vertices
    * with extra bookkeeping, we only do each of these tricks when
    * c->prog_data.control_data_header_size_bits is large enough to make it
    * necessary.
    *
    * Note: this means that if we're outputting just a single DWORD of control
    * data bits, we'll actually replicate it four times since we won't do any
    * channel masking.  But that's not a problem since in this case the
    * hardware only pays attention to the first DWORD.
    */
   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
   if (c->control_data_header_size_bits > 32)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

   /* If vertex_count is 0, then no control data bits have been accumulated
    * yet, so we should do nothing.
    */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we are using either channel masks or a per-slot offset, then we
       * need to figure out which DWORD we are trying to write to, using the
       * formula:
       *
       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
       *
       * Since bits_per_vertex is a power of two, and is known at compile
       * time, this can be optimized to:
       *
       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
       */
      src_reg dword_index(this, glsl_type::uint_type);
      if (urb_write_flags) {
         src_reg prev_count(this, glsl_type::uint_type);
         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
         unsigned log2_bits_per_vertex =
            _mesa_fls(c->control_data_bits_per_vertex);
         emit(SHR(dst_reg(dword_index), prev_count,
                  (uint32_t) (6 - log2_bits_per_vertex)));
      }

      /* Start building the URB write message.  The first MRF gets a copy of
       * R0.
       */
      int base_mrf = 1;
      dst_reg mrf_reg(MRF, base_mrf);
      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
      inst->force_writemask_all = true;

      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
         /* Set the per-slot offset to dword_index / 4, to that we'll write to
          * the appropriate OWORD within the control data header.
          */
         src_reg per_slot_offset(this, glsl_type::uint_type);
         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
      }

      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
          * write to the appropriate DWORD within the OWORD.  We need to do
          * this computation with force_writemask_all, otherwise garbage data
          * from invocation 0 might clobber the mask for invocation 1 when
          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
          * together.
          */
         src_reg channel(this, glsl_type::uint_type);
         inst = emit(AND(dst_reg(channel), dword_index, 3u));
         inst->force_writemask_all = true;
         src_reg one(this, glsl_type::uint_type);
         inst = emit(MOV(dst_reg(one), 1u));
         inst->force_writemask_all = true;
         src_reg channel_mask(this, glsl_type::uint_type);
         inst = emit(SHL(dst_reg(channel_mask), one, channel));
         inst->force_writemask_all = true;
         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
                                               channel_mask);
         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
      }

      /* Store the control data bits in the message payload and send it. */
      dst_reg mrf_reg2(MRF, base_mrf + 1);
      inst = emit(MOV(mrf_reg2, this->control_data_bits));
      inst->force_writemask_all = true;
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = urb_write_flags;
      /* We need to increment Global Offset by 256-bits to make room for
       * Broadwell's extra "Vertex Count" payload at the beginning of the
       * URB entry.  Since this is an OWord message, Global Offset is counted
       * in 128-bit units, so we must set it to 2.
       */
      if (brw->gen >= 8)
         inst->offset = 2;
      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   emit(BRW_OPCODE_ENDIF);
}
예제 #6
0
void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;
   unsigned binding;
   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *   ensure that all writes are complete by sending the final
          *   write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, then increment
             * SO num primitive counter and destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }

      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}
예제 #7
0
void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;

   if (!prog_data->num_transform_feedback_bindings)
      return;

   switch (c->prog_data.output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}
예제 #8
0
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         visit((ir_end_primitive *) NULL);
      }
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is half
             * of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE.
                */
               if (mrf > max_usable_mrf) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (c->prog_data.gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we have
    * emitted at least one vertex or not. In case we did, the EOT message must
    * always include the COMPLETE flag or else the GPU hangs. If we have not
    * produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is that we always request a new
    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
    * With this we make sure that whether we have emitted at least one vertex
    * or none at all, we have to finish the thread without writing to the URB,
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
예제 #9
0
void
gen6_gs_visitor::visit(ir_emit_vertex *)
{
   this->current_annotation = "gen6 emit vertex";
   /* Honor max_vertex layout indication in geometry shader by ignoring any
    * vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
            BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
         int varying = prog_data->vue_map.slot_to_varying[slot];
         if (varying != VARYING_SLOT_PSIZ) {
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            emit_urb_slot(dst, varying);
         } else {
            /* The PSIZ slot can pack multiple varyings in different channels
             * and emit_urb_slot() will produce a MOV instruction for each of
             * them. Since we are writing to an array, that will translate to
             * possibly multiple MOV instructions with an array destination and
             * each will generate a scratch write with the same offset into
             * scratch space (thus, each one overwriting the previous). This is
             * not what we want. What we will do instead is emit PSIZ to a
             * a regular temporary register, then move that resgister into the
             * array. This way we only have one instruction with an array
             * destination and we only produce a single scratch write.
             */
            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
            emit_urb_slot(tmp, varying);
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
            inst->force_writemask_all = true;
         }

         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));
      }

      /* Now buffer flags for this vertex */
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
      if (c->gp->program.OutputType == GL_POINTS) {
         /* If we are outputting points, then every vertex has PrimStart and
          * PrimEnd set.
          */
         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
      } else {
         /* Otherwise, we can only set the PrimStart flag, which we have stored
          * in the first_vertex register. We will have to wait until we execute
          * EndPrimitive() or we end the thread to set the PrimEnd flag on a
          * vertex.
          */
         emit(OR(dst, this->first_vertex,
                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
         emit(MOV(dst_reg(this->first_vertex), 0u));
      }
      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));

      /* Update vertex count */
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
   }
   emit(BRW_OPCODE_ENDIF);
}