Example #1
static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
{
	struct brw_compile *p = &c->func;
	struct intel_context *intel = &c->func.brw->intel;

	if (intel->gen < 6) {
	    brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
	    brw_ff_sync(p,
		        c->reg.R0,
			0,
			c->reg.R0,
			1, /* allocate */
			1, /* response length */
			0 /* eot */);
	} else {
	    brw_MOV(p, retype(c->reg.temp, BRW_REGISTER_TYPE_UD),
		    retype(c->reg.R0, BRW_REGISTER_TYPE_UD));
	    brw_MOV(p, get_element_ud(c->reg.temp, 1), brw_imm_ud(num_prim));
	    brw_ff_sync(p,
		        c->reg.temp,
			0,
			c->reg.temp,
			1, /* allocate */
			1, /* response length */
			0 /* eot */);
	    brw_MOV(p, get_element_ud(c->reg.R0, 0),
		    get_element_ud(c->reg.temp, 0));
	}
}
Example #2
static void merge_edgeflags( struct brw_clip_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);

   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
   brw_CMP(p, 
	   vec1(brw_null_reg()), 
	   BRW_CONDITIONAL_EQ, 
	   tmp0,
	   brw_imm_ud(_3DPRIM_POLYGON));

   /* Get away with using reg.vertex because we know that this is not
    * a _3DPRIM_TRISTRIP_REVERSE:
    */
   brw_IF(p, BRW_EXECUTE_1);
   {   
      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
      brw_MOV(p, byte_offset(c->reg.vertex[0],
                             brw_varying_to_offset(&c->vue_map,
                                                   VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
      brw_MOV(p, byte_offset(c->reg.vertex[2],
                             brw_varying_to_offset(&c->vue_map,
                                                   VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
   brw_ENDIF(p);
}
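The edge-flag merging above is driven entirely by three bit-fields in R0 DWORD 2: the primitive type in bits 4:0 (PRIM_MASK) and per-vertex edge-flag overrides in bits 8 and 9; the predicated MOVs fire when the corresponding bit is zero. A minimal scalar sketch of that decoding, assuming only the bit positions visible in the code (the helper names are illustrative):

#include <stdint.h>
#include <stdbool.h>

/* Bit positions taken from merge_edgeflags() above. */
#define PRIM_MASK        0x1f       /* primitive type in bits 4:0      */
#define EDGEFLAG_V0_BIT  (1u << 8)  /* edge-flag override for vertex 0 */
#define EDGEFLAG_V2_BIT  (1u << 9)  /* edge-flag override for vertex 2 */

static uint32_t prim_type(uint32_t r0_dw2)
{
   return r0_dw2 & PRIM_MASK;
}

/* The predicated MOVs above zero the edge attribute when the bit is clear. */
static bool clear_v0_edge(uint32_t r0_dw2)
{
   return (r0_dw2 & EDGEFLAG_V0_BIT) == 0;
}

static bool clear_v2_edge(uint32_t r0_dw2)
{
   return (r0_dw2 & EDGEFLAG_V2_BIT) == 0;
}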
Example #3
/**
 * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
 *
 * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
 * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
 * DWORD 2.  So this function extracts the primitive type field, bitshifts it
 * appropriately, and stores it in c->reg.header.
 */
static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c)
{
   struct brw_compile *p = &c->func;
   brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
           brw_imm_ud(0x1f));
   brw_SHL(p, get_element_ud(c->reg.header, 2),
           get_element_ud(c->reg.header, 2), brw_imm_ud(2));
}
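Stripped of the register plumbing, the function is a two-step bit manipulation; a scalar restatement, using exactly the mask and shift from the code above:

#include <stdint.h>

/* Extract the primitive type from bits 4:0 of R0 DWORD 2 and re-place it
 * in bits 6:2, where URB_WRITE expects it (see the comment above).
 */
static uint32_t gs_header_dw2_from_r0_dw2(uint32_t r0_dw2)
{
   uint32_t prim_type = r0_dw2 & 0x1f;   /* AND: extract bits 4:0 */
   return prim_type << 2;                /* SHL: move to bits 6:2 */
}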
Example #4
/**
 * Emit a vertex using the URB_WRITE message.  Use the contents of
 * c->reg.header for the message header, and the registers starting at \c vert
 * for the vertex data.
 *
 * If \c last is true, then this is the last vertex, so no further URB space
 * should be allocated, and this message should end the thread.
 *
 * If \c last is false, then a new URB entry will be allocated, and its handle
 * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
 * message.
 */
static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
                               struct brw_reg vert,
                               bool last)
{
   struct brw_codegen *p = &c->func;
   int write_offset = 0;
   bool complete = false;

   do {
      /* We can't write more than 14 registers at a time to the URB */
      int write_len = MIN2(c->nr_regs - write_offset, 14);
      if (write_len == c->nr_regs - write_offset)
         complete = true;

      /* Copy the vertex from vertn into m1..mN+1:
       */
      brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);

      /* Send the vertex data to the URB.  If this is the last write for this
       * vertex, then we mark it as complete, and either end the thread or
       * allocate another vertex URB entry (depending whether this is the last
       * vertex).
       */
      enum brw_urb_write_flags flags;
      if (!complete)
         flags = BRW_URB_WRITE_NO_FLAGS;
      else if (last)
         flags = BRW_URB_WRITE_EOT_COMPLETE;
      else
         flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
      brw_urb_WRITE(p,
                    (flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
                    : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                    0,
                    c->reg.header,
                    flags,
                    write_len + 1, /* msg length */
                    (flags & BRW_URB_WRITE_ALLOCATE) ? 1
                    : 0, /* response length */
                    write_offset,  /* urb offset */
                    BRW_URB_SWIZZLE_NONE);
      write_offset += write_len;
   } while (!complete);

   if (!last) {
      brw_MOV(p, get_element_ud(c->reg.header, 0),
              get_element_ud(c->reg.temp, 0));
   }
}
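The loop above interleaves two concerns: splitting a long vertex into URB_WRITE-sized chunks and choosing the completion flags for the final chunk. A scalar sketch of just that control flow (MIN2 and the 14-register limit come from the code; the printf stands in for the actual message emission):

#include <stdio.h>
#include <stdbool.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Mirror the write-splitting in brw_ff_gs_emit_vue(): each URB_WRITE can
 * carry at most 14 registers of payload, so larger vertices take several
 * messages, and only the last one is marked complete -- with either EOT
 * or an allocate, depending on whether this is the final vertex.
 */
static void emit_vue_chunks(int nr_regs, bool last)
{
   int write_offset = 0;
   bool complete = false;

   do {
      int write_len = MIN2(nr_regs - write_offset, 14);
      complete = (write_len == nr_regs - write_offset);

      const char *flags = !complete ? "NO_FLAGS"
                        : last      ? "EOT_COMPLETE"
                                    : "ALLOCATE_COMPLETE";
      printf("URB_WRITE: urb offset %d, %d regs, flags %s\n",
             write_offset, write_len, flags);

      write_offset += write_len;
   } while (!complete);
}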
Example #5
static void brw_gs_emit_vue(struct brw_gs_compile *c, 
			    struct brw_reg vert,
			    GLboolean last,
			    GLuint header)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &c->func.brw->intel;
   GLboolean allocate = !last;
   struct brw_reg temp;

   if (intel->gen < 6)
       temp = c->reg.R0;
   else {
       temp = c->reg.temp;
       brw_MOV(p, retype(temp, BRW_REGISTER_TYPE_UD),
	       retype(c->reg.R0, BRW_REGISTER_TYPE_UD));
   }

   /* Overwrite PrimType and PrimStart in the message header, for
    * each vertex in turn:
    */
   brw_MOV(p, get_element_ud(temp, 2), brw_imm_ud(header));

   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);

   /* Send each vertex as a separate write to the urb.  This is
    * different from the concept in brw_sf_emit.c, where subsequent
    * writes are used to build up a single urb entry.  Each of these
    * writes instantiates a separate urb entry, and a new one must be
    * allocated each time.
    */
   brw_urb_WRITE(p, 
		 allocate ? temp : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
		 0,
		 temp,
		 allocate,
		 1,		/* used */
		 c->nr_regs + 1, /* msg length */
		 allocate ? 1 : 0, /* response length */
		 allocate ? 0 : 1, /* eot */
		 1,		/* writes_complete */
		 0,		/* urb offset */
		 BRW_URB_SWIZZLE_NONE);

   if (intel->gen >= 6 && allocate)
       brw_MOV(p, get_element_ud(c->reg.R0, 0), get_element_ud(temp, 0));
}
Example #6
/**
 * Send an FF_SYNC message to ensure that all previously spawned GS threads
 * have finished sending primitives down the pipeline, and to allocate a URB
 * entry for the first output vertex.  Only needed when intel->needs_ff_sync
 * is true.
 *
 * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
 * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
 * the allocated URB entry (which will be needed by the URB_WRITE message that
 * follows).
 */
static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
   brw_ff_sync(p,
               c->reg.temp,
               0,
               c->reg.header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);
   brw_MOV(p, get_element_ud(c->reg.header, 0),
           get_element_ud(c->reg.temp, 0));
}
Example #7
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *is_poly;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */

   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
   brw_CMP(p, 
	   vec1(brw_null_reg()), 
	   BRW_CONDITIONAL_EQ, 
	   tmp0,
	   brw_imm_ud(_3DPRIM_POLYGON));

   is_poly = brw_IF(p, BRW_EXECUTE_1);
   {   
      brw_clip_copy_colors(c, 1, 0);
      brw_clip_copy_colors(c, 2, 0);
   }
   is_poly = brw_ELSE(p, is_poly);
   {
      brw_clip_copy_colors(c, 0, 2);
      brw_clip_copy_colors(c, 1, 2);
   }
   brw_ENDIF(p, is_poly);
}
Example #8
void brw_clip_init_clipmask( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg incoming = get_element_ud(c->reg.R0, 2);

   /* Shift so that lowest outcode bit is rightmost:
    */
   brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));

   if (c->key.nr_userclip) {
      struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);

      /* Rearrange userclip outcodes so that they come directly after
       * the fixed plane bits.
       */
      if (p->devinfo->gen == 5 || p->devinfo->is_g4x)
         brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14));
      else
         brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));

      brw_SHR(p, tmp, tmp, brw_imm_ud(8));
      brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);

      release_tmp(c, tmp);
   }
}
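The clipmask construction above is plain bit arithmetic on R0 DWORD 2: the six fixed-plane outcodes sit in the top bits (31:26), and the user-clip outcodes start at bit 14, so shifting them right by 8 drops them directly after the fixed bits. A scalar sketch, assuming only the masks and shift amounts visible in the code:

#include <stdint.h>
#include <stdbool.h>

/* Rebuild c->reg.planemask from the incoming R0 DWORD 2, as above:
 * fixed-plane outcodes come from bits 31:26, user-clip outcodes from
 * bit 14 upward (8 planes on g4x/gen5, 6 on original gen4).
 */
static uint32_t build_planemask(uint32_t incoming, bool is_g4x_or_gen5,
                                bool has_userclip)
{
   uint32_t planemask = incoming >> 26;        /* fixed planes -> bits 5:0 */

   if (has_userclip) {
      uint32_t mask = is_g4x_or_gen5 ? (0xffu << 14) : (0x3fu << 14);
      uint32_t user = (incoming & mask) >> 8;  /* user planes -> bit 6 up  */
      planemask |= user;
   }
   return planemask;
}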
Example #9
void brw_clip_emit_vue(struct brw_clip_compile *c, 
		       struct brw_indirect vert,
		       GLboolean allocate,
		       GLboolean eot,
		       GLuint header)
{
   struct brw_compile *p = &c->func;
   GLuint start = c->last_mrf;

   brw_clip_ff_sync(c);

   assert(!(allocate && eot));
   
   /* Cycle through mrf regs - probably futile as we have to wait for
    * the allocation response anyway.  Also, the order in which this
    * function is invoked doesn't correspond to the order in which the
    * instructions will be executed, so it won't have any effect in
    * many cases.
    */
#if 0
   if (start + c->nr_regs + 1 >= MAX_MRF)
      start = 0;

   c->last_mrf = start + c->nr_regs + 1;
#endif
	
   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);

   /* Overwrite PrimType and PrimStart in the message header, for
    * each vertex in turn:
    */
   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));


   /* Send each vertex as a separate write to the urb.  This
    * is different from the concept in brw_sf_emit.c, where
    * subsequent writes are used to build up a single urb
    * entry.  Each of these writes instantiates a separate
    * urb entry - (I think... what about 'allocate'?)
    */
   brw_urb_WRITE(p, 
		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
		 start,
		 c->reg.R0,
		 allocate,
		 1,		/* used */
		 c->nr_regs + 1, /* msg length */
		 allocate ? 1 : 0, /* response_length */ 
		 eot,		/* eot */
		 1,		/* writes_complete */
		 0,		/* urb offset */
		 BRW_URB_SWIZZLE_NONE);
}
Example #10
void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
   struct brw_reg primmask;
   int jmp;
   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

   c->nr_verts = 3;
   alloc_regs(c);

   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);

   brw_MOV(p, primmask, brw_imm_ud(1));
   brw_SHL(p, primmask, primmask, payload_prim);

   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
					       (1<<_3DPRIM_TRISTRIP) |
					       (1<<_3DPRIM_TRIFAN) |
					       (1<<_3DPRIM_TRISTRIP_REVERSE) |
					       (1<<_3DPRIM_POLYGON) |
					       (1<<_3DPRIM_RECTLIST) |
					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
   brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_tri_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
					       (1<<_3DPRIM_LINESTRIP) |
					       (1<<_3DPRIM_LINELOOP) |
					       (1<<_3DPRIM_LINESTRIP_CONT) |
					       (1<<_3DPRIM_LINESTRIP_BF) |
					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
   brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_line_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
   brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_point_sprite_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_emit_point_setup( c, false );
}
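Each block above follows the same pattern: build a one-hot mask from the payload primitive type, AND it against a class mask with the Z condition, and jump over the setup code when no bit matches. A hedged scalar equivalent (the _3DPRIM_* enum values themselves are not reproduced here, so the class masks are passed in; they would be built from the same bit sets as above):

#include <stdint.h>

enum prim_class { PRIM_CLASS_TRI, PRIM_CLASS_LINE, PRIM_CLASS_POINT };

/* Classify a primitive the way brw_emit_anyprim_setup() does:
 * form (1 << prim) and test it against per-class bit sets.
 */
static enum prim_class classify(unsigned prim,
                                uint32_t tri_bits, uint32_t line_bits)
{
   uint32_t primmask = 1u << prim;

   if (primmask & tri_bits)   /* TRILIST, TRISTRIP, TRIFAN, ... */
      return PRIM_CLASS_TRI;
   if (primmask & line_bits)  /* LINELIST, LINESTRIP, ...       */
      return PRIM_CLASS_LINE;
   return PRIM_CLASS_POINT;   /* point or point-sprite setup    */
}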
Example #11
/**
 * Emit a vertex using the URB_WRITE message.  Use the contents of
 * c->reg.header for the message header, and the registers starting at \c vert
 * for the vertex data.
 *
 * If \c last is true, then this is the last vertex, so no further URB space
 * should be allocated, and this message should end the thread.
 *
 * If \c last is false, then a new URB entry will be allocated, and its handle
 * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
 * message.
 */
static void brw_gs_emit_vue(struct brw_gs_compile *c, 
			    struct brw_reg vert,
			    bool last)
{
   struct brw_compile *p = &c->func;
   bool allocate = !last;

   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);

   /* Send each vertex as a separate write to the urb.  This is
    * different from the concept in brw_sf_emit.c, where subsequent
    * writes are used to build up a single urb entry.  Each of these
    * writes instantiates a separate urb entry, and a new one must be
    * allocated each time.
    */
   brw_urb_WRITE(p, 
		 allocate ? c->reg.temp
                          : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
		 0,
		 c->reg.header,
		 allocate,
		 1,		/* used */
		 c->nr_regs + 1, /* msg length */
		 allocate ? 1 : 0, /* response length */
		 allocate ? 0 : 1, /* eot */
		 1,		/* writes_complete */
		 0,		/* urb offset */
		 BRW_URB_SWIZZLE_NONE);

   if (allocate) {
      brw_MOV(p, get_element_ud(c->reg.header, 0),
              get_element_ud(c->reg.temp, 0));
   }
}
Example #12
static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
{
	struct brw_compile *p = &c->func;
	brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
	brw_ff_sync(p, 
				c->reg.R0,
				0,
				c->reg.R0,
				1,		/* allocate */
				1,		/* used */
				1,		/* msg length */
				1,		/* response length */
				0,		/* eot */
				1,		/* writes complete */
				0,		/* urb offset */
				BRW_URB_SWIZZLE_NONE);
}
Example #13
void brw_clip_init_planes( struct brw_clip_compile *c )
{
   struct brw_compile *p = &c->func;

   if (!c->key.nr_userclip) {
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0,    0, 0xff, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0,    0,    1, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff,    0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0,    1,    0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff,  0,    0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1,    0,    0, 1));
   }
}
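make_plane_ud() is not shown in this listing. A plausible sketch, assuming it packs four byte-sized plane coefficients (x, y, z, w) into one unsigned immediate so that the 0xff entries read back as -1 once each byte is interpreted as signed; the exact packing order is an assumption:

#include <stdint.h>

/* Hypothetical packing helper: with this layout, make_plane_ud(0, 0, 0xff, 1)
 * encodes the plane (0, 0, -1, 1) when the bytes are read as signed.
 */
static uint32_t make_plane_ud(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
   return (w << 24) | (z << 16) | (y << 8) | x;
}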
Example #14
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */

   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
   brw_CMP(p,
	   vec1(brw_null_reg()),
	   BRW_CONDITIONAL_EQ,
	   tmp0,
	   brw_imm_ud(_3DPRIM_POLYGON));

   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_copy_flatshaded_attributes(c, 1, 0);
      brw_clip_copy_flatshaded_attributes(c, 2, 0);
   }
   brw_ELSE(p);
   {
      if (c->key.pv_first) {
	 brw_CMP(p,
		 vec1(brw_null_reg()),
		 BRW_CONDITIONAL_EQ,
		 tmp0,
		 brw_imm_ud(_3DPRIM_TRIFAN));
	 brw_IF(p, BRW_EXECUTE_1);
	 {
	    brw_clip_copy_flatshaded_attributes(c, 0, 1);
	    brw_clip_copy_flatshaded_attributes(c, 2, 1);
	 }
	 brw_ELSE(p);
	 {
	    brw_clip_copy_flatshaded_attributes(c, 1, 0);
	    brw_clip_copy_flatshaded_attributes(c, 2, 0);
	 }
	 brw_ENDIF(p);
      }
      else {
         brw_clip_copy_flatshaded_attributes(c, 0, 2);
         brw_clip_copy_flatshaded_attributes(c, 1, 2);
      }
   }
   brw_ENDIF(p);
}
Example #15
void brw_clip_emit_vue(struct brw_clip_compile *c,
		       struct brw_indirect vert,
                       enum brw_urb_write_flags flags,
		       GLuint header)
{
   struct brw_codegen *p = &c->func;
   bool allocate = flags & BRW_URB_WRITE_ALLOCATE;

   brw_clip_ff_sync(c);

   /* Any URB entry that is allocated must subsequently be used or discarded,
    * so it doesn't make sense to mark EOT and ALLOCATE at the same time.
    */
   assert(!(allocate && (flags & BRW_URB_WRITE_EOT)));

   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs);

   /* Overwrite PrimType and PrimStart in the message header, for
    * each vertex in turn:
    */
   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));


   /* Send each vertex as a separate write to the urb.  This
    * is different to the concept in brw_sf_emit.c, where
    * subsequent writes are used to build up a single urb
    * entry.  Each of these writes instantiates a separate
    * urb entry - (I think... what about 'allocate'?)
    */
   brw_urb_WRITE(p,
		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
		 0,
		 c->reg.R0,
                 flags,
		 c->nr_regs + 1, /* msg length */
		 allocate ? 1 : 0, /* response_length */
		 0,		/* urb offset */
		 BRW_URB_SWIZZLE_NONE);
}
Example #16
void brw_clip_emit_vue(struct brw_clip_compile *c, 
		       struct brw_indirect vert,
		       bool allocate,
		       bool eot,
		       GLuint header)
{
   struct brw_compile *p = &c->func;

   brw_clip_ff_sync(c);

   assert(!(allocate && eot));

   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs);

   /* Overwrite PrimType and PrimStart in the message header, for
    * each vertex in turn:
    */
   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));


   /* Send each vertex as a separate write to the urb.  This
    * is different from the concept in brw_sf_emit.c, where
    * subsequent writes are used to build up a single urb
    * entry.  Each of these writes instantiates a separate
    * urb entry - (I think... what about 'allocate'?)
    */
   brw_urb_WRITE(p, 
		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
		 0,
		 c->reg.R0,
		 allocate,
		 1,		/* used */
		 c->nr_regs + 1, /* msg length */
		 allocate ? 1 : 0, /* response_length */ 
		 eot,		/* eot */
		 1,		/* writes_complete */
		 0,		/* urb offset */
		 BRW_URB_SWIZZLE_NONE);
}
Example #17
void brw_emit_tri_clip( struct brw_clip_compile *c )
{
   struct brw_instruction *neg_rhw;
   struct brw_compile *p = &c->func;
   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
   brw_clip_tri_init_vertices(c);
   brw_clip_init_clipmask(c);
   brw_clip_init_ff_sync(c);

   /* If the -ve rhw workaround bit is set, do the cliptest: */
   if (c->chipset.is_965) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
              brw_imm_ud(1<<20));
      neg_rhw = brw_IF(p, BRW_EXECUTE_1); 
      {
         brw_clip_test(c);
      }
      brw_ENDIF(p, neg_rhw);
   }
   /* Can't push into do_clip_tri because with polygon (or quad)
    * flatshading, need to apply the flatshade here because we don't
    * respect the PV when converting to trifan for emit:
    */
   if (c->key.do_flat_shading) 
      brw_clip_tri_flat_shade(c); 
      
   if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
       (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
      do_clip_tri(c);
   else 
      maybe_do_clip_tri(c);

   brw_clip_tri_emit_polygon(c);

   /* Send an empty message to kill the thread:
    */
   brw_clip_kill_thread(c);
}
Example #18
void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
   struct brw_instruction *is_rev;

   /* Initial list of indices for incoming vertexes:
    */
   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
   brw_CMP(p, 
	   vec1(brw_null_reg()), 
	   BRW_CONDITIONAL_EQ, 
	   tmp0,
	   brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));

   /* XXX: Is there an easier way to do this?  We need to reverse
    * every second tristrip element; can we sometimes ignore it?
    */
   is_rev = brw_IF(p, BRW_EXECUTE_1);
   {   
      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[1]) );
      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[0]) );
      if (c->need_direction)
	 brw_MOV(p, c->reg.dir, brw_imm_f(-1));
   }
   is_rev = brw_ELSE(p, is_rev);
   {
      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[0]) );
      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[1]) );
      if (c->need_direction)
	 brw_MOV(p, c->reg.dir, brw_imm_f(1));
   }
   brw_ENDIF(p, is_rev);

   brw_MOV(p, get_element(c->reg.inlist, 2),  brw_address(c->reg.vertex[2]) );
   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
}
Example #19
/**
 * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
 *
 * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
 * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
 * need to be able to update on a per-vertex basis.
 */
static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c,
                                        unsigned dw2)
{
   struct brw_compile *p = &c->func;
   brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
}
Example #20
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
		struct brw_reg dest,
		GLuint msg_reg_nr,
		struct brw_reg src0,
		GLuint binding_table_index,
		GLuint sampler,
		GLuint writemask,
		GLuint msg_type,
		GLuint response_length,
		GLuint msg_length,
		GLboolean eot,
		GLuint header_present,
		GLuint simd_mode)
{
   GLboolean need_stall = 0;
   
   if (writemask == 0) {
      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
      return;
   }
   
   /* Hardware doesn't do destination dependency checking on send
    * instructions properly.  Add a workaround which generates the
    * dependency by other means.  In practice it seems like this bug
    * only crops up for texture samples, and only where registers are
    * written by the send and then written again later without being
    * read in between.  Luckily for us, we already track that
    * information and use it to modify the writemask for the
    * instruction, so that is a guide for whether a workaround is
    * needed.
    */
   if (writemask != WRITEMASK_XYZW) {
      GLuint dst_offset = 0;
      GLuint i, newmask = 0, len = 0;

      for (i = 0; i < 4; i++) {
	 if (writemask & (1<<i))
	    break;
	 dst_offset += 2;
      }
      for (; i < 4; i++) {
	 if (!(writemask & (1<<i)))
	    break;
	 newmask |= 1<<i;
	 len++;
      }

      if (newmask != writemask) {
	 need_stall = 1;
         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
      }
      else {
	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
	 
	 newmask = ~newmask & WRITEMASK_XYZW;

	 brw_push_insn_state(p);

	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	 brw_set_mask_control(p, BRW_MASK_DISABLE);

	 brw_MOV(p, m1, brw_vec8_grf(0,0));	 
  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); 

	 brw_pop_insn_state(p);

  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); 
	 dest = offset(dest, dst_offset);
	 response_length = len * 2;
      }
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   
      insn->header.predicate_control = 0; /* XXX */
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(insn, dest);
      brw_set_src0(insn, src0);
      brw_set_sampler_message(p->brw, insn,
			      binding_table_index,
			      sampler,
			      msg_type,
			      response_length, 
			      msg_length,
			      eot,
			      header_present,
			      simd_mode);
   }

   if (need_stall) {
      struct brw_reg reg = vec8(offset(dest, response_length-1));

      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
       */
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, reg, reg);	      
      brw_pop_insn_state(p);
   }

}
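The workaround path in brw_SAMPLE() hinges on one classification: a writemask whose set channels form a single contiguous run can be handled by shrinking the response and offsetting the destination, while any other sparse mask needs the dummy-MOV stall. A scalar restatement of that analysis, mirroring the two loops above:

#include <stdbool.h>

#define WRITEMASK_XYZW 0xf

/* Skip leading unwritten channels, then collect the contiguous run of
 * written ones.  If that run accounts for the whole writemask, the send
 * can be shrunk (via dst_offset/len); otherwise the dependency-stall
 * workaround is required.
 */
static bool needs_stall(unsigned writemask, unsigned *dst_offset, unsigned *len)
{
   unsigned i, newmask = 0;

   *dst_offset = 0;
   *len = 0;

   if (writemask == WRITEMASK_XYZW)
      return false;                /* full mask: no workaround needed */

   for (i = 0; i < 4; i++) {       /* skip leading zero channels */
      if (writemask & (1u << i))
         break;
      *dst_offset += 2;            /* two regs per channel in SIMD16 */
   }
   for (; i < 4; i++) {            /* take the contiguous set run */
      if (!(writemask & (1u << i)))
         break;
      newmask |= 1u << i;
      (*len)++;
   }
   return newmask != writemask;    /* holes left over -> stall */
}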
Example #21
void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
    struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
    struct brw_reg primmask;
    struct brw_instruction *jmp;
    struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

    GLuint saveflag;

    c->nr_verts = 3;
    alloc_regs(c);

    primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);

    brw_MOV(p, primmask, brw_imm_ud(1));
    brw_SHL(p, primmask, primmask, payload_prim);

    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
            (1<<_3DPRIM_TRISTRIP) |
            (1<<_3DPRIM_TRIFAN) |
            (1<<_3DPRIM_TRISTRIP_REVERSE) |
            (1<<_3DPRIM_POLYGON) |
            (1<<_3DPRIM_RECTLIST) |
            (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
    jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
        saveflag = p->flag_value;
        brw_push_insn_state(p);
        brw_emit_tri_setup( c, GL_FALSE );
        brw_pop_insn_state(p);
        p->flag_value = saveflag;
        /* Note: the thread is killed in the subroutine, so we must
         * restore the flag, which is changed while building the
         * subroutine.  Fixes bug #13240.
         */
    }
    brw_land_fwd_jump(p, jmp);

    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
            (1<<_3DPRIM_LINESTRIP) |
            (1<<_3DPRIM_LINELOOP) |
            (1<<_3DPRIM_LINESTRIP_CONT) |
            (1<<_3DPRIM_LINESTRIP_BF) |
            (1<<_3DPRIM_LINESTRIP_CONT_BF)));
    jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
        saveflag = p->flag_value;
        brw_push_insn_state(p);
        brw_emit_line_setup( c, GL_FALSE );
        brw_pop_insn_state(p);
        p->flag_value = saveflag;
        /* note - thread killed in subroutine */
    }
    brw_land_fwd_jump(p, jmp);

    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
    jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
        saveflag = p->flag_value;
        brw_push_insn_state(p);
        brw_emit_point_sprite_setup( c, GL_FALSE );
        brw_pop_insn_state(p);
        p->flag_value = saveflag;
    }
    brw_land_fwd_jump(p, jmp);

    brw_emit_point_setup( c, GL_FALSE );
}
Example #22
void
gen8_vec4_generator::generate_tex(vec4_instruction *ir, struct brw_reg dst)
{
   int msg_type = 0;

   switch (ir->opcode) {
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      }
      break;
   case SHADER_OPCODE_TXD:
      if (ir->shadow_compare) {
         msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      }
      break;
   case SHADER_OPCODE_TXF:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
      break;
   case SHADER_OPCODE_TXF_MCS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      break;
   case SHADER_OPCODE_TXS:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      break;
   case SHADER_OPCODE_TG4:
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      }
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
      }
      break;
   default:
      assert(!"should not get here: invalid VS texture opcode");
      break;
   }

   if (ir->header_present) {
      MOV_RAW(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      default_state.access_mode = BRW_ALIGN_1;

      if (ir->texture_offset) {
         /* Set the offset bits in DWord 2. */
         MOV_RAW(retype(brw_vec1_reg(MRF, ir->base_mrf, 2),
                        BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(ir->texture_offset));
      }

      if (ir->sampler >= 16) {
         /* The "Sampler Index" field can only store values between 0 and 15.
          * However, we can add an offset to the "Sampler State Pointer"
          * field, effectively selecting a different set of 16 samplers.
          *
          * The "Sampler State Pointer" needs to be aligned to a 32-byte
          * offset, and each sampler state is only 16 bytes, so we can't
          * exclusively use the offset - we have to use both.
          */
         gen8_instruction *add =
            ADD(get_element_ud(brw_message_reg(ir->base_mrf), 3),
                get_element_ud(brw_vec8_grf(0, 0), 3),
                brw_imm_ud(16 * (ir->sampler / 16) *
                           sizeof(gen7_sampler_state)));
         gen8_set_mask_control(add, BRW_MASK_DISABLE);
      }

      default_state.access_mode = BRW_ALIGN_16;
   }

   uint32_t surf_index =
      prog_data->base.binding_table.texture_start + ir->sampler;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, dst);
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_sampler_message(brw, inst,
                            surf_index,
                            ir->sampler % 16,
                            msg_type,
                            1,
                            ir->mlen,
                            ir->header_present,
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   mark_surface_used(surf_index);
}
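The sampler >= 16 handling above (and its Align1 variant in Example #25) is plain integer arithmetic once the two message fields are named: the message carries sampler % 16 as the "Sampler Index", and the "Sampler State Pointer" is advanced by whole 16-sampler blocks. A sketch; the state-struct size is passed in rather than assumed:

/* Split a sampler index into the two fields the message can express:
 * a 0..15 "Sampler Index" plus an offset added to the "Sampler State
 * Pointer".  Advancing by 16 * sampler_state_size per block keeps the
 * required 32-byte alignment.
 */
struct sampler_fields {
   unsigned index;          /* goes in the message's Sampler Index */
   unsigned state_offset;   /* added to the Sampler State Pointer  */
};

static struct sampler_fields split_sampler(unsigned sampler,
                                           unsigned sampler_state_size)
{
   struct sampler_fields f;
   f.index = sampler % 16;
   f.state_offset = 16 * (sampler / 16) * sampler_state_size;
   return f;
}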
Example #23
/* Post-fragment-program processing.  Send the results to the
 * framebuffer.
 * \param arg0  the fragment color
 * \param arg1  the pass-through depth value
 * \param arg2  the shader-computed depth value
 */
void emit_fb_write(struct brw_wm_compile *c,
		   struct brw_reg *arg0,
		   struct brw_reg *arg1,
		   struct brw_reg *arg2,
		   GLuint target,
		   GLuint eot)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   GLuint nr = 2;
   GLuint channel;

   /* Reserve a space for AA - may not be needed:
    */
   if (c->aa_dest_stencil_reg)
      nr += 1;

   /* I don't really understand how this achieves the color interleave
    * (i.e. RGBARGBA) in the result:  [Do the saturation here]
    */
   brw_push_insn_state(p);

   if (c->key.clamp_fragment_color)
      brw_set_saturate(p, 1);

   for (channel = 0; channel < 4; channel++) {
      if (intel->gen >= 6) {
	 /* gen6 SIMD16 single source DP write looks like:
	  * m + 0: r0
	  * m + 1: r1
	  * m + 2: g0
	  * m + 3: g1
	  * m + 4: b0
	  * m + 5: b1
	  * m + 6: a0
	  * m + 7: a1
	  */
	 if (c->dispatch_width == 16) {
	    brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
	 } else {
	    brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
	 }
      } else if (c->dispatch_width == 16 && brw->has_compr4) {
	 /* pre-gen6 SIMD16 single source DP write looks like:
	  * m + 0: r0
	  * m + 1: g0
	  * m + 2: b0
	  * m + 3: a0
	  * m + 4: r1
	  * m + 5: g1
	  * m + 6: b1
	  * m + 7: a1
	  *
	  * By setting the high bit of the MRF register number, we indicate
	  * that we want COMPR4 mode - instead of doing the usual destination
	  * + 1 for the second half we get destination + 4.
	  */
	 brw_MOV(p,
		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
		 arg0[channel]);
      } else {
	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	 brw_MOV(p,
		 brw_message_reg(nr + channel),
		 arg0[channel]);

	 if (c->dispatch_width == 16) {
	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
	    brw_MOV(p,
		    brw_message_reg(nr + channel + 4),
		    sechalf(arg0[channel]));
	 }
      }
   }

   brw_set_saturate(p, 0);

   /* skip over the regs populated above:
    */
   if (c->dispatch_width == 16)
      nr += 8;
   else
      nr += 4;

   brw_pop_insn_state(p);

   if (c->source_depth_to_render_target)
   {
      if (c->computes_depth)
	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
      else 
	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */

      nr += 2;
   }

   if (c->dest_depth_reg)
   {
      GLuint comp = c->dest_depth_reg / 2;
      GLuint off = c->dest_depth_reg % 2;

      if (off != 0) {
         brw_push_insn_state(p);
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
         /* 2nd half? */
         brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
         brw_pop_insn_state(p);
      }
      else {
         brw_MOV(p, brw_message_reg(nr), arg1[comp]);
      }
      nr += 2;
   }

   if (intel->gen >= 6) {
      /* Load the message header.  There's no implied move from src0
       * to the base mrf on gen6.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);

      if (target != 0) {
	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
					0,
					2), BRW_REGISTER_TYPE_UD),
		 brw_imm_ud(target));
      }
   }

   if (!c->runtime_check_aads_emit) {
      if (c->aa_dest_stencil_reg)
	 emit_aa(c, arg1, 2);

      fire_fb_write(c, 0, nr, target, eot);
   }
   else {
      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      struct brw_reg ip = brw_ip_reg();
      struct brw_instruction *jmp;
      
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
      brw_AND(p, 
	      v1_null_ud, 
	      get_element_ud(brw_vec8_grf(1,0), 6), 
	      brw_imm_ud(1<<26)); 

      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
      {
	 emit_aa(c, arg1, 2);
	 fire_fb_write(c, 0, nr, target, eot);
	 /* note - thread killed in subroutine */
      }
      brw_land_fwd_jump(p, jmp);

      /* ELSE: Shuffle up one register to fill in the hole left for AA:
       */
      fire_fb_write(c, 1, nr-1, target, eot);
   }
}
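The three color-layout branches in emit_fb_write() reduce to different channel-to-MRF mappings. A hedged sketch of just the mapping for the first half of the payload (BRW_MRF_COMPR4 is the high-bit marker described in the comment above; in the pre-gen6 non-COMPR4 case the second half is written separately under BRW_COMPRESSION_2NDHALF at nr + channel + 4):

#include <stdbool.h>

#define BRW_MRF_COMPR4 (1 << 7)  /* high bit of the MRF number, as above */

/* Return the message register receiving color channel `chan`
 * (0=r, 1=g, 2=b, 3=a), mirroring the three cases in emit_fb_write().
 * `nr` is the first color MRF.
 */
static int color_mrf(int nr, int chan, bool gen6, bool simd16, bool compr4)
{
   if (gen6)
      return simd16 ? nr + chan * 2       /* r0 r1 g0 g1 b0 b1 a0 a1 */
                    : nr + chan;
   if (simd16 && compr4)
      return nr + chan + BRW_MRF_COMPR4;  /* r0 g0 b0 a0, halves +4 apart */
   return nr + chan;                      /* second half emitted separately */
}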
Example #24
void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src,
                             struct brw_reg sampler_index)
{
   int msg_type = -1;

   if (brw->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->gen >= 8 || brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_CMS:
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      default:
	 unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
	    assert(inst->mlen == 3);
	 } else {
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
	    assert(inst->mlen == 2);
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually. */
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
	 assert(inst->mlen == 4);
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
	 assert(inst->mlen == 2);
	 break;
      case SHADER_OPCODE_TXS:
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
	 assert(inst->mlen == 2);
	 break;
      default:
	 unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_present) {
      if (brw->gen < 6 && !inst->texture_offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         if (inst->texture_offset) {
            /* Set the texel offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header, 2),
                    brw_imm_ud(inst->texture_offset));
         }

         brw_adjust_sampler_state_pointer(p, header, sampler_index, dst);
         brw_pop_insn_state(p);
      }
   }

   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->base.binding_table.gather_texture_start
         : prog_data->base.binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t sampler = sampler_index.dw1.ud;

      brw_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 sampler + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_present,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);

      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
   } else {
      /* Non-constant sampler index. */
      /* Note: this clobbers `dst` as a temporary before emitting the send */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));

      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Some care required: `sampler` and `temp` may alias:
       *    addr = sampler & 0xff
       *    temp = (sampler << 8) & 0xf00
       *    addr = addr | temp
       */
      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
      brw_OR(p, addr, addr, temp);

      /* a0.0 |= <descriptor> */
      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
      brw_set_sampler_message(p, insn_or,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              1 /* rlen */,
                              inst->mlen /* mlen */,
                              inst->header_present /* header */,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              return_format);
      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
      brw_set_src0(p, insn_or, addr);
      brw_set_dest(p, insn_or, addr);


      /* dst = send(offset, a0.0) */
      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn_send, dst);
      brw_set_src0(p, insn_send, src);
      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);

      brw_pop_insn_state(p);

      /* The visitor knows more than we do about the required surface
       * limit, so it has already done the marking.
       */
   }
}
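The address-register arithmetic in the variable-sampler path can be checked in isolation. A scalar restatement of the value OR'd into a0.0 before the static descriptor bits (masks and shifts copied from the ADD/SHL/AND/AND/OR sequence above, which is ordered carefully because temp may alias the sampler register):

#include <stdint.h>

/* Compute the dynamic part of the message descriptor built above:
 * binding-table index in bits 7:0, sampler index in bits 11:8.
 */
static uint32_t sampler_descriptor_bits(uint32_t sampler,
                                        uint32_t base_binding_table_index)
{
   uint32_t addr = (sampler + base_binding_table_index) & 0x0ff;
   uint32_t temp = (sampler << 8) & 0xf00;
   return addr | temp;
}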
Example #25
void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src)
{
    int msg_type = -1;

    if (brw->gen >= 5) {
        switch (inst->opcode) {
        case SHADER_OPCODE_TEX:
        case SHADER_OPCODE_TXL:
            if (inst->shadow_compare) {
                msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
            } else {
                msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
            }
            break;
        case SHADER_OPCODE_TXD:
            if (inst->shadow_compare) {
                /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
                assert(brw->is_haswell);
                msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
            } else {
                msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
            }
            break;
        case SHADER_OPCODE_TXF:
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
            break;
        case SHADER_OPCODE_TXF_CMS:
            if (brw->gen >= 7)
                msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
            else
                msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
            break;
        case SHADER_OPCODE_TXF_MCS:
            assert(brw->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
            break;
        case SHADER_OPCODE_TXS:
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
            break;
        case SHADER_OPCODE_TG4:
            if (inst->shadow_compare) {
                msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
            } else {
                msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
            }
            break;
        case SHADER_OPCODE_TG4_OFFSET:
            if (inst->shadow_compare) {
                msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
            } else {
                msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
            }
            break;
        default:
            assert(!"should not get here: invalid vec4 texture opcode");
            break;
        }
    } else {
        switch (inst->opcode) {
        case SHADER_OPCODE_TEX:
        case SHADER_OPCODE_TXL:
            if (inst->shadow_compare) {
                msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
                assert(inst->mlen == 3);
            } else {
                msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
                assert(inst->mlen == 2);
            }
            break;
        case SHADER_OPCODE_TXD:
            /* There is no sample_d_c message; comparisons are done manually. */
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
            assert(inst->mlen == 4);
            break;
        case SHADER_OPCODE_TXF:
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
            assert(inst->mlen == 2);
            break;
        case SHADER_OPCODE_TXS:
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
            assert(inst->mlen == 2);
            break;
        default:
            assert(!"should not get here: invalid vec4 texture opcode");
            break;
        }
    }

    assert(msg_type != -1);

    /* Load the message header if present.  If there's a texture offset, we need
     * to set it up explicitly and load the offset bitfield.  Otherwise, we can
     * use an implied move from g0 to the first message register.
     */
    if (inst->header_present) {
        if (brw->gen < 6 && !inst->texture_offset) {
            /* Set up an implied move from g0 to the MRF. */
            src = brw_vec8_grf(0, 0);
        } else {
            struct brw_reg header =
                retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

            /* Explicitly set up the message header by copying g0 to the MRF. */
            brw_push_insn_state(p);
            brw_set_mask_control(p, BRW_MASK_DISABLE);
            brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

            brw_set_access_mode(p, BRW_ALIGN_1);

            if (inst->texture_offset) {
                /* Set the texel offset bits in DWord 2. */
                brw_MOV(p, get_element_ud(header, 2),
                        brw_imm_ud(inst->texture_offset));
            }

            if (inst->sampler >= 16) {
                /* The "Sampler Index" field can only store values between 0 and 15.
                 * However, we can add an offset to the "Sampler State Pointer"
                 * field, effectively selecting a different set of 16 samplers.
                 *
                 * The "Sampler State Pointer" needs to be aligned to a 32-byte
                 * offset, and each sampler state is only 16 bytes, so we can't
                 * exclusively use the offset - we have to use both.
                 */
                assert(brw->is_haswell); /* field only exists on Haswell */
                brw_ADD(p,
                        get_element_ud(header, 3),
                        get_element_ud(brw_vec8_grf(0, 0), 3),
                        brw_imm_ud(16 * (inst->sampler / 16) *
                                   sizeof(gen7_sampler_state)));
            }
            brw_pop_insn_state(p);
        }
    }

    uint32_t return_format;

    switch (dst.type) {
    case BRW_REGISTER_TYPE_D:
        return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
        break;
    case BRW_REGISTER_TYPE_UD:
        return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
        break;
    default:
        return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
        break;
    }

    uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 ||
                               inst->opcode == SHADER_OPCODE_TG4_OFFSET)
                              ? prog_data->base.binding_table.gather_texture_start
                              : prog_data->base.binding_table.texture_start) + inst->sampler;

    brw_SAMPLE(p,
               dst,
               inst->base_mrf,
               src,
               surface_index,
               inst->sampler % 16,
               msg_type,
               1, /* response length */
               inst->mlen,
               inst->header_present,
               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
               return_format);

    brw_mark_surface_used(&prog_data->base, surface_index);
}
Example #26
/* Line clipping, more or less following this algorithm:
 *
 *  for (p=0;p<MAX_PLANES;p++) {
 *     if (clipmask & (1 << p)) {
 *        GLfloat dp0 = DOTPROD( vtx0, plane[p] );
 *        GLfloat dp1 = DOTPROD( vtx1, plane[p] );
 *
 *        if (dp1 < 0.0f) {
 *           GLfloat t = dp1 / (dp1 - dp0);
 *           if (t > t1) t1 = t;
 *        } else {
 *           GLfloat t = dp0 / (dp0 - dp1);
 *           if (t > t0) t0 = t;
 *        }
 *
 *        if (t0 + t1 >= 1.0)
 *           return;
 *     }
 *  }
 *
 *  interp( ctx, newvtx0, vtx0, vtx1, t0 );
 *  interp( ctx, newvtx1, vtx1, vtx0, t1 );
 *
 */
static void clip_and_emit_line( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_indirect vtx0     = brw_indirect(0, 0);
   struct brw_indirect vtx1      = brw_indirect(1, 0);
   struct brw_indirect newvtx0   = brw_indirect(2, 0);
   struct brw_indirect newvtx1   = brw_indirect(3, 0);
   struct brw_indirect plane_ptr = brw_indirect(4, 0);
   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
   GLint clipdist0_offset = c->key.nr_userclip
      ? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
      : 0;

   brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0]));
   brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1]));
   brw_MOV(p, get_addr_reg(newvtx0),   brw_address(c->reg.vertex[2]));
   brw_MOV(p, get_addr_reg(newvtx1),   brw_address(c->reg.vertex[3]));
   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));

   /* Note: init t0, t1 together:
    */
   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));

   brw_clip_init_planes(c);
   brw_clip_init_clipmask(c);

   /* -ve rhw workaround */
   if (p->devinfo->has_negative_rhw_bug) {
      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
              brw_imm_ud(1<<20));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }

   /* Set the initial vertex source mask: The first 6 planes are the bounds
    * of the view volume; the next 8 planes are the user clipping planes.
    */
   brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));

   /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
    * We'll increment 6 times before we start hitting actual user clipping. */
   brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));

   brw_DO(p, BRW_EXECUTE_1);
   {
      /* if (planemask & 1)
       */
      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      brw_IF(p, BRW_EXECUTE_1);
      {
         brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1));
         brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
         brw_IF(p, BRW_EXECUTE_1);
         {
            /* user clip distance: just fetch the correct float from each vertex */
            struct brw_indirect temp_ptr = brw_indirect(7, 0);
            brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset);
            brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0));
            brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset);
            brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0));
         }
         brw_ELSE(p);
         {
            /* fixed plane: fetch the hpos, dp4 against the plane. */
            if (c->key.nr_userclip)
               brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
            else
               brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));

            brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation);
            brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation);
         }
         brw_ENDIF(p);

         brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f));

         brw_IF(p, BRW_EXECUTE_1);
         {
             /*
              * Both can be negative on GM965/G965 due to the RHW
              * workaround; if so, this object should be rejected.
              */
             if (p->devinfo->has_negative_rhw_bug) {
                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
                 brw_IF(p, BRW_EXECUTE_1);
                 {
                     brw_clip_kill_thread(c);
                 }
                 brw_ENDIF(p);
             }

             brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
             brw_math_invert(p, c->reg.t, c->reg.t);
             brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);

             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
             brw_MOV(p, c->reg.t1, c->reg.t);
             brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                       BRW_PREDICATE_NORMAL);
	 }
	 brw_ELSE(p);
	 {
             /* Coming back in.  We know that both cannot be negative
              * because the line would have been culled in that case.
              */

             /* If both are positive, do nothing */
             /* Only on GM965/G965 */
             if (p->devinfo->has_negative_rhw_bug) {
                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
                 brw_IF(p, BRW_EXECUTE_1);
             }

             {
                 brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
                 brw_math_invert(p, c->reg.t, c->reg.t);
                 brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);

                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
                 brw_MOV(p, c->reg.t0, c->reg.t);
                 brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                           BRW_PREDICATE_NORMAL);
             }

             if (p->devinfo->has_negative_rhw_bug) {
                 brw_ENDIF(p);
             }
         }
	 brw_ENDIF(p);
      }
      brw_ENDIF(p);

      /* plane_ptr++;
       */
      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));

      /* while (planemask>>=1) != 0
       */
      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
      brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }
   brw_WHILE(p);
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false);
      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false);

      brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                        (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                        | URB_WRITE_PRIM_START);
      brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE,
                        (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                        | URB_WRITE_PRIM_END);
   }
   brw_ENDIF(p);
   brw_clip_kill_thread(c);
}
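
/* For reference, a scalar model of the parametric clip computed by the
 * ADD / math-invert / MUL sequences in the function above.  This is a
 * hedged sketch only; the helper names are illustrative and not part of
 * the driver.  For a plane where one endpoint is clipped (dp_out < 0)
 * and the other kept (dp_in >= 0), the intersection parameter measured
 * from the clipped end is t = dp_out / (dp_out - dp_in), and t0/t1
 * track the running maximum per endpoint.  The line survives only when
 * t0 + t1 < 1.0, which is what the final CMP tests before interpolating
 * the new vertices.
 */
static float clip_line_parameter(float dp_out, float dp_in)
{
   /* dp_out - dp_in is negative here, so t lands in (0, 1]. */
   return dp_out / (dp_out - dp_in);
}

static float accumulate_clip_parameter(float t_acc, float dp_out, float dp_in)
{
   float t = clip_line_parameter(dp_out, dp_in);
   return t > t_acc ? t : t_acc;   /* the predicated MOV keeps the max */
}
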
Example #27
/**
 * Generate the geometry shader program used on Gen6 to perform stream output
 * (transform feedback).
 */
void
gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
                 unsigned num_verts, bool check_edge_flags)
{
   struct brw_compile *p = &c->func;
   c->prog_data.svbi_postincrement_value = num_verts;

   brw_gs_alloc_regs(c, num_verts, true);
   brw_gs_initialize_header(c);

   if (key->num_transform_feedback_bindings > 0) {
      unsigned vertex, binding;
      struct brw_reg destination_indices_uw =
         vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));

      /* Note: since we use the binding table to keep track of buffer offsets
       * and stride, the GS doesn't need to keep track of a separate pointer
       * into each buffer; it uses a single pointer which increments by 1 for
       * each vertex.  So we use SVBI0 for this pointer, regardless of whether
       * transform feedback is in interleaved or separate attribs mode.
       *
       * Make sure that the buffers have enough room for all the vertices.
       */
      brw_ADD(p, get_element_ud(c->reg.temp, 0),
              get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
              get_element_ud(c->reg.temp, 0),
              get_element_ud(c->reg.SVBI, 4));
      brw_IF(p, BRW_EXECUTE_1);

      /* Compute the destination indices to write to.  Usually we use SVBI[0]
       * + (0, 1, 2).  However, for odd-numbered triangles in tristrips, the
       * vertices come down the pipeline in reversed winding order, so we need
       * to flip the order when writing to the transform feedback buffer.  To
       * ensure that flatshading accuracy is preserved, we need to write them
       * in order SVBI[0] + (0, 2, 1) if we're using the first provoking
       * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
       * the last provoking vertex convention.
       *
       * Note: since brw_imm_v can only be used in instructions in
       * packed-word execution mode, and SVBI is a double-word, we need to
       * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
       * or (1, 0, 2)) to the destination_indices register, and then add SVBI
       * using a separate instruction.  Also, since the immediate constant is
       * expressed as packed words, and we need to load double-words into
       * destination_indices, we need to intersperse zeros to fill the upper
       * halves of each double-word.
       */
      brw_MOV(p, destination_indices_uw,
              brw_imm_v(0x00020100)); /* (0, 1, 2) */
      if (num_verts == 3) {
         /* Get primitive type into temp register. */
         brw_AND(p, get_element_ud(c->reg.temp, 0),
                 get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));

         /* Test if primitive type is TRISTRIP_REVERSE.  We need to do this as
          * an 8-wide comparison so that the conditional MOV that follows
          * moves all 8 words correctly.
          */
         brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
                 get_element_ud(c->reg.temp, 0),
                 brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));

         /* If so, then overwrite destination_indices_uw with the appropriate
          * reordering.
          */
         brw_MOV(p, destination_indices_uw,
                 brw_imm_v(key->pv_first ? 0x00010200    /* (0, 2, 1) */
                                         : 0x00020001)); /* (1, 0, 2) */
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }
      brw_ADD(p, c->reg.destination_indices,
              c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));

      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (vertex = 0; vertex < num_verts; ++vertex) {
         /* Set up the correct destination index for this vertex */
         brw_MOV(p, get_element_ud(c->reg.header, 5),
                 get_element_ud(c->reg.destination_indices, vertex));

         for (binding = 0; binding < key->num_transform_feedback_bindings;
              ++binding) {
            unsigned char varying =
               key->transform_feedback_bindings[binding];
            unsigned char slot = c->vue_map.varying_to_slot[varying];
            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
             *
             *   "Prior to End of Thread with a URB_WRITE, the kernel must
             *   ensure that all writes are complete by sending the final
             *   write as a committed write."
             */
            bool final_write =
               binding == key->num_transform_feedback_bindings - 1 &&
               vertex == num_verts - 1;
            struct brw_reg vertex_slot = c->reg.vertex[vertex];
            vertex_slot.nr += slot / 2;
            vertex_slot.subnr = (slot % 2) * 16;
            /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
            vertex_slot.dw1.bits.swizzle = varying == VARYING_SLOT_PSIZ
               ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
            brw_set_access_mode(p, BRW_ALIGN_16);
            brw_MOV(p, stride(c->reg.header, 4, 4, 1),
                    retype(vertex_slot, BRW_REGISTER_TYPE_UD));
            brw_set_access_mode(p, BRW_ALIGN_1);
            brw_svb_write(p,
                          final_write ? c->reg.temp : brw_null_reg(), /* dest */
                          1, /* msg_reg_nr */
                          c->reg.header, /* src0 */
                          SURF_INDEX_SOL_BINDING(binding), /* binding_table_index */
                          final_write); /* send_commit_msg */
         }
      }
      brw_ENDIF(p);

      /* Now, reinitialize the header register from R0 to restore the parts of
       * the register that we overwrote while streaming out transform feedback
       * data.
       */
      brw_gs_initialize_header(c);

      /* Finally, wait for the write commit to occur so that we can proceed to
       * other things safely.
       *
       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
       *
       *   The write commit does not modify the destination register, but
       *   merely clears the dependency associated with the destination
       *   register. Thus, a simple “mov” instruction using the register as a
       *   source is sufficient to wait for the write commit to occur.
       */
      brw_MOV(p, c->reg.temp, c->reg.temp);
   }

   brw_gs_ff_sync(c, 1);

   /* If RASTERIZER_DISCARD is enabled, we have nothing further to do, so
    * release the URB that was just allocated, and terminate the thread.
    */
   if (key->rasterizer_discard) {
      brw_gs_terminate(c);
      return;
   }

   brw_gs_overwrite_header_dw2_from_r0(c);
   switch (num_verts) {
   case 1:
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
      brw_gs_emit_vue(c, c->reg.vertex[0], true);
      break;
   case 2:
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], true);
      break;
   case 3:
      if (check_edge_flags) {
         /* Only emit vertices 0 and 1 if this is the first triangle of the
          * polygon.  Otherwise they are redundant.
          */
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
         brw_IF(p, BRW_EXECUTE_1);
      }
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
      brw_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], false);
      if (check_edge_flags) {
         brw_ENDIF(p);
         /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
          * of the polygon.  Otherwise leave the primitive incomplete because
          * there are more polygon vertices coming.
          */
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
      }
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      brw_gs_emit_vue(c, c->reg.vertex[2], true);
      break;
   }
}
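
/* A hedged sketch of how the brw_imm_v() constants used above encode
 * their vectors, assuming the V immediate type packs eight signed
 * 4-bit elements, one nibble per packed-word channel.  decode_imm_v()
 * is an illustrative helper, not a driver function.
 */
#include <stdint.h>

static void decode_imm_v(uint32_t imm, int elems[8])
{
   for (int i = 0; i < 8; i++) {
      int nibble = (imm >> (4 * i)) & 0xf;
      elems[i] = (nibble ^ 0x8) - 0x8;   /* sign-extend the 4-bit element */
   }
}

/* decode_imm_v(0x00020100, e) yields (0, 0, 1, 0, 2, 0, 0, 0): the
 * indices (0, 1, 2) with zeros interleaved to fill the upper half of
 * each destination double-word, matching the comment in the function
 * above.  Likewise 0x00010200 decodes to (0, 2, 1) and 0x00020001 to
 * (1, 0, 2).
 */
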
Example #28
/* Post-fragment-program processing.  Send the results to the
 * framebuffer.
 */
static void emit_fb_write( struct brw_wm_compile *c,
			   struct brw_reg *arg0,
			   struct brw_reg *arg1,
			   struct brw_reg *arg2,
			   GLuint target,
			   GLuint eot)
{
   struct brw_compile *p = &c->func;
   GLuint nr = 2;
   GLuint channel;

   /* Reserve a space for AA - may not be needed:
    */
   if (c->key.aa_dest_stencil_reg)
      nr += 1;

   /* I don't really understand how this achieves the color interleave
    * (i.e. RGBARGBA) in the result:  [Do the saturation here]
    */
   {
      brw_push_insn_state(p);
      
      for (channel = 0; channel < 4; channel++) {
	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */

	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	 brw_MOV(p,
		 brw_message_reg(nr + channel),
		 arg0[channel]);
       
	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
	 brw_MOV(p,
		 brw_message_reg(nr + channel + 4),
		 sechalf(arg0[channel]));
      }

      /* skip over the regs populated above:
       */
      nr += 8;
   
      brw_pop_insn_state(p);
   }

   if (c->key.source_depth_to_render_target)
   {
      if (c->key.computes_depth) 
	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
      else 
	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */

      nr += 2;
   }

   if (c->key.dest_depth_reg)
   {
      GLuint comp = c->key.dest_depth_reg / 2;
      GLuint off = c->key.dest_depth_reg % 2;

      if (off != 0) {
         brw_push_insn_state(p);
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
         /* 2nd half? */
         brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
         brw_pop_insn_state(p);
      }
      else {
         brw_MOV(p, brw_message_reg(nr), arg1[comp]);
      }
      nr += 2;
   }


   if (!c->key.runtime_check_aads_emit) {
      if (c->key.aa_dest_stencil_reg)
	 emit_aa(c, arg1, 2);

      fire_fb_write(c, 0, nr, target, eot);
   }
   else {
      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      struct brw_reg ip = brw_ip_reg();
      struct brw_instruction *jmp;
      
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
      brw_AND(p, 
	      v1_null_ud, 
	      get_element_ud(brw_vec8_grf(1,0), 6), 
	      brw_imm_ud(1<<26)); 

      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
      {
	 emit_aa(c, arg1, 2);
	 fire_fb_write(c, 0, nr, target, eot);
	 /* note - thread killed in subroutine */
      }
      brw_land_fwd_jump(p, jmp);

      /* ELSE: Shuffle up one register to fill in the hole left for AA:
       */
      fire_fb_write(c, 1, nr-1, target, eot);
   }
}
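
/* A hedged sketch of the message-register accounting performed by
 * emit_fb_write() above, assuming SIMD16 dispatch (each color channel
 * occupies two message registers, one per 8-pixel half) and a
 * two-register message header.  The helper is illustrative only; the
 * real payload is assembled directly into message registers as shown.
 */
#include <stdbool.h>

static unsigned fb_write_payload_regs(bool aa_dest_stencil,
                                      bool source_depth_to_rt,
                                      bool dest_depth)
{
   unsigned nr = 2;        /* message registers reserved for the header */

   if (aa_dest_stencil)
      nr += 1;             /* space reserved for the AA data */

   nr += 8;                /* RGBA, two registers per channel */

   if (source_depth_to_rt)
      nr += 2;             /* source depth */

   if (dest_depth)
      nr += 2;             /* destination depth */

   return nr;              /* total, as passed to fire_fb_write() */
}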