예제 #1
0
void brw_debug_compact_uncompact(struct intel_context *intel,
                                 struct brw_instruction *orig,
                                 struct brw_instruction *uncompacted)
{
   fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
           intel->gen);

   fprintf(stderr, "  before: ");
   brw_disasm(stderr, orig, intel->gen);

   fprintf(stderr, "  after:  ");
   brw_disasm(stderr, uncompacted, intel->gen);

   uint32_t *before_bits = (uint32_t *)orig;
   uint32_t *after_bits = (uint32_t *)uncompacted;
   printf("  changed bits:\n");
   for (int i = 0; i < 128; i++) {
      uint32_t before = before_bits[i / 32] & (1 << (i & 31));
      uint32_t after = after_bits[i / 32] & (1 << (i & 31));

      if (before != after) {
         printf("  bit %d, %s to %s\n", i,
                before ? "set" : "unset",
                after ? "set" : "unset");
      }
   }
}
예제 #2
0
static void wm_src_sample_argb(struct brw_compile *p)
{
	static const uint32_t fragment[][4] = {
#include "exa_wm_src_affine.g6b"
#include "exa_wm_src_sample_argb.g6b"
#include "exa_wm_write.g6b"
	};
	int n;

	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	brw_MOV(p,
		retype(brw_vec1_grf(0,2), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0));
	brw_pop_insn_state(p);

	brw_SAMPLE(p,
		   retype(vec16(brw_vec8_grf(14, 0)), BRW_REGISTER_TYPE_UW),
		   1,
		   retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
		   1, 0,
		   WRITEMASK_XYZW,
		   GEN5_SAMPLER_MESSAGE_SAMPLE,
		   8,
		   5,
		   true,
		   BRW_SAMPLER_SIMD_MODE_SIMD16);


	for (n = 0; n < p->nr_insn; n++) {
		brw_disasm(stdout, &p->store[n], 60);
	}

	printf("\n\n");
	for (n = 0; n < ARRAY_SIZE(fragment); n++) {
		brw_disasm(stdout,
			   (const struct brw_instruction *)&fragment[n][0],
			   60);
	}
}
예제 #3
0
void
brw_dump_compile(struct brw_compile *p, FILE *out, int start, int end)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   void *store = p->store;
   bool dump_hex = false;

   for (int offset = start; offset < end;) {
      struct brw_instruction *insn = store + offset;
      struct brw_instruction uncompacted;
      printf("0x%08x: ", offset);

      if (insn->header.cmpt_control) {
	 struct brw_compact_instruction *compacted = (void *)insn;
	 if (dump_hex) {
	    printf("0x%08x 0x%08x                       ",
		   ((uint32_t *)insn)[1],
		   ((uint32_t *)insn)[0]);
	 }

	 brw_uncompact_instruction(intel, &uncompacted, compacted);
	 insn = &uncompacted;
	 offset += 8;
      } else {
	 if (dump_hex) {
	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		   ((uint32_t *)insn)[3],
		   ((uint32_t *)insn)[2],
		   ((uint32_t *)insn)[1],
		   ((uint32_t *)insn)[0]);
	 }
	 offset += 16;
      }

      brw_disasm(stdout, insn, p->brw->intel.gen);
   }
}
예제 #4
0
파일: brw_gs.c 프로젝트: nikai3d/mesa
static void compile_gs_prog( struct brw_context *brw,
			     struct brw_gs_prog_key *key )
{
   struct intel_context *intel = &brw->intel;
   struct brw_gs_compile c;
   const GLuint *program;
   void *mem_ctx;
   GLuint program_size;

   /* Gen6: VF has already converted into polygon, and LINELOOP is
    * converted to LINESTRIP at the beginning of the 3D pipeline.
    */
   if (intel->gen >= 6)
      return;

   memset(&c, 0, sizeof(c));
   
   c.key = *key;
   /* Need to locate the two positions present in vertex + header.
    * These are currently hardcoded:
    */
   c.nr_attrs = brw_count_bits(c.key.attrs);

   if (intel->gen >= 5)
       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
   else
       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */

   c.nr_bytes = c.nr_regs * REG_SIZE;

   mem_ctx = NULL;
   
   /* Begin the compilation:
    */
   brw_init_compile(brw, &c.func, mem_ctx);

   c.func.single_program_flow = 1;

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.  
    */
   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);


   /* Note that primitives which don't require a GS program have
    * already been weeded out by this stage:
    */

   switch (key->primitive) {
   case GL_QUADS:
      brw_gs_quads( &c, key );
      break;
   case GL_QUAD_STRIP:
      brw_gs_quad_strip( &c, key );
      break;
   case GL_LINE_LOOP:
      brw_gs_lines( &c );
      break;
   default:
      ralloc_free(mem_ctx);
      return;
   }

   /* get the program
    */
   program = brw_get_program(&c.func, &program_size);

   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
      int i;

      printf("gs:\n");
      for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
		    intel->gen);
      printf("\n");
    }

   brw_upload_cache(&brw->cache, BRW_GS_PROG,
		    &c.key, sizeof(c.key),
		    program, program_size,
		    &c.prog_data, sizeof(c.prog_data),
		    &brw->gs.prog_offset, &brw->gs.prog_data);
   ralloc_free(mem_ctx);
}
예제 #5
0
파일: brw_gs.c 프로젝트: DirectFB/mesa
static void compile_ff_gs_prog(struct brw_context *brw,
                               struct brw_ff_gs_prog_key *key)
{
   struct brw_ff_gs_compile c;
   const GLuint *program;
   void *mem_ctx;
   GLuint program_size;

   memset(&c, 0, sizeof(c));

   c.key = *key;
   c.vue_map = brw->vs.prog_data->base.vue_map;
   c.nr_regs = (c.vue_map.num_slots + 1)/2;

   mem_ctx = ralloc_context(NULL);

   /* Begin the compilation:
    */
   brw_init_compile(brw, &c.func, mem_ctx);

   c.func.single_program_flow = 1;

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.
    */
   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);

   if (brw->gen >= 6) {
      unsigned num_verts;
      bool check_edge_flag;
      /* On Sandybridge, we use the GS for implementing transform feedback
       * (called "Stream Out" in the PRM).
       */
      switch (key->primitive) {
      case _3DPRIM_POINTLIST:
         num_verts = 1;
         check_edge_flag = false;
	 break;
      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         num_verts = 2;
         check_edge_flag = false;
	 break;
      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
	 num_verts = 3;
         check_edge_flag = false;
         break;
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         num_verts = 3;
         check_edge_flag = true;
         break;
      default:
	 assert(!"Unexpected primitive type in Gen6 SOL program.");
	 return;
      }
      gen6_sol_program(&c, key, num_verts, check_edge_flag);
   } else {
      /* On Gen4-5, we use the GS to decompose certain types of primitives.
       * Note that primitives which don't require a GS program have already
       * been weeded out by now.
       */
      switch (key->primitive) {
      case _3DPRIM_QUADLIST:
	 brw_ff_gs_quads( &c, key );
	 break;
      case _3DPRIM_QUADSTRIP:
	 brw_ff_gs_quad_strip( &c, key );
	 break;
      case _3DPRIM_LINELOOP:
	 brw_ff_gs_lines( &c );
	 break;
      default:
	 ralloc_free(mem_ctx);
	 return;
      }
   }

   /* get the program
    */
   program = brw_get_program(&c.func, &program_size);

   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
      int i;

      printf("gs:\n");
      for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
		    brw->gen);
      printf("\n");
    }

   brw_upload_cache(&brw->cache, BRW_FF_GS_PROG,
		    &c.key, sizeof(c.key),
		    program, program_size,
		    &c.prog_data, sizeof(c.prog_data),
		    &brw->ff_gs.prog_offset, &brw->ff_gs.prog_data);
   ralloc_free(mem_ctx);
}
예제 #6
0
파일: brw_sf.c 프로젝트: PatriceBlin/mesa
static void compile_sf_prog( struct brw_context *brw,
                             struct brw_sf_prog_key *key )
{
    struct brw_sf_compile c;
    const GLuint *program;
    void *mem_ctx;
    GLuint program_size;
    GLuint i;

    memset(&c, 0, sizeof(c));

    mem_ctx = ralloc_context(NULL);
    /* Begin the compilation:
     */
    brw_init_compile(brw, &c.func, mem_ctx);

    c.key = *key;
    c.vue_map = brw->vue_map_geom_out;
    if (c.key.do_point_coord) {
        /*
         * gl_PointCoord is a FS instead of VS builtin variable, thus it's
         * not included in c.vue_map generated in VS stage. Here we add
         * it manually to let SF shader generate the needed interpolation
         * coefficient for FS shader.
         */
        c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
        c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
    }
    c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
    c.nr_setup_regs = c.nr_attr_regs;

    c.prog_data.urb_read_length = c.nr_attr_regs;
    c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
    c.has_flat_shading = brw_any_flat_varyings(&key->interpolation_mode);

    /* Which primitive?  Or all three?
     */
    switch (key->primitive) {
    case SF_TRIANGLES:
        c.nr_verts = 3;
        brw_emit_tri_setup( &c, true );
        break;
    case SF_LINES:
        c.nr_verts = 2;
        brw_emit_line_setup( &c, true );
        break;
    case SF_POINTS:
        c.nr_verts = 1;
        if (key->do_point_sprite)
            brw_emit_point_sprite_setup( &c, true );
        else
            brw_emit_point_setup( &c, true );
        break;
    case SF_UNFILLED_TRIS:
        c.nr_verts = 3;
        brw_emit_anyprim_setup( &c );
        break;
    default:
        assert(0);
        return;
    }

    /* get the program
     */
    program = brw_get_program(&c.func, &program_size);

    if (unlikely(INTEL_DEBUG & DEBUG_SF)) {
        printf("sf:\n");
        for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
            brw_disasm(stdout, &((struct brw_instruction *)program)[i],
                       brw->gen);
        printf("\n");
    }

    brw_upload_cache(&brw->cache, BRW_SF_PROG,
                     &c.key, sizeof(c.key),
                     program, program_size,
                     &c.prog_data, sizeof(c.prog_data),
                     &brw->sf.prog_offset, &brw->sf.prog_data);
    ralloc_free(mem_ctx);
}
예제 #7
0
/* Emit the fragment program instructions here.
 */
void brw_wm_emit( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   GLuint insn;

   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   if (intel->gen >= 6)
	brw_set_acc_write_control(p, 1);

   /* Check if any of the payload regs need to be spilled:
    */
   spill_values(c, c->payload.depth, 4);
   spill_values(c, c->creg, c->nr_creg);
   spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
   

   for (insn = 0; insn < c->nr_insns; insn++) {

      struct brw_wm_instruction *inst = &c->instruction[insn];
      struct brw_reg args[3][4], dst[4];
      GLuint i, dst_flags;
      
      /* Get argument regs:
       */
      for (i = 0; i < 3; i++) 
	 get_argument_regs(c, inst->src[i], args[i]);

      /* Get dest regs:
       */
      for (i = 0; i < 4; i++)
	 if (inst->dst[i])
	    dst[i] = inst->dst[i]->hw_reg;
	 else
	    dst[i] = brw_null_reg();
      
      /* Flags
       */
      dst_flags = inst->writemask;
      if (inst->saturate) 
	 dst_flags |= SATURATE;

      switch (inst->opcode) {
	 /* Generated instructions for calculating triangle interpolants:
	  */
      case WM_PIXELXY:
	 emit_pixel_xy(c, dst, dst_flags);
	 break;

      case WM_DELTAXY:
	 emit_delta_xy(p, dst, dst_flags, args[0]);
	 break;

      case WM_WPOSXY:
	 emit_wpos_xy(c, dst, dst_flags, args[0]);
	 break;

      case WM_PIXELW:
	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_LINTERP:
	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_PINTERP:
	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case WM_CINTERP:
	 emit_cinterp(p, dst, dst_flags, args[0]);
	 break;

      case WM_FB_WRITE:
	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
	 break;

      case WM_FRONTFACING:
	 emit_frontfacing(p, dst, dst_flags);
	 break;

	 /* Straightforward arithmetic:
	  */
      case OPCODE_ADD:
	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_FRC:
	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
	 break;

      case OPCODE_FLR:
	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
	 break;

      case OPCODE_DDX:
	 emit_ddxy(p, dst, dst_flags, true, args[0]);
	 break;

      case OPCODE_DDY:
	 emit_ddxy(p, dst, dst_flags, false, args[0]);
	 break;

      case OPCODE_DP2:
	 emit_dp2(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DP3:
	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DP4:
	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DPH:
	 emit_dph(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_TRUNC:
	 for (i = 0; i < 4; i++) {
	    if (dst_flags & (1<<i)) {
	       brw_RNDZ(p, dst[i], args[0][i]);
	    }
	 }
	 break;

      case OPCODE_LRP:
	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAD:	
	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MOV:
      case OPCODE_SWZ:
	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_MUL:
	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_XPD:
	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Higher math functions:
	  */
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
	 break;

      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
	 break;

      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
	 break;

      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SCS:
	 /* There is an scs math function, but it would need some
	  * fixup for 16-element execution.
	  */
	 if (dst_flags & WRITEMASK_X)
	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 if (dst_flags & WRITEMASK_Y)
	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 break;

      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Comparisons:
	  */
      case OPCODE_CMP:
	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAX:
	 emit_max(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_MIN:
	 emit_min(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLT:
	 emit_slt(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLE:
	 emit_sle(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGT:
	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGE:
	 emit_sge(p, dst, dst_flags, args[0], args[1]);
	 break;
      case OPCODE_SEQ:
	 emit_seq(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SNE:
	 emit_sne(p, dst, dst_flags, args[0], args[1]);
	break;

      case OPCODE_SSG:
	 emit_sign(p, dst, dst_flags, args[0]);
	 break;

      case OPCODE_LIT:
	 emit_lit(c, dst, dst_flags, args[0]);
	 break;

	 /* Texturing operations:
	  */
      case OPCODE_TEX:
	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
		  inst->tex_idx, inst->tex_unit,
		  inst->tex_shadow);
	 break;

      case OPCODE_TXB:
	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
		  inst->tex_idx, inst->tex_unit);
	 break;

      case OPCODE_KIL:
	 emit_kil(c, args[0]);
	 break;

      default:
	 printf("Unsupported opcode %i (%s) in fragment shader\n",
		inst->opcode, inst->opcode < MAX_OPCODE ?
		_mesa_opcode_string(inst->opcode) :
		"unknown");
      }
      
      for (i = 0; i < 4; i++)
	if (inst->dst[i] && inst->dst[i]->spill_slot) 
	   emit_spill(c, 
		      inst->dst[i]->hw_reg, 
		      inst->dst[i]->spill_slot);
   }

   /* Only properly tested on ILK */
   if (p->brw->intel.gen == 5) {
     brw_remove_duplicate_mrf_moves(p);
     if (c->dispatch_width == 16)
	brw_remove_grf_to_mrf_moves(p);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      int i;

     printf("wm-native:\n");
     for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
      printf("\n");
   }
}
예제 #8
0
파일: brw_clip.c 프로젝트: CSRedRat/mesa-1
static void compile_clip_prog( struct brw_context *brw,
			     struct brw_clip_prog_key *key )
{
   struct brw_clip_compile c;
   const GLuint *program;
   void *mem_ctx;
   GLuint program_size;
   GLuint i;

   memset(&c, 0, sizeof(c));

   mem_ctx = ralloc_context(NULL);
   
   /* Begin the compilation:
    */
   brw_init_compile(brw, &c.func, mem_ctx);

   c.func.single_program_flow = 1;

   c.key = *key;
   c.vue_map = brw->vue_map_geom_out;

   c.has_flat_shading =
      brw_any_flat_varyings(&key->interpolation_mode);
   c.has_noperspective_shading =
      brw_any_noperspective_varyings(&key->interpolation_mode);

   /* nr_regs is the number of registers filled by reading data from the VUE.
    * This program accesses the entire VUE, so nr_regs needs to be the size of
    * the VUE (measured in pairs, since two slots are stored in each
    * register).
    */
   c.nr_regs = (c.vue_map.num_slots + 1)/2;

   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.  
    */
   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);


   /* Would ideally have the option of producing a program which could
    * do all three:
    */
   switch (key->primitive) {
   case GL_TRIANGLES: 
      if (key->do_unfilled)
	 brw_emit_unfilled_clip( &c );
      else
	 brw_emit_tri_clip( &c );
      break;
   case GL_LINES:
      brw_emit_line_clip( &c );
      break;
   case GL_POINTS:
      brw_emit_point_clip( &c );
      break;
   default:
      assert(0);
      return;
   }

	 

   /* get the program
    */
   program = brw_get_program(&c.func, &program_size);

   if (unlikely(INTEL_DEBUG & DEBUG_CLIP)) {
      printf("clip:\n");
      for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
		    brw->gen);
      printf("\n");
   }

   brw_upload_cache(&brw->cache,
		    BRW_CLIP_PROG,
		    &c.key, sizeof(c.key),
		    program, program_size,
		    &c.prog_data, sizeof(c.prog_data),
		    &brw->clip.prog_offset, &brw->clip.prog_data);
   ralloc_free(mem_ctx);
}
예제 #9
0
static void compile_clip_prog( struct brw_context *brw,
			     struct brw_clip_prog_key *key )
{
   struct intel_context *intel = &brw->intel;
   struct brw_clip_compile c;
   const GLuint *program;
   GLuint program_size;
   GLuint delta;
   GLuint i;
   GLuint header_regs;

   memset(&c, 0, sizeof(c));
   
   /* Begin the compilation:
    */
   brw_init_compile(brw, &c.func);

   c.func.single_program_flow = 1;

   c.key = *key;

   /* Need to locate the two positions present in vertex + header.
    * These are currently hardcoded:
    */
   c.header_position_offset = ATTR_SIZE;

   if (intel->gen == 5)
      header_regs = 3;
   else
      header_regs = 1;

   delta = header_regs * REG_SIZE;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c.key.attrs & BITFIELD64_BIT(i)) {
	 c.offset[i] = delta;
	 delta += ATTR_SIZE;

	 c.idx_to_attr[c.nr_attrs] = i;
	 c.nr_attrs++;
      }
   }

   /* The vertex attributes start at a URB row-aligned offset after
    * the 8-20 dword vertex header, and continue for a URB row-aligned
    * length.  nr_regs determines the urb_read_length from the start
    * of the header to the end of the vertex data.
    */
   c.nr_regs = header_regs + (c.nr_attrs + 1) / 2;

   c.nr_bytes = c.nr_regs * REG_SIZE;

   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.  
    */
   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);


   /* Would ideally have the option of producing a program which could
    * do all three:
    */
   switch (key->primitive) {
   case GL_TRIANGLES: 
      if (key->do_unfilled)
	 brw_emit_unfilled_clip( &c );
      else
	 brw_emit_tri_clip( &c );
      break;
   case GL_LINES:
      brw_emit_line_clip( &c );
      break;
   case GL_POINTS:
      brw_emit_point_clip( &c );
      break;
   default:
      assert(0);
      return;
   }

	 

   /* get the program
    */
   program = brw_get_program(&c.func, &program_size);

    if (INTEL_DEBUG & DEBUG_CLIP) {
      printf("clip:\n");
      for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
		    intel->gen);
      printf("\n");
    }

   /* Upload
    */
   drm_intel_bo_unreference(brw->clip.prog_bo);
   brw->clip.prog_bo = brw_upload_cache_with_auxdata(&brw->cache,
						     BRW_CLIP_PROG,
						     &c.key, sizeof(c.key),
						     NULL, 0,
						     program, program_size,
						     &c.prog_data,
						     sizeof(c.prog_data),
						     &brw->clip.prog_data);
}