示例#1
0
static inline vec_float4 vec_dot3( vec_float4 vec0, vec_float4 vec1 )
{
    vec_float4 result;
    result = spu_mul( vec0, vec1 );
    result = spu_madd( spu_rlqwbyte( vec0, 4 ), spu_rlqwbyte( vec1, 4 ), result );
    return spu_madd( spu_rlqwbyte( vec0, 8 ), spu_rlqwbyte( vec1, 8 ), result );
}
示例#2
0
/**
 * Setup fragment shader inputs by evaluating triangle's vertex
 * attribute coefficient info.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns quad Z values
 * \param fragInputs  returns fragment program inputs
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
   static const vector float deltaY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + deltaX;
   const vector float fragY = spu_splats(y) + deltaY;
   vector float fragW, wInv;
   uint i;

   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
   wInv = spu_re(fragW);  /* 1 / w */

   /* loop over fragment program inputs */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      uint attr = i + 1;
      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;

      /* constant term */
      vector float a0 = setup.coef[attr].a0;
      vector float r0 = splatx(a0);
      vector float r1 = splaty(a0);
      vector float r2 = splatz(a0);
      vector float r3 = splatw(a0);

      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
         /* linear term */
         vector float dadx = setup.coef[attr].dadx;
         vector float dady = setup.coef[attr].dady;
         /* Use SPU intrinsics here to get slightly better code.
          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
          */
         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
         if (interp == INTERP_PERSPECTIVE) {
            /* perspective term */
            r0 *= wInv;
            r1 *= wInv;
            r2 *= wInv;
            r3 *= wInv;
         }
      }
      fragInputs[CHAN0] = r0;
      fragInputs[CHAN1] = r1;
      fragInputs[CHAN2] = r2;
      fragInputs[CHAN3] = r3;
      fragInputs += 4;
   }
}


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad( int x, int y, mask_t mask)
{
   /* If any bits in mask are set... */
   if (spu_extract(spu_orx(mask), 0)) {
      const int ix = x - setup.cliprect_minx;
      const int iy = y - setup.cliprect_miny;

      spu.cur_ctile_status = TILE_STATUS_DIRTY;
      spu.cur_ztile_status = TILE_STATUS_DIRTY;

      {
         /*
          * Run fragment shader, execute per-fragment ops, update fb/tile.
          */
         vector float inputs[4*4], outputs[2*4];
         vector unsigned int kill_mask;
         vector float fragZ;

         eval_inputs((float) x, (float) y, &fragZ, inputs);

         ASSERT(spu.fragment_program);
         ASSERT(spu.fragment_ops);

         /* Execute the current fragment program */
         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);

         mask = spu_andc(mask, kill_mask);

         /* Execute per-fragment/quad operations, including:
          * alpha test, z test, stencil test, blend and framebuffer writing.
          * Note that there are two different fragment operations functions
          * that can be called, one for front-facing fragments, and one
          * for back-facing fragments.  (Often the two are the same;
          * but in some cases, like two-sided stenciling, they can be
          * very different.)  So choose the correct function depending
          * on the calculated facing.
          */
         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                          fragZ,
                          outputs[0*4+0],
                          outputs[0*4+1],
                          outputs[0*4+2],
                          outputs[0*4+3],
                          mask);
      }
   }
}


/**
 * Given an X or Y coordinate, return the block/quad coordinate that it
 * belongs to.
 */
static INLINE int
block(int x)
{
   return x & ~1;
}


/**
 * Render a horizontal span of quads
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * clear or finish waiting if needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */
   
   /* Setup for mask calculation */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */
      
      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
      
      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}


#if DEBUG_VERTS
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "  Vertex: (%p)\n", v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "    %d: %f %f %f %f\n",  i, 
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
void* libvector_pointwise_multiply_32fc_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
	//loop iterator i
	int i = 0;
	void* retval = target;


	//put the target and source addresses into qwords
	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};

	//create shuffle masks

	//shuffle mask building blocks:
	//all from the first vector
	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
	//all from the second vector
	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};



	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);

	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	sixteen_uchar = spu_splats((unsigned char)16);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);





	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));

	//alpha: first half of first, second half of second, break at (unsigned int)target%16
	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);

	//delta: first half of first, first half of second, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);

	//beta: first half of first, second half of second, break at num_bytes%16
	src_cmp = spu_splats((unsigned char)(num_bytes%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);






	qword src0_past;
	qword src0_present;
	qword src1_past;
	qword src1_present;
	qword tgt_past;
	qword tgt_present;

	qword in_temp0;
	qword in_temp1;
	qword out_temp0;
	qword out_temp1;


	src0_past = si_lqd((qword)address_counter_src0, 0);
	src1_past = si_lqd((qword)address_counter_src1, 0);
	tgt_past = si_lqd((qword)address_counter_tgt, 0);

	vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
	vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
													  0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
	vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
	vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
										 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};

	vector float prod0;
	qword shuf0;
	vector float prod1;
	vector float sign_change;
	qword summand0;
	qword summand1;
	vector float sum;


	for(i = 0; i < num_bytes/16; ++i) {

		src0_present = si_lqd((qword)address_counter_src0, 16);
		src1_present = si_lqd((qword)address_counter_src1, 16);
		tgt_present = si_lqd((qword)address_counter_tgt, 16);

		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);

		prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
		shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
		prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
		sign_change = spu_xor(prod0, (vector float)sign_changer);

		summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);

		summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);

		sum = spu_add((vector float)summand0, (vector float)summand1);


		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);

		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
		si_stqd(out_temp1, (qword)address_counter_tgt, 16);

		tgt_past = out_temp1;
		src0_past = src0_present;
		src1_past = src1_present;
		address_counter_src0 = spu_add(address_counter_src0, 16);
		address_counter_src1 = spu_add(address_counter_src1, 16);
		address_counter_tgt = spu_add(address_counter_tgt, 16);


	}

	src0_present = si_lqd((qword)address_counter_src0, 16);
	src1_present = si_lqd((qword)address_counter_src1, 16);
	tgt_present = si_lqd((qword)address_counter_tgt, 16);


	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);


	prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
	shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
	prod1 = spu_mul(prod0, (vector float)shuf0);
	sign_change = spu_xor(prod0, (vector float)sign_changer);
	summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
	summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
	sum = spu_add((vector float)summand0, (vector float)summand1);



	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);



	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);

	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
	si_stqd(out_temp1, (qword)address_counter_tgt, 16);

	return retval;
}