Example #1
void writeTriangleBuffer(Triangle* endTriangle)
{
	if (endTriangle != _currentTriangle) {
		int length = ( ((char*)endTriangle) - _currentTriangleBuffer + 127) & ~127;	// round DMA size up to a multiple of 128 bytes
		unsigned short endTriangleBase = (((char*)endTriangle) - ((char*)_currentTriangle)) + _currentTriangleOffset;
		vec_ushort8 v_new_end = spu_promote(endTriangleBase, 1);

		// calculate genuine next pointer ( rewind==0 -> next, rewind!=0 -> 0 )
		unsigned short next_pointer = spu_extract( spu_andc( v_new_end, _currentTriangleRewind ), 1 );
		_currentTriangle->next_triangle = next_pointer;
		
//		printf("current=0x%x, endTriBase=0x%x, next_pointer=0x%x\n", _currentTriangleOffset, endTriangleBase, next_pointer);

		// DMA the triangle data out
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(_currentTriangleBufferEA), mfc_ea2l(_currentTriangleBufferEA), length, 0, MFC_PUT_CMD);

		// update the information in the cache line
		_currentTriangleRewind = spu_splats(next_pointer);		// re-use this variable as we don't need it anymore
		char* dstart = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);
		spu_mfcdma64(dstart, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL, sizeof(short), 0, MFC_PUTB_CMD);

//		printf("writing from %x to %x:%x\n", dstart, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL);

		// finally invalidate the triangle info
		_currentTriangle = NULL;

		// and make sure the DMA completed
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}
}
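
/* A minimal sketch (not part of the original source): the same tag-group
 * idiom writeTriangleBuffer() ends with, generalized to wait on more than
 * one DMA tag at once.  The tag numbers are arbitrary; mfc_write_tag_mask
 * and mfc_read_tag_status_all come from <spu_mfcio.h>. */
static void waitOnTags(unsigned int tagA, unsigned int tagB)
{
	// select the tag groups of interest...
	mfc_write_tag_mask((1 << tagA) | (1 << tagB));
	// ...and stall until every transfer in those groups has completed
	mfc_read_tag_status_all();
}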
Example #2
/* Reserve an MFC DMA tag from __mfc_tag_table, a 32-bit bitmap in which
   a set bit marks a free tag.  Returns the tag number (0-31), or
   0xFFFFFFFF (MFC_TAG_INVALID) if every tag is already in use.  */
unsigned int
__mfc_tag_reserve (void)
{
  vector unsigned int mask = (vector unsigned int)
	{ 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector unsigned int count_zeros, is_valid;
  vector signed int count_neg;

  /* The number of leading zeros in the table is the index of the first
     free tag.  */
  count_zeros = spu_cntlz (__mfc_tag_table);
  count_neg = spu_sub (0, (vector signed int) count_zeros);

  /* Shift the top bit down to that index and clear it in the table,
     marking the tag as reserved.  */
  mask = spu_rlmask (mask, (vector signed int) count_neg);
  __mfc_tag_table = spu_andc (__mfc_tag_table, mask);

  /* count_zeros == 32 means no bit was set; substitute all-ones
     (the invalid-tag value) in that case.  */
  is_valid = spu_cmpeq (count_zeros, 32);
  count_zeros = spu_sel (count_zeros, is_valid, is_valid);

  return spu_extract (count_zeros, 0);
}
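
/* A hedged usage sketch (not from the original source): reserving a tag
   through the public wrapper, using it for a DMA get, and releasing it.
   mfc_tag_reserve(), mfc_tag_release() and MFC_TAG_INVALID come from
   <spu_mfcio.h>; the buffer and effective address are assumptions.  */
#include <spu_mfcio.h>

static int
fetch_block (void *ls_buf, unsigned long long ea, unsigned int size)
{
  unsigned int tag = mfc_tag_reserve ();	/* wraps __mfc_tag_reserve */
  if (tag == MFC_TAG_INVALID)
    return -1;					/* all 32 tags in use */

  mfc_get (ls_buf, ea, size, tag, 0, 0);	/* DMA main memory -> LS */
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();			/* wait for completion */

  mfc_tag_release (tag);
  return 0;
}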
Example #3
/**
 * Set up fragment shader inputs by evaluating the triangle's vertex
 * attribute coefficient info.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns quad Z values
 * \param fragInputs  returns fragment program inputs
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
   static const vector float deltaY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + deltaX;
   const vector float fragY = spu_splats(y) + deltaY;
   vector float fragW, wInv;
   uint i;

   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
   wInv = spu_re(fragW);  /* 1 / w */

   /* loop over fragment program inputs */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      uint attr = i + 1;
      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;

      /* constant term */
      vector float a0 = setup.coef[attr].a0;
      vector float r0 = splatx(a0);
      vector float r1 = splaty(a0);
      vector float r2 = splatz(a0);
      vector float r3 = splatw(a0);

      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
         /* linear term */
         vector float dadx = setup.coef[attr].dadx;
         vector float dady = setup.coef[attr].dady;
         /* Use SPU intrinsics here to get slightly better code.
          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
          */
         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
         if (interp == INTERP_PERSPECTIVE) {
            /* perspective term */
            r0 *= wInv;
            r1 *= wInv;
            r2 *= wInv;
            r3 *= wInv;
         }
      }
      fragInputs[CHAN0] = r0;
      fragInputs[CHAN1] = r1;
      fragInputs[CHAN2] = r2;
      fragInputs[CHAN3] = r3;
      fragInputs += 4;
   }
}
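
/**
 * The splatx/splaty/splatz/splatw helpers used above are not part of this
 * excerpt.  A minimal sketch of the assumed implementation: broadcast one
 * 32-bit element of a vector to all four slots with spu_shuffle.  The
 * other three helpers differ only in the byte indices (4-7, 8-11, 12-15).
 */
static INLINE vector float
splatx(vector float v)
{
   /* replicate bytes 0-3 (the x element) across the quadword */
   return spu_shuffle(v, v, (vector unsigned char) {
             0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3});
}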


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad( int x, int y, mask_t mask)
{
   /* If any bits in mask are set... */
   if (spu_extract(spu_orx(mask), 0)) {
      const int ix = x - setup.cliprect_minx;
      const int iy = y - setup.cliprect_miny;

      spu.cur_ctile_status = TILE_STATUS_DIRTY;
      spu.cur_ztile_status = TILE_STATUS_DIRTY;

      {
         /*
          * Run fragment shader, execute per-fragment ops, update fb/tile.
          */
         vector float inputs[4*4], outputs[2*4];
         vector unsigned int kill_mask;
         vector float fragZ;

         eval_inputs((float) x, (float) y, &fragZ, inputs);

         ASSERT(spu.fragment_program);
         ASSERT(spu.fragment_ops);

         /* Execute the current fragment program */
         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);

         mask = spu_andc(mask, kill_mask);

         /* Execute per-fragment/quad operations, including:
          * alpha test, z test, stencil test, blend and framebuffer writing.
          * Note that there are two different fragment operations functions
          * that can be called, one for front-facing fragments, and one
          * for back-facing fragments.  (Often the two are the same;
          * but in some cases, like two-sided stenciling, they can be
          * very different.)  So choose the correct function depending
          * on the calculated facing.
          */
         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                          fragZ,
                          outputs[0*4+0],
                          outputs[0*4+1],
                          outputs[0*4+2],
                          outputs[0*4+3],
                          mask);
      }
   }
}


/**
 * Given an X or Y coordinate, return the block/quad coordinate that it
 * belongs to.
 */
static INLINE int
block(int x)
{
   return x & ~1;
}


/**
 * Render a horizontal span of quads
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * Clear it or finish waiting for it, as needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */
   
   /* Setup for mask calculation */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */
      
      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
      
      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}
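
/**
 * SHUFFLE4() is not shown in this excerpt.  A hedged sketch of a macro
 * with the assumed semantics: each argument picks one 32-bit word, where
 * A-D select words 0-3 of the source and a literal 0 selects a zeroed
 * word (spu_shuffle control bytes of the form 0b10xxxxxx yield 0x00).
 */
#define SHUF_A  0x00, 0x01, 0x02, 0x03
#define SHUF_B  0x04, 0x05, 0x06, 0x07
#define SHUF_C  0x08, 0x09, 0x0A, 0x0B
#define SHUF_D  0x0C, 0x0D, 0x0E, 0x0F
#define SHUF_0  0x80, 0x80, 0x80, 0x80   /* pattern bytes producing 0x00 */
#define SHUFFLE4(a, b, c, d) \
   ((vector unsigned char) { SHUF_##a, SHUF_##b, SHUF_##c, SHUF_##d })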


#if DEBUG_VERTS
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "  Vertex: (%p)\n", v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "    %d: %f %f %f %f\n",  i, 
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
#endif /* DEBUG_VERTS */
Example #4
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
    /* Variables */
    vec_int4    exp, exp_bias;
    vec_uint4   no_underflow, overflow;
    vec_float4  mant_bf, inv_bf;
    vec_ullong2 exp_a, exp_b;
    vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
    vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
    vec_ullong2 nan;
    vec_uint4   a_exp, b_exp;
    vec_ullong2 a_mant_0, b_mant_0;
    vec_ullong2 a_exp_1s, b_exp_1s;
    vec_ullong2 sign_exp_mask;

    vec_double2 a, b;
    vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

    /* Constants */
    vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
    vec_uchar16 splat_hi = (vec_uchar16) {
        0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11
    };
    vec_uchar16 swap_32 = (vec_uchar16) {
        4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    };
    vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
    vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
    vec_float4  onef = spu_splats(1.0f);
    vec_double2 one = spu_splats(1.0);
    vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

    sign_exp_mask = spu_or(sign_mask, exp_mask);

    /* Extract the floating-point components, exponent and mantissa, from
     * each of the operands.
     */
    a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
    a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
    b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
    b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

    a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
    a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

    b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
    b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

    a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
    b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

    /* Identify all possible special values that must be accommodated including:
     * +-denorm, +-0, +-infinity, and NaNs.
     */
    a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0);
    a_nan    = spu_andc(a_exp_1s, a_mant_0);
    a_zero   = spu_and (a_denorm0, a_mant_0);
    a_inf    = spu_and (a_exp_1s, a_mant_0);
    a_denorm = spu_andc(a_denorm0, a_zero);

    b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0);
    b_nan    = spu_andc(b_exp_1s, b_mant_0);
    b_zero   = spu_and (b_denorm0, b_mant_0);
    b_inf    = spu_and (b_exp_1s, b_mant_0);
    b_denorm = spu_andc(b_denorm0, b_zero);

    /* Scale denorm inputs into normalized numbers by conditionally scaling the
     * input parameters.
     */
    a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
    a = spu_sel(a_in, a, a_denorm);

    b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
    b = spu_sel(b_in, b, b_denorm);

    /* Extract the divisor and dividend exponents and force the parameters into
     * the signed range [1.0,2.0) or (-2.0,-1.0].
     */
    exp_a = spu_and((vec_ullong2)a, exp_mask);
    exp_b = spu_and((vec_ullong2)b, exp_mask);

    mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
    mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

    /* Approximate the reciprocal of b in single precision, using the
     * single-precision reciprocal estimate followed by one single-precision
     * Newton-Raphson iteration.
     */
    mant_bf = spu_roundtf(mant_b);
    inv_bf = spu_re(mant_bf);
    inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

    /* Perform 2 more Newton-Raphson iterations in double precision. The
     * result (q1) is in the range (0.5, 2.0).
     */
    inv_b = spu_extend(inv_bf);
    inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
    q0 = spu_mul(mant_a, inv_b);
    q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

    /* Determine the exponent correction factor that must be applied
     * to q1 by taking into account the exponent of the normalized inputs
     * and the scale factors that were applied to normalize them.
     */
    exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
    exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

    /* Bias the quotient exponent depending on the sign of the exponent correction
     * factor so that a single multiplier will ensure the entire double precision
     * domain (including denorms) can be achieved.
     *
     *    exp 	       bias q1     adjust exp
     *   =====	       ========    ==========
     *   positive         2^+64         -64
     *   negative         2^-65         +65
     */
    exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
    exp = spu_sub(exp, exp_bias);

    q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

    /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce
     * the expected result. On overflow, clamp the multiplier to the maximum
     * non-infinite number in case the rounding mode is not round-to-nearest.
     */
    exp = spu_add(exp, 0x3FF);
    no_underflow = spu_cmpgt(exp, 0);
    overflow = spu_cmpgt(exp, 0x7FE);
    exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
    exp = spu_and(exp, (vec_int4)exp_mask);

    mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow);

    /* Handle special value conditions. These include:
     *
     * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
     *    results.
     * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
     * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
     */
    mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
    mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

    nan = spu_or(a_nan, b_nan);
    nan = spu_or(nan, spu_and(a_zero, b_zero));
    nan = spu_or(nan, spu_and(a_inf, b_inf));

    mult = spu_or(mult, (vec_double2)nan);

    /* Scale the final quotient */

    q2 = spu_mul(q1, mult);

    return (q2);
}
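
/* An illustrative usage sketch (not from the original source).  The name
 * __divv2df3 follows the libgcc convention for the software routine backing
 * vector-double division, so it would normally be reached through plain
 * operator syntax; here it is called directly.  */
#include <stdio.h>
#include <spu_intrinsics.h>

vector double __divv2df3 (vector double, vector double);	/* defined above */

int main (void)
{
    vec_double2 a = { 1.0, -3.5 };
    vec_double2 b = { 3.0,  0.5 };
    vec_double2 q = __divv2df3 (a, b);

    /* expected: 0.333333 -7 */
    printf ("%g %g\n", spu_extract (q, 0), spu_extract (q, 1));
    return 0;
}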
Example #5
/* Returns nonzero if both elements of x are positive infinity. */
int allposinf_double2( vec_double2 x )
{
   vec_ullong2 posinf = spu_andc( isinfd2 ( x ), signbitd2 ( x ) );
   return ( spu_extract(posinf,0) != 0 && spu_extract(posinf,1) != 0 );
}
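
/* isinfd2() and signbitd2() are SIMD-math-style helpers not shown in this
 * excerpt.  Hedged sketches of the assumed semantics, returning all-ones
 * in each 64-bit slot where the predicate holds:
 */
static vec_ullong2 signbitd2( vec_double2 x )
{
   /* splat each element's high word, then smear the sign bit across it */
   vec_uchar16 splat_hi = (vec_uchar16){ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
   vec_int4 hi = spu_shuffle( (vec_int4)x, (vec_int4)x, splat_hi );
   return (vec_ullong2)spu_rlmaska( hi, -31 );
}

static vec_ullong2 isinfd2( vec_double2 x )
{
   /* |x| must equal the infinity bit pattern in both 32-bit halves */
   vec_uint4 abs = spu_andc( (vec_uint4)x, (vec_uint4)spu_splats(0x8000000000000000ULL) );
   vec_uint4 eq  = spu_cmpeq( abs, (vec_uint4)spu_splats(0x7FF0000000000000ULL) );
   vec_uchar16 swap_32 = (vec_uchar16){ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
   return (vec_ullong2)spu_and( eq, spu_shuffle( eq, eq, swap_32 ) );
}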
Example #6
Triangle* getTriangleBuffer(Context* context)
{
	// if we've already allocated a triangle buffer (and we're in the same context)
	if (context == _currentTriangleContext && _currentTriangle)
		return _currentTriangle;

	// reset to the default values
	_currentTriangleContext	= context;
	_currentTriangle	= NULL;

	// read the current renderable cache line to ensure there is room for the triangle data
	// in the cache line buffer; we do this by comparing against all 16 cache line blocks
	// to make sure that extending the write pointer wouldn't clobber the data

	unsigned long long cache_ea = context->renderableCacheLine;
	if (cache_ea == 0)
		return NULL;
	char cachebuffer[128+127];
	RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 );

	// printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea));

	spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// extendvalid = ( read<=write && test<end ) || ( read>write && test<read )
	// extendvalid = ( read>write && read>test ) || ( read<=write && end>test )
	// simplifies to	extendvalid = selb(end, read, read>write) > test
	// or			extendvalid = selb(end>test, read>test, read>write)
	// rewind = next >= end
	// rewindvalid = read != 0
	// valid = extendvalid && (!rewind || rewindvalid)
	// 	 = extendvalid && (!rewind || !rewindinvalid)
	// 	 = extendvalid && !(rewind && rewindinvalid)
	// invalid = ! (extendvalid && !(rewind && rewindinvalid))
	//         = (!extendvalid || (rewind && rewindinvalid))

	vec_ushort8 v_writeptr		= spu_splats( cache->endTriangle );
	vec_ushort8 v_readptr0		= cache->chunkTriangle[0];
	vec_ushort8 v_readptr1		= cache->chunkTriangle[1];
	vec_ushort8 v_testptr		= spu_add(v_writeptr,   TRIANGLE_MAX_SIZE);
	vec_ushort8 v_nextptr		= spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE);
	vec_ushort8 v_endptr		= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);

	vec_ushort8 v_zero		= spu_splats( (unsigned short) 0 );
	vec_uchar16 v_merger		= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

	vec_ushort8 v_max0_test		= spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) );
	vec_ushort8 v_max1_test		= spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) );
	vec_ushort8 v_extend0_valid	= spu_cmpgt( v_max0_test, v_testptr );
	vec_ushort8 v_extend1_valid	= spu_cmpgt( v_max1_test, v_testptr );
	vec_ushort8 v_rewind0_invalid	= spu_cmpeq( v_readptr0, v_zero );
	vec_ushort8 v_rewind1_invalid	= spu_cmpeq( v_readptr1, v_zero );
	vec_ushort8 v_rewind8		= spu_cmpgt( v_nextptr, v_endptr );

	vec_uchar16 v_extend_valid	= (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger );
	vec_uchar16 v_rewind_invalid	= (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger );
	vec_uchar16 v_rewind		= (vec_uchar16) v_rewind8;

	vec_uchar16 v_valid_rhs		= spu_and( v_rewind_invalid, v_rewind );
	vec_uchar16 v_invalid		= spu_orc( v_valid_rhs, v_extend_valid );

	// check to see if the chunk is being processed
	vec_uint4 v_free = spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );
	vec_uint4   v_invalid_bits	= spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free );

	// if any of the bits are invalid, then no can do
	if ( spu_extract(v_invalid_bits, 0) ) {
		return NULL;
	}

	// fetch in the data before this triangle in the cache buffer
	unsigned int offset = cache->endTriangle;
	_currentTriangleBufferExtra = offset & 127;
	unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & ~127);
	if (_currentTriangleBufferExtra) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), 128, 0, MFC_GET_CMD);

		// ensure DMA did actually complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// final bit of initialisation
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
	_currentTriangleOffset = offset;
	_currentTriangleRewind = v_rewind8;
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache));
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea); 
	_currentTriangleBufferEA = trianglebuffer_ea; 

	// printf("Allocated new triangle buffer: %x\n", offset);

	// and return the buffer ready to go
	return _currentTriangle;
}
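
// A hedged sketch (not part of the original source) of the full atomic
// read-modify-write pattern that the GETLLAR read above is one half of:
// read the line with a reservation, modify it, then attempt the
// conditional PUTLLC store, retrying while the reservation was lost.
// The buffer must be 128-byte aligned; bit 0 of the atomic status
// channel reports PUTLLC failure.
static void atomicUpdateCacheLine(unsigned long long ea, char* buf128)
{
	unsigned int status;
	do {
		spu_mfcdma64(buf128, mfc_ea2h(ea), mfc_ea2l(ea), 128, 0, MFC_GETLLAR_CMD);
		spu_readch(MFC_RdAtomicStat);

		// ... modify the 128-byte line in buf128 here ...

		spu_mfcdma64(buf128, mfc_ea2h(ea), mfc_ea2l(ea), 128, 0, MFC_PUTLLC_CMD);
		status = spu_readch(MFC_RdAtomicStat) & 1;
	} while (status);	// nonzero = reservation lost, retry
}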
Example #7
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  vec_uint4 one = spu_splats(0xffffU);
  /* Scan the memory array a quadword at a time, skipping leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;

  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);

  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

    cmp = spu_cmpeq(cmp_c, 0);
  }

  /* Compute the location of the last occurrence of c before the
   * termination character.
   *
   * First mask off compare results following the first termination character.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit-reverse res_cmp for locating the last occurrence.
   */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
						VEC_LITERAL(vec_uchar16,
							    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (i.e., element 0 of res_cmp == 0), then
   * NULL is returned.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
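
/* VEC_LITERAL() is newlib's portability macro from <vec_literal.h>.  Under
 * GCC it reduces to a compound vector literal, roughly (a hedged sketch):
 */
#define VEC_LITERAL(_type, ...) ((_type) { __VA_ARGS__ })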