Example #1
/* Return an all-ones mask in each doubleword slot of x that holds exactly
 * negative zero (-0.0), i.e. the bit pattern 0x8000000000000000.
 */
vec_ullong2 cmpnegzerod2( vec_double2 x )
{
   vec_ullong2 cmp;
   /* shuffle patterns replicating the even (high) and odd (low) 32-bit word
    * of each doubleword across that doubleword
    */
   vec_uchar16 even = (vec_uchar16)(vec_uint4){ 0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b };
   vec_uchar16 odd = (vec_uchar16)(vec_uint4){ 0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f };
 
   /* compare 32-bit words against the -0.0 bit pattern, then require both
    * halves of each doubleword to have matched
    */
   cmp = (vec_ullong2)spu_cmpeq( (vec_int4)x, (vec_int4)spu_splats(0x8000000000000000ull) );
   cmp = spu_and( spu_shuffle( cmp, cmp, even ), spu_shuffle( cmp, cmp, odd ) );
 
   return cmp;
}
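
/* A minimal usage sketch (not part of the original): exercising cmpnegzerod2
 * on one -0.0 lane and one ordinary lane.  Assumes an SPU toolchain with
 * spu_intrinsics.h available; the printf is illustrative only.
 */
#include <spu_intrinsics.h>
#include <stdio.h>

int main(void)
{
   vec_double2 v = (vec_double2){ -0.0, 1.0 };
   vec_ullong2 m = cmpnegzerod2(v);

   /* expect ffffffffffffffff for lane 0 and 0 for lane 1 */
   printf("%llx %llx\n", spu_extract(m, 0), spu_extract(m, 1));
   return 0;
}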
Example #2
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
    vec_uchar16 next = cache->chunkNext;

    for (;;) {
        // next-of-next: using 'next' itself as the shuffle pattern looks up next[next[i]]
        vec_uchar16 nextnext = spu_shuffle(next, next, next);
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        vec_ushort8 firstblock0 = spu_cmpeq( cache->chunkStart[0], 0);
        vec_ushort8 firstblock1 = spu_cmpeq( cache->chunkStart[1], 0);
        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 first = (vec_uchar16) spu_shuffle( firstblock0, firstblock1, firstshuf );

        vec_ushort8 tri0 = cache->chunkTriangle[0];
        vec_ushort8 tri1 = cache->chunkTriangle[1];
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 ntri0 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 ntri1 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 trieq0 = spu_cmpeq( tri0, ntri0 );
        vec_ushort8 trieq1 = spu_cmpeq( tri1, ntri1 );

        vec_uchar16 trieq = (vec_uchar16) spu_shuffle( trieq0, trieq1, MERGE );
        vec_uchar16 combi = spu_orc(first, trieq);

        // a chunk can merge when neither it nor its successor is busy/free and
        // 'combi' is clear (triangles match, successor's start is non-zero)
        vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(next, nextnext), combi), 256-CHUNKNEXT_BUSY_BIT );

        vec_uint4 gather = spu_gather( canmerge );

        // index of the first mergeable chunk (spu_gather packs the 16 lane bits
        // into bits 16..31 of word 0)
        vec_uint4 mergeid = spu_sub( spu_cntlz( gather ), spu_promote((unsigned int)16, 0));

        if( !spu_extract(gather, 0) ) {
            return;
        }

        //	unsigned int firstchunk = spu_extract(mergeid, 0);
        //	unsigned int nextchunk = cache->chunkNextArray[firstchunk];
        vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(mergeid,13) );
        vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(v_chunkNext,13) );

        // cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk];
        next = spu_shuffle( (vec_uchar16) v_chunkNextNext, next, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) );

        // cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK;
        next = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), next, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) );

        // this is for debug use only, it's not really needed...
        // cache->chunkStartArray[nextchunk] = -1;
        cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1;

        cache->chunkNext = next;
    }
}
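
/* A scalar sketch (an assumption, not the original code) of what the SIMD
 * loop above computes: for each of the 16 chunks whose own and successor
 * 'next' entries are not busy/free, whose successor's start is non-zero, and
 * whose triangle matches the successor's, splice the successor out of the
 * list and mark it free.  Field and constant names follow the commented-out
 * scalar lines inside merge_cache_blocks above.
 */
static void merge_cache_blocks_scalar(RenderableCacheLine* cache)
{
    int merged;
    do {
        merged = 0;
        for (unsigned int i = 0; i < 16; i++) {
            unsigned int nexti = cache->chunkNextArray[i];
            if (nexti & CHUNKNEXT_BUSY_BIT)
                continue;                               /* chunk busy (or free) */
            unsigned int n = nexti & CHUNKNEXT_MASK;
            if (cache->chunkNextArray[n] & CHUNKNEXT_BUSY_BIT)
                continue;                               /* successor busy/free */
            if (cache->chunkStartArray[n] == 0)
                continue;                               /* successor starts at 0: not mergeable */
            if (cache->chunkTriangleArray[i] != cache->chunkTriangleArray[n])
                continue;                               /* different triangles */
            cache->chunkNextArray[i] = cache->chunkNextArray[n];
            cache->chunkNextArray[n] = CHUNKNEXT_FREE_BLOCK;
            cache->chunkStartArray[n] = -1;             /* debug aid, as above */
            merged = 1;
        }
    } while (merged);
}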
Example #3
int kernel(lwp_functions* pf,
	   void*             params,
	   void*             inout,
	   unsigned int      iter,
	   unsigned int      n)
{
  Ternary_params* p = (Ternary_params*)params;
  switch (p->cmd)
  {
    case AM:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
	*a = spu_mul(spu_add(*a, *b), *c);
      return 0;
    }
    case MA:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
	*a = spu_madd(*a, *b, *c);
      return 0;
    }
    case CAM:
    {
      static vector unsigned char lo = 
	(vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19,
				 4, 5, 6, 7, 20, 21, 22, 23};

      static vector unsigned char hi = 
	(vector unsigned char) { 8,  9, 10, 11, 24, 25, 26, 27,
				12, 13, 14, 15, 28, 29, 30, 31};

      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // (a + b) * c:
      // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length; ++i, a+=8, b+=8, c+=8)
      {
	vector float av = {*a, *(a+2), *(a+4), *(a+6)};              // a.r
	vector float bv = {*b, *(b+2), *(b+4), *(b+6)};              // b.r
	vector float cv = {*c, *(c+2), *(c+4), *(c+6)};              // c.r
	vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};          // a.i
	vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};          // b.i
	vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};          // c.i
	vector float trv = spu_add(av, bv); // a.r+b.r
	vector float tiv = spu_add(dv, ev); // a.i+b.i
	vector float sv = spu_mul(trv, cv); // (a.r+b.r)*c.r
	vector float tv = spu_mul(trv, fv); // (a.r+b.r)*c.i
	vector float real = spu_nmsub(tiv, fv, sv); // r.r
	vector float imag = spu_madd(tiv, cv, tv);  // r.i
	// interleave result
	*(vector float *)a = spu_shuffle(real, imag, lo);
	*(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case CMA:
    {
      static vector unsigned char lo = 
	(vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19,
				 4, 5, 6, 7, 20, 21, 22, 23};

      static vector unsigned char hi = 
	(vector unsigned char) { 8,  9, 10, 11, 24, 25, 26, 27,
				12, 13, 14, 15, 28, 29, 30, 31};

      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // a * b + c:
      // r.r = a.r*b.r + c.r - a.i*b.i
      // r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length; ++i, a+=8, b+=8, c+=8)
      {
	vector float av = {*a, *(a+2), *(a+4), *(a+6)};              // a.r
	vector float bv = {*b, *(b+2), *(b+4), *(b+6)};              // b.r
	vector float cv = {*c, *(c+2), *(c+4), *(c+6)};              // c.r
	vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};          // a.i
	vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};          // b.i
	vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};          // c.i
	vector float real = spu_nmsub(dv, ev, spu_madd(av, bv, cv)); // r.r
	vector float imag = spu_madd(dv, bv, spu_madd(av, ev, fv));  // r.i
	// interleave result
	*(vector float *)a = spu_shuffle(real, imag, lo);
	*(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case ZAM:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // (a + b) * c:
      // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length;
	   ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4)
      {
	vector float *av = (vector float *)a_re;
	vector float *bv = (vector float *)b_re;
	vector float *cv = (vector float *)c_re;
	vector float *dv = (vector float *)a_im;
	vector float *ev = (vector float *)b_im;
	vector float *fv = (vector float *)c_im;
	vector float trv = spu_add(*av, *bv); // a.r+b.r
	vector float tiv = spu_add(*dv, *ev); // a.i+b.i
	vector float sv = spu_mul(trv, *cv); // (a.r+b.r)*c.r
	vector float tv = spu_mul(trv, *fv); // (a.r+b.r)*c.i
	*av = spu_nmsub(tiv, *fv, sv); // r.r
        *dv = spu_madd(tiv, *cv, tv);  // r.i
      }
      return 0;
    }
    case ZMA:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // a * b + c:
      // r.r = a.r*b.r + c.r - a.i*b.i
      // r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length;
	   ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4)
      {
	vector float *av = (vector float *)a_re;
	vector float *bv = (vector float *)b_re;
	vector float *cv = (vector float *)c_re;
	vector float *dv = (vector float *)a_im;
	vector float *ev = (vector float *)b_im;
	vector float *fv = (vector float *)c_im;
	vector float tmp = spu_nmsub(*dv, *ev, spu_madd(*av, *bv, *cv));
	*dv = spu_madd(*dv, *bv, spu_madd(*av, *ev, *fv));
	*av = tmp;
      }
      return 0;
    }
  }
  return 1;
}
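
/* Scalar reference (a sketch, not part of the original) for the AM case
 * above, documenting the assumed inout layout: the buffer packs the three
 * operands back to back, |a|b|c|, each p->length floats, with p->length a
 * multiple of 4 so the vector loop covers the data exactly.  The result
 * overwrites a in place.
 */
static void am_reference(float* a, const float* b, const float* c, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = (a[i] + b[i]) * c[i];    /* a = (a + b) * c */
}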
Example #4
/**
 * Setup fragment shader inputs by evaluating triangle's vertex
 * attribute coefficient info.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns quad Z values
 * \param fragInputs  returns fragment program inputs
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
   static const vector float deltaY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + deltaX;
   const vector float fragY = spu_splats(y) + deltaY;
   vector float fragW, wInv;
   uint i;

   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
   wInv = spu_re(fragW);  /* 1 / w */

   /* loop over fragment program inputs */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      uint attr = i + 1;
      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;

      /* constant term */
      vector float a0 = setup.coef[attr].a0;
      vector float r0 = splatx(a0);
      vector float r1 = splaty(a0);
      vector float r2 = splatz(a0);
      vector float r3 = splatw(a0);

      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
         /* linear term */
         vector float dadx = setup.coef[attr].dadx;
         vector float dady = setup.coef[attr].dady;
         /* Use SPU intrinsics here to get slightly better code.
          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
          */
         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
         if (interp == INTERP_PERSPECTIVE) {
            /* perspective term */
            r0 *= wInv;
            r1 *= wInv;
            r2 *= wInv;
            r3 *= wInv;
         }
      }
      fragInputs[CHAN0] = r0;
      fragInputs[CHAN1] = r1;
      fragInputs[CHAN2] = r2;
      fragInputs[CHAN3] = r3;
      fragInputs += 4;
   }
}
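
/* Scalar restatement (a sketch) of the per-channel evaluation above for the
 * INTERP_LINEAR and INTERP_PERSPECTIVE modes: each attribute channel is a
 * plane equation a0 + x*dadx + y*dady, additionally multiplied by 1/w for
 * perspective interpolation (constant attributes just use a0 directly).
 */
static float
eval_channel(float a0, float dadx, float dady,
             float x, float y, float wInv, int perspective)
{
   float v = a0 + x * dadx + y * dady;
   return perspective ? v * wInv : v;
}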


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad( int x, int y, mask_t mask)
{
   /* If any bits in mask are set... */
   if (spu_extract(spu_orx(mask), 0)) {
      const int ix = x - setup.cliprect_minx;
      const int iy = y - setup.cliprect_miny;

      spu.cur_ctile_status = TILE_STATUS_DIRTY;
      spu.cur_ztile_status = TILE_STATUS_DIRTY;

      {
         /*
          * Run fragment shader, execute per-fragment ops, update fb/tile.
          */
         vector float inputs[4*4], outputs[2*4];
         vector unsigned int kill_mask;
         vector float fragZ;

         eval_inputs((float) x, (float) y, &fragZ, inputs);

         ASSERT(spu.fragment_program);
         ASSERT(spu.fragment_ops);

         /* Execute the current fragment program */
         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);

         mask = spu_andc(mask, kill_mask);

         /* Execute per-fragment/quad operations, including:
          * alpha test, z test, stencil test, blend and framebuffer writing.
          * Note that there are two different fragment operations functions
          * that can be called, one for front-facing fragments, and one
          * for back-facing fragments.  (Often the two are the same;
          * but in some cases, like two-sided stenciling, they can be
          * very different.)  So choose the correct function depending
          * on the calculated facing.
          */
         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                          fragZ,
                          outputs[0*4+0],
                          outputs[0*4+1],
                          outputs[0*4+2],
                          outputs[0*4+3],
                          mask);
      }
   }
}


/**
 * Given an X or Y coordinate, return the block/quad coordinate that it
 * belongs to.
 */
static INLINE int
block(int x)
{
   return x & ~1;
}


/**
 * Render a horizontal span of quads
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * clear or finish waiting if needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */
   
   /* Setup for mask calculation */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */
      
      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
      
      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}
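
/* Scalar sketch of the coverage test inside the loop above: pixel x on a
 * quad row with span [l, r) is inside when x >= l and x < r, which is
 * exactly what the spu_cmpgt/spu_nand/spu_and sequence computes per lane.
 */
static INLINE int
pixel_covered(int x, int l, int r)
{
   return !(l > x) && (r > x);   /* x >= l && x < r */
}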


#if DEBUG_VERTS
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "  Vertex: (%p)\n", v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "    %d: %f %f %f %f\n",  i, 
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
Example #5
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
    /* Variables */
    vec_int4    exp, exp_bias;
    vec_uint4   no_underflow, overflow;
    vec_float4  mant_bf, inv_bf;
    vec_ullong2 exp_a, exp_b;
    vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
    vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
    vec_ullong2 nan;
    vec_uint4   a_exp, b_exp;
    vec_ullong2 a_mant_0, b_mant_0;
    vec_ullong2 a_exp_1s, b_exp_1s;
    vec_ullong2 sign_exp_mask;

    vec_double2 a, b;
    vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

    /* Constants */
    vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
    vec_uchar16 splat_hi = (vec_uchar16) {
        0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11
    };
    vec_uchar16 swap_32 = (vec_uchar16) {
        4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    };
    vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
    vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
    vec_float4  onef = spu_splats(1.0f);
    vec_double2 one = spu_splats(1.0);
    vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

    sign_exp_mask = spu_or(sign_mask, exp_mask);

    /* Extract the floating point components from each of the operands including
     * exponent and mantissa.
     */
    a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
    a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
    b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
    b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

    a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
    a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

    b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
    b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

    a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
    b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

    /* Identify all possible special values that must be accommodated including:
     * +-denorm, +-0, +-infinity, and NaNs.
     */
    a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0);
    a_nan    = spu_andc(a_exp_1s, a_mant_0);
    a_zero   = spu_and (a_denorm0, a_mant_0);
    a_inf    = spu_and (a_exp_1s, a_mant_0);
    a_denorm = spu_andc(a_denorm0, a_zero);

    b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0);
    b_nan    = spu_andc(b_exp_1s, b_mant_0);
    b_zero   = spu_and (b_denorm0, b_mant_0);
    b_inf    = spu_and (b_exp_1s, b_mant_0);
    b_denorm = spu_andc(b_denorm0, b_zero);

    /* Scale denormal inputs into normalized numbers by conditionally scaling
     * the input parameters.
     */
    a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
    a = spu_sel(a_in, a, a_denorm);

    b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
    b = spu_sel(b_in, b, b_denorm);

    /* Extract the divisor and dividend exponents and force the parameters into
     * the signed range [1.0,2.0) or (-2.0,-1.0].
     */
    exp_a = spu_and((vec_ullong2)a, exp_mask);
    exp_b = spu_and((vec_ullong2)b, exp_mask);

    mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
    mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

    /* Approximate the single-precision reciprocal of b by using the
     * single-precision reciprocal estimate followed by one single-precision
     * iteration of Newton-Raphson.
     */
    mant_bf = spu_roundtf(mant_b);
    inv_bf = spu_re(mant_bf);
    inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

    /* Perform 2 more Newton-Raphson iterations in double precision. The
     * result (q1) is in the range (0.5, 2.0).
     */
    inv_b = spu_extend(inv_bf);
    inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
    q0 = spu_mul(mant_a, inv_b);
    q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

    /* Determine the exponent correction factor that must be applied
     * to q1 by taking into account the exponent of the normalized inputs
     * and the scale factors that were applied to normalize them.
     */
    exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
    exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

    /* Bias the quotient exponent depending on the sign of the exponent correction
     * factor so that a single multiplier will ensure the entire double precision
     * domain (including denorms) can be achieved.
     *
     *    exp          bias q1     adjust exp
     *   ==========    ========    ==========
     *   positive      2^+65       -65
     *   negative      2^-64       +64
     */
    exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
    exp = spu_sub(exp, exp_bias);

    q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

    /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce
     * the expected result. On overflow, clamp the multiplier to the maximum
     * non-infinite number in case the rounding mode is not round-to-nearest.
     */
    exp = spu_add(exp, 0x3FF);
    no_underflow = spu_cmpgt(exp, 0);
    overflow = spu_cmpgt(exp, 0x7FE);
    exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
    exp = spu_and(exp, (vec_int4)exp_mask);

    mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow);

    /* Handle special value conditions. These include:
     *
     * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
     *    results.
     * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
     * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
     */
    mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
    mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

    nan = spu_or(a_nan, b_nan);
    nan = spu_or(nan, spu_and(a_zero, b_zero));
    nan = spu_or(nan, spu_and(a_inf, b_inf));

    mult = spu_or(mult, (vec_double2)nan);

    /* Scale the final quotient */

    q2 = spu_mul(q1, mult);

    return (q2);
}
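
/* Minimal usage sketch (an assumption, not part of the original library):
 * __divv2df3 is normally invoked implicitly by the compiler for vector
 * double division, but it can also be called directly.  Assumes an SPU
 * toolchain with spu_intrinsics.h.
 */
#include <stdio.h>

int main(void)
{
    vec_double2 n = (vec_double2){ 1.0, -2.5 };
    vec_double2 d = (vec_double2){ 3.0,  0.5 };
    vec_double2 q = __divv2df3(n, d);

    /* expect roughly 0.333333 and -5 */
    printf("%g %g\n", spu_extract(q, 0), spu_extract(q, 1));
    return 0;
}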
Example #6
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // the "in front" block will be carved from freeChunk2; since
                    // freeChunk went unused here, reuse it as freeChunk2 because
                    // it's more likely to still be a valid free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep the whole block: update its info and mark it busy
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
#ifdef INFO
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on
                // cache line slots: we picked up a block with more than
                // NUMBER_OF_TILES_PER_CHUNK tiles and there was no slot to store
                // the pre-NUMBER_OF_TILES_PER_CHUNK tiles.  in this case, we
                // process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process
                // one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0
Example #7
Triangle* getTriangleBuffer(Context* context)
{
	// if we've already allocated a triangle buffer (and we're in the same context)
	if (context == _currentTriangleContext && _currentTriangle)
		return _currentTriangle;

	// reset the cached values
	_currentTriangleContext	= context;
	_currentTriangle	= NULL;

	// read the current renderable cache line to ensure there is room for the triangle data
	// in the cache line buffer; we do this by comparing against all 16 cache line blocks
	// to make sure that extending the write pointer wouldn't clobber the data

	unsigned long long cache_ea = context->renderableCacheLine;
	if (cache_ea == 0)
		return NULL;
	char cachebuffer[128+127];
	RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 );

	// printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea));

	spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// extendvalid = ( read<=write && test<end ) || ( read>write && test<read )
	// extendvalid = ( read>write && read>test ) || ( read<=write && end>test )
	// simplifies to	extendvalid = selb(end, read, read>write) > test
	// or			extendvalid = selb(end>test, read>test, read>write)
	// rewind = next >= end
	// rewindvalid = read != 0
	// valid = extendvalid && (!rewind || rewindvalid)
	// 	 = extendvalid && (!rewind || !rewindinvalid)
	// 	 = extendvalid && !(rewind && rewindinvalid)
	// invalid = ! (extendvalid && !(rewind && rewindinvalid))
	//         = (!extendvalid || (rewind && rewindinvalid))

	vec_ushort8 v_writeptr		= spu_splats( cache->endTriangle );
	vec_ushort8 v_readptr0		= cache->chunkTriangle[0];
	vec_ushort8 v_readptr1		= cache->chunkTriangle[1];
	vec_ushort8 v_testptr		= spu_add(v_writeptr,   TRIANGLE_MAX_SIZE);
	vec_ushort8 v_nextptr		= spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE);
	vec_ushort8 v_endptr		= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);

	vec_ushort8 v_zero		= spu_splats( (unsigned short) 0 );
	vec_uchar16 v_merger		= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

	vec_ushort8 v_max0_test		= spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) );
	vec_ushort8 v_max1_test		= spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) );
	vec_ushort8 v_extend0_valid	= spu_cmpgt( v_max0_test, v_testptr );
	vec_ushort8 v_extend1_valid	= spu_cmpgt( v_max1_test, v_testptr );
	vec_ushort8 v_rewind0_invalid	= spu_cmpeq( v_readptr0, v_zero );
	vec_ushort8 v_rewind1_invalid	= spu_cmpeq( v_readptr1, v_zero );
	vec_ushort8 v_rewind8		= spu_cmpgt( v_nextptr, v_endptr );

	vec_uchar16 v_extend_valid	= (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger );
	vec_uchar16 v_rewind_invalid	= (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger );
	vec_uchar16 v_rewind		= (vec_uchar16) v_rewind8;

	vec_uchar16 v_valid_rhs		= spu_and( v_rewind_invalid, v_rewind );
	vec_uchar16 v_invalid		= spu_orc( v_valid_rhs, v_extend_valid );

	// check to see if the chunk is being processed
	vec_uint4 v_free = spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );
	vec_uint4   v_invalid_bits	= spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free );

	// if any of the bits are invalid, then no can do
	if ( spu_extract(v_invalid_bits, 0) ) {
		return NULL;
	}

	// fetch in the data before this triangle in the cache buffer
	unsigned int offset = cache->endTriangle;
	_currentTriangleBufferExtra = offset & 127;
	unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & ~127);
	if (_currentTriangleBufferExtra) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), 128, 0, MFC_GET_CMD);

		// ensure DMA did actually complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// final bit of initialisation
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
	_currentTriangleOffset = offset;
	_currentTriangleRewind = v_rewind8;
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache));
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea); 
	_currentTriangleBufferEA = trianglebuffer_ea; 

	// printf("Allocated new triangle buffer: %x\n", offset);

	// and return the buffer ready to go
	return _currentTriangle;
}
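
/* Scalar restatement (a sketch) of the validity predicate derived in the
 * comment block inside getTriangleBuffer, evaluated per read pointer:
 * extending the write pointer is invalid when !extendvalid, or when a
 * rewind is required but the reader has not consumed anything (read == 0).
 */
static int extend_invalid(unsigned short read, unsigned short write,
                          unsigned short test, unsigned short end,
                          unsigned short next)
{
	int extendvalid   = (read > write) ? (read > test) : (end > test);
	int rewind        = next >= end;
	int rewindinvalid = (read == 0);
	return !extendvalid || (rewind && rewindinvalid);
}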
Example #8
int main (unsigned long long spe_id, 
          unsigned long long argp, 
          unsigned long long envp)
{
   unsigned int id;
   int i, j, bufindex;
   vector float temp[4];

   /* this is a set of 2 16K buffers */
   vector float buf[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128)));
   vector float out[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128)));

   vector unsigned char maskLeft = (vector unsigned char){0x00, 0x01, 0x02, 0x03,
                                                          0x10, 0x11, 0x12, 0x13,
                                                          0x04, 0x05, 0x06, 0x07,
                                                          0x14, 0x15, 0x16, 0x17};

   vector unsigned char maskRight = (vector unsigned char){0x08, 0x09, 0x0a, 0x0b,
                                                           0x18, 0x19, 0x1a, 0x1b,
                                                           0x0c, 0x0d, 0x0e, 0x0f,
                                                           0x1c, 0x1d, 0x1e, 0x1f};

   transpose_package_t package;

   /* location markers */
   unsigned long long dataaddr = 0;
   int rowid, blockid, blockaddr, blockstart, row;
   int opporowid, oppoblockaddr;

   /* read in package */
   mfc_get(&package, argp, sizeof(transpose_package_t), TAG, 0, 0);
   mfc_write_tag_mask(1<<TAG);
   mfc_read_tag_status_all();

   id = package.id;

   blockstart = id * (N / THREADCNT / BLOCK) * BLOCK * sizeof(float);

   /* For each Row set (64 rows in a row set)
    *     for each block
    *         for each row in a block
    *              read
    */
   for (rowid = 0; rowid < N; rowid += BLOCK)
   {
      /* read in prebuf */
      blockid = 0;

      blockaddr = blockstart + (blockid * sizeof(buf[0][0]));

      /* each rowset is 64 rows */
      for (row = rowid; row < rowid + BLOCK; row++)
      {
	 dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr;

	 mfc_get(
	    buf[blockid & 1][row % BLOCK],
	    dataaddr,
	    sizeof(buf[0][0]),
	    0,
	    0,
	    0);
      }

      /* each spu must walk 8 blocks per rowset */
      for (blockid = 1; blockid < (N / THREADCNT / BLOCK); blockid++)
      {
	 blockaddr = blockstart + (blockid * sizeof(buf[0][0]));

	 /* each rowset is 64 rows */
	 for (row = rowid; row < rowid + BLOCK; row++)
	 {
	    dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr;

	    mfc_get(
	       buf[blockid & 1][row % BLOCK],
	       dataaddr,
	       sizeof(buf[0][0]),
	       blockid & 1,
	       0,
	       0);
	 }

	 mfc_write_tag_mask(1 << (1 - (blockid & 1)));
	 mfc_read_tag_status_all();

	 bufindex = (blockid & 1) ? 0 : 1;

	 /* transpose the previous block */
         for (i = 0; i < BLOCK; i+= 4)
         {
            for (j = 0; j < BLOCK / 4; j++)
            {
               /* first phase */
               temp[0] = spu_shuffle(
                  buf[bufindex][i][j],
                  buf[bufindex][i+2][j],
                  maskLeft);
               temp[1] = spu_shuffle(
                  buf[bufindex][i][j],
                  buf[bufindex][i+2][j],
                  maskRight);
               temp[2] = spu_shuffle(
                  buf[bufindex][i+1][j],
                  buf[bufindex][i+3][j],
                  maskLeft);
               temp[3] = spu_shuffle(
                  buf[bufindex][i+1][j],
                  buf[bufindex][i+3][j],
                  maskRight);

               /* second phase */
               out[bufindex][j*4][i/4] =
                  spu_shuffle(temp[0], temp[2], maskLeft);
               out[bufindex][(j*4)+1][i/4] =
                  spu_shuffle(temp[0], temp[2], maskRight);
               out[bufindex][(j*4)+2][i/4] =
                  spu_shuffle(temp[1], temp[3], maskLeft);
               out[bufindex][(j*4)+3][i/4] =
                  spu_shuffle(temp[1], temp[3], maskRight);
            }
         }

	 /* calculating opposite location! */
	 oppoblockaddr = rowid * sizeof(float);

	 blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0]));
	 opporowid = blockaddr / sizeof(float);
	 
	 /* write the block back out -> to the opposite location! */
	 for (row = opporowid; row < opporowid + BLOCK; row++)
	 {
	    dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr;
	    
	    mfc_put(
	       out[1 - (blockid & 1)][row % BLOCK],
	       dataaddr,
	       sizeof(buf[0][0]),
	       1 - (blockid & 1),
	       0,
	       0);
	 }
      }

      /* handle final block in row */

      /* wait on tag 1, the tag used for the final block fetched in the loop
         above (this assumes an even number of blocks per row set) */
      mfc_write_tag_mask(2);
      mfc_read_tag_status_all();

      /* process remaining block (buf[0] if there is only one block per row set) */
      bufindex = (blockid == 1) ? 0 : 1;

      /* transpose the previous block */
      /* i indexes the row */
      for (i = 0; i < BLOCK; i+=4)
      {
         /* j indexes the column */
         for (j = 0; j < BLOCK / 4; j++)
         {
            /* first phase */
            temp[0] = spu_shuffle(
               buf[bufindex][i][j],
               buf[bufindex][i+2][j],
               maskLeft);
            temp[1] = spu_shuffle(
               buf[bufindex][i][j],
               buf[bufindex][i+2][j],
               maskRight);
            temp[2] = spu_shuffle(
               buf[bufindex][i+1][j],
               buf[bufindex][i+3][j],
               maskLeft);
            temp[3] = spu_shuffle(
               buf[bufindex][i+1][j],
               buf[bufindex][i+3][j],
               maskRight);

            /* second phase */
            out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft);
            out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight);
            out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft);
            out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight);
         }
      }

      /* calculating opposite for the previous block */
      blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0]));

      oppoblockaddr = rowid * sizeof(float);
      opporowid = blockaddr / sizeof(float);

      /* write the block back out -> to the opposite location! */
      for (row = opporowid; row < opporowid + BLOCK; row++)
      {
         dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr;

         mfc_put(
            out[bufindex][row % BLOCK],
            dataaddr,
            sizeof(buf[0][0]),
            1,
            0,
            0);
      }

      mfc_read_tag_status_all();
   }

   return 0;
}
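
/* Standalone sketch (not part of the original) of the two-phase 4x4
 * transpose used in main() above: four input rows become four output
 * columns via eight shuffles.  Assumes spu_intrinsics.h.
 */
#include <spu_intrinsics.h>

static void transpose4x4(const vector float in[4], vector float out[4])
{
   const vector unsigned char maskLeft = (vector unsigned char){0x00, 0x01, 0x02, 0x03,
                                                                0x10, 0x11, 0x12, 0x13,
                                                                0x04, 0x05, 0x06, 0x07,
                                                                0x14, 0x15, 0x16, 0x17};
   const vector unsigned char maskRight = (vector unsigned char){0x08, 0x09, 0x0a, 0x0b,
                                                                 0x18, 0x19, 0x1a, 0x1b,
                                                                 0x0c, 0x0d, 0x0e, 0x0f,
                                                                 0x1c, 0x1d, 0x1e, 0x1f};

   /* first phase: interleave rows 0/2 and 1/3 */
   vector float t0 = spu_shuffle(in[0], in[2], maskLeft);   /* r00 r20 r01 r21 */
   vector float t1 = spu_shuffle(in[0], in[2], maskRight);  /* r02 r22 r03 r23 */
   vector float t2 = spu_shuffle(in[1], in[3], maskLeft);   /* r10 r30 r11 r31 */
   vector float t3 = spu_shuffle(in[1], in[3], maskRight);  /* r12 r32 r13 r33 */

   /* second phase: interleave the intermediates into columns */
   out[0] = spu_shuffle(t0, t2, maskLeft);    /* r00 r10 r20 r30 */
   out[1] = spu_shuffle(t0, t2, maskRight);   /* r01 r11 r21 r31 */
   out[2] = spu_shuffle(t1, t3, maskLeft);    /* r02 r12 r22 r32 */
   out[3] = spu_shuffle(t1, t3, maskRight);   /* r03 r13 r23 r33 */
}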
Example #9
/*
 * NAME:	sha256->search()
 * DESCRIPTION:	try to find a nonce which satisfies a target hash
 */
int64_t sha256_search(const message_t M,
		      const hash_t target, const hash_t midstate,
		      uint32_t start_nonce, uint32_t range)
{
  uint32_t nonce, stop_nonce = start_nonce + range + (4 - (range % 4)) % 4;  /* round range up to a multiple of 4 */
# if !defined(UNROLL_SHA256)
  int t;
# endif
  vec_uint4 W0[3], a0, b0, c0, d0, e0, f0, g0, h0;
  vec_uint4 W[16], a, b, c, d, e, f, g, h, T1, T2;
  vec_uint4 borrow, solution;
  const vec_uchar16 reverse_endian = {
     3,  2,  1,  0,
     7,  6,  5,  4,
    11, 10,  9,  8,
    15, 14, 13, 12
  };

  /* precompute first three rounds */

  a = SPLAT(midstate.words[0]);
  b = SPLAT(midstate.words[1]);
  c = SPLAT(midstate.words[2]);
  d = SPLAT(midstate.words[3]);
  e = SPLAT(midstate.words[4]);
  f = SPLAT(midstate.words[5]);
  g = SPLAT(midstate.words[6]);
  h = SPLAT(midstate.words[7]);

# ifdef UNROLL_SHA256
  W[0] = SPLAT(M.words[0]); ROUND(0);
  W[1] = SPLAT(M.words[1]); ROUND(1);
  W[2] = SPLAT(M.words[2]); ROUND(2);
# else
  for (t = 0; t < 3; ++t) {
    W[t] = SPLAT(M.words[t]);
    ROUND(t);
  }
# endif

  W0[0] = W[0];
  W0[1] = W[1];
  W0[2] = W[2];

  a0 = a;
  b0 = b;
  c0 = c;
  d0 = d;
  e0 = e;
  f0 = f;
  g0 = g;
  h0 = h;

  /* do the search, four at a time */

  for (nonce = start_nonce; nonce != stop_nonce; nonce += 4) {
    W[0] = W0[0];
    W[1] = W0[1];
    W[2] = W0[2];

    a = a0;
    b = b0;
    c = c0;
    d = d0;
    e = e0;
    f = f0;
    g = g0;
    h = h0;

    /* t = 3 */
    W[3] = (vec_uint4) { nonce + 0, nonce + 1, nonce + 2, nonce + 3 };
    ROUND(3);

# ifdef UNROLL_SHA256
    W[ 4] = SPLAT(M.words[ 4]); ROUND( 4);
    W[ 5] = SPLAT(M.words[ 5]); ROUND( 5);
    W[ 6] = SPLAT(M.words[ 6]); ROUND( 6);
    W[ 7] = SPLAT(M.words[ 7]); ROUND( 7);

    W[ 8] = SPLAT(M.words[ 8]); ROUND( 8);
    W[ 9] = SPLAT(M.words[ 9]); ROUND( 9);
    W[10] = SPLAT(M.words[10]); ROUND(10);
    W[11] = SPLAT(M.words[11]); ROUND(11);
    W[12] = SPLAT(M.words[12]); ROUND(12);
    W[13] = SPLAT(M.words[13]); ROUND(13);
    W[14] = SPLAT(M.words[14]); ROUND(14);
    W[15] = SPLAT(M.words[15]); ROUND(15);
# else
    for (t = 4; t < 16; ++t) {
      W[t] = SPLAT(M.words[t]);
      ROUND(t);
    }
# endif

# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    W[60 % 16] = W(60); ROUND(60);
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 16; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    W[0] = ADD(a, midstate.words[0]);
    W[1] = ADD(b, midstate.words[1]);
    W[2] = ADD(c, midstate.words[2]);
    W[3] = ADD(d, midstate.words[3]);
    W[4] = ADD(e, midstate.words[4]);
    W[5] = ADD(f, midstate.words[5]);
    W[6] = ADD(g, midstate.words[6]);
    W[7] = ADD(h, midstate.words[7]);

    /* first SHA-256 complete */

    a = SPLAT(H0.words[0]);
    b = SPLAT(H0.words[1]);
    c = SPLAT(H0.words[2]);
    d = SPLAT(H0.words[3]);
    e = SPLAT(H0.words[4]);
    f = SPLAT(H0.words[5]);
    g = SPLAT(H0.words[6]);
    h = SPLAT(H0.words[7]);

    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);

    W[ 8] = SPLAT(0x80000000U); ROUND( 8);

# ifdef UNROLL_SHA256
    W[ 9] = SPLAT(0x00000000U); ROUND( 9);
    W[10] = SPLAT(0x00000000U); ROUND(10);
    W[11] = SPLAT(0x00000000U); ROUND(11);
    W[12] = SPLAT(0x00000000U); ROUND(12);
    W[13] = SPLAT(0x00000000U); ROUND(13);
    W[14] = SPLAT(0x00000000U); ROUND(14);
# else
    for (t = 9; t < 15; ++t) {
      W[t] = SPLAT(0U);
      ROUND(t);
    }
# endif

    W[15] = SPLAT(0x00000100U); ROUND(15);

# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    /* t = 60..63 delayed */
# else
    for (t = 16; t < 60; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    W[60 % 16] = W(60);
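    /* note: T1 names both a vector variable and a function-like macro;
     * the macro expands only when the name is followed by '(', so the
     * assignment below is well-formed C */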
    T1 = T1(60, e, f, g, h);

    T2 = ADD(ADD(d, T1), H0.words[7]);

    /* quick check: T2 holds the value the final hash word (H[7]) will
     * take; if no lane of it is zero, none of these nonces can be a
     * solution, so rounds 60..63 and the target compare are skipped */
    if (__builtin_expect(spu_extract(spu_gather(spu_cmpeq(T2, 0)), 0) == 0, 1))
      continue;

    /* we have something interesting; finish the SHA-256 */

    ROUND(60);

# ifdef UNROLL_SHA256
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 61; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    a = ADD(a, H0.words[0]);
    b = ADD(b, H0.words[1]);
    c = ADD(c, H0.words[2]);
    d = ADD(d, H0.words[3]);
    e = ADD(e, H0.words[4]);
    f = ADD(f, H0.words[5]);
    g = ADD(g, H0.words[6]);
    h = ADD(h, H0.words[7]);

    /* now do the full (reversed-endian) subtraction */

    borrow = spu_genb(SPLAT(target.words[7]),
		      spu_shuffle(a, a, reverse_endian));
    borrow = spu_genbx(SPLAT(target.words[6]),
		       spu_shuffle(b, b, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[5]),
		       spu_shuffle(c, c, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[4]),
		       spu_shuffle(d, d, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[3]),
		       spu_shuffle(e, e, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[2]),
		       spu_shuffle(f, f, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[1]),
		       spu_shuffle(g, g, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[0]),
		       spu_shuffle(h, h, reverse_endian), borrow);

    solution = spu_gather(borrow);

    if (__builtin_expect(spu_extract(solution, 0) == 0, 1))
      continue;

    /* we have a winner */

    return nonce + (spu_extract(spu_cntlz(solution), 0) - 28);
  }

  return -1;
}
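
/* A scalar sketch (an illustrative addition, not part of the original
 * miner) of the lane recovery in the return statement above: spu_gather
 * packs the least-significant bit of each of the four word lanes into
 * bits 3..0 of the preferred word, so lane 0 lands at bit position 28
 * counted from the MSB, and spu_cntlz(solution) - 28 therefore yields
 * the index of the first winning lane. */
static inline int first_winning_lane(unsigned int gathered)
{
  int lz = 0;
  if (gathered == 0)
    return -1;                     /* no lane satisfied the target */
  while (!(gathered & 0x80000000U)) {
    gathered <<= 1;
    ++lz;                          /* count leading zeros, as spu_cntlz does */
  }
  return lz - 28;                  /* lane 0..3; mirrors nonce + (cntlz - 28) */
}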
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  int i;
  volatile vector float *p_inv_mass_v;
  vector float force_v, inv_mass_v;
  vector float pos0, pos1, pos2, pos3;
  vector float vel0, vel1, vel2, vel3;
  vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3;
  vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; 
  force_v = ctx.force_v;

  // Compute the step in time for the block of particles, four
  // particles at a time.
  for (i=0; i<cnt; i+=4) {
    inv_mass_v = *p_inv_mass_v++;
    
    pos0 = pos[buffer][i+0];
    pos1 = pos[buffer][i+1];
    pos2 = pos[buffer][i+2];
    pos3 = pos[buffer][i+3];

    vel0 = vel[buffer][i+0];
    vel1 = vel[buffer][i+1];
    vel2 = vel[buffer][i+2];
    vel3 = vel[buffer][i+3];

    dt_inv_mass_v = spu_mul(dt_v, inv_mass_v);

    pos0 = spu_madd(vel0, dt_v, pos0);
    pos1 = spu_madd(vel1, dt_v, pos1);
    pos2 = spu_madd(vel2, dt_v, pos2);
    pos3 = spu_madd(vel3, dt_v, pos3);

    dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0);
    dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1);
    dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2);
    dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3);

    vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0);
    vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1);
    vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2);
    vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3);

    pos[buffer][i+0] = pos0;
    pos[buffer][i+1] = pos1;
    pos[buffer][i+2] = pos2;
    pos[buffer][i+3] = pos3;

    vel[buffer][i+0] = vel0;
    vel[buffer][i+1] = vel1;
    vel[buffer][i+2] = vel2;
    vel[buffer][i+3] = vel3;
  }
}
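
/* For reference, the per-particle arithmetic process_buffer() applies four
 * particles at a time, written out in scalar form (a sketch; the float4_ref
 * struct is an assumption for illustration, not the original's types):
 *
 *   pos += vel * dt                   -- spu_madd(vel, dt_v, pos)
 *   vel += force * (dt * inv_mass)    -- spu_madd(dt_inv_mass, force_v, vel)
 */
typedef struct { float x, y, z, w; } float4_ref;

static void step_particle_ref(float4_ref *pos, float4_ref *vel,
                              float inv_mass, float4_ref force, float dt)
{
  float k = dt * inv_mass;
  pos->x += vel->x * dt;  pos->y += vel->y * dt;
  pos->z += vel->z * dt;  pos->w += vel->w * dt;
  vel->x += force.x * k;  vel->y += force.y * k;
  vel->z += force.z * k;  vel->w += force.w * k;
}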


int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();
  
  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;

    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD);

    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on next loop iteration.
      // The first DMA is barriered so that we don't GET data before the previous iteration's
      // data is PUT.
      next_buffer = buffer^1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);
      
      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      
      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;		  
    }

    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
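
/* The tag-wait idiom used at every synchronization point in main(),
 * factored into a helper for readability (a sketch; the original simply
 * inlines these two lines at each wait): */
static inline void wait_on_tag(unsigned int tag)
{
  // Select the tag group of interest, then block until its DMAs complete.
  spu_writech(MFC_WrTagMask, 1 << tag);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
}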
Example #11
0
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  vec_uint4 one = spu_splats(0xffffU);
  /* Scan the memory array a quadword at a time. Skip leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;

  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);

  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

  }

  /* Compute the location of the last matching character before the
   * termination character.
   *
   * First, mask off compare results following the first termination character.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit-reverse res_cmp for locating the last occurrence.
   */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
						VEC_LITERAL(vec_uchar16,
							    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (i.e., element 0 of res_cmp == 0), then return
   * NULL.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
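
/* Scalar reference behavior the vectorized strrchr above must match (a
 * sketch for clarity; note that, per the C standard, c == 0 matches the
 * terminating null): */
static char *strrchr_ref(const char *s, int c)
{
  const char *last = NULL;
  do {
    if (*s == (char)c)
      last = s;                    /* remember the most recent match */
  } while (*s++);                  /* stop after examining the null byte */
  return (char *)last;
}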
void* libvector_pointwise_multiply_32fc_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
	//loop iterator i
	int i = 0;
	void* retval = target;


	//put the target and source addresses into qwords
	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};

	//create shuffle masks

	//shuffle mask building blocks:
	//all from the first vector
	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
	//all from the second vector
	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);

	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	sixteen_uchar = spu_splats((unsigned char)16);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);

	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));

	//alpha: first half of first, second half of second, break at (unsigned int)target%16
	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);

	//delta: first half of first, first half of second, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);

	//beta: first half of first, second half of second, break at num_bytes%16
	src_cmp = spu_splats((unsigned char)(num_bytes%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);

	qword src0_past;
	qword src0_present;
	qword src1_past;
	qword src1_present;
	qword tgt_past;
	qword tgt_present;

	qword in_temp0;
	qword in_temp1;
	qword out_temp0;
	qword out_temp1;


	src0_past = si_lqd((qword)address_counter_src0, 0);
	src1_past = si_lqd((qword)address_counter_src1, 0);
	tgt_past = si_lqd((qword)address_counter_tgt, 0);

	vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
	vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
													  0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
	vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
	vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
										 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};

	vector float prod0;
	qword shuf0;
	vector float prod1;
	vector float sign_change;
	qword summand0;
	qword summand1;
	vector float sum;


	for(i = 0; i < num_bytes/16; ++i) {

		src0_present = si_lqd((qword)address_counter_src0, 16);
		src1_present = si_lqd((qword)address_counter_src1, 16);
		tgt_present = si_lqd((qword)address_counter_tgt, 16);

		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);

		prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
		shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
		prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
		sign_change = spu_xor(prod0, (vector float)sign_changer);

		summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);

		summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);

		sum = spu_add((vector float)summand0, (vector float)summand1);


		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);

		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
		si_stqd(out_temp1, (qword)address_counter_tgt, 16);

		tgt_past = out_temp1;
		src0_past = src0_present;
		src1_past = src1_present;
		address_counter_src0 = spu_add(address_counter_src0, 16);
		address_counter_src1 = spu_add(address_counter_src1, 16);
		address_counter_tgt = spu_add(address_counter_tgt, 16);


	}

	src0_present = si_lqd((qword)address_counter_src0, 16);
	src1_present = si_lqd((qword)address_counter_src1, 16);
	tgt_present = si_lqd((qword)address_counter_tgt, 16);


	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);


	prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
	shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
	prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
	sign_change = spu_xor(prod0, (vector float)sign_changer);
	summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
	summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
	sum = spu_add((vector float)summand0, (vector float)summand1);

	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);

	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);

	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
	si_stqd(out_temp1, (qword)address_counter_tgt, 16);

	return retval;
}
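
/* Scalar reference of the complex pointwise product the unaligned SIMD
 * routine above computes (a sketch; interleaved re,im float pairs are
 * assumed, which is what the 32fc suffix denotes): */
static void pointwise_multiply_32fc_ref(float *tgt, const float *a,
                                        const float *b, unsigned int num_bytes)
{
	unsigned int i, n = num_bytes / 8;    /* 8 bytes per complex float */
	for (i = 0; i < n; ++i) {
		float ar = a[2*i], ai = a[2*i+1];
		float br = b[2*i], bi = b[2*i+1];
		tgt[2*i]   = ar*br - ai*bi;       /* real part */
		tgt[2*i+1] = ar*bi + ai*br;       /* imaginary part */
	}
}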
Example #13
0
void merge_buffers(){
  vector unsigned int cmp_v, cmp_v2;

  const vector signed int one_at_0 = {1,0,0,0};
  const vector signed int one_at_1 = {0,1,0,0};
  const vector signed int one_at_2 = {0,0,1,0};
  const vector signed int ones = {1,1,1,1};
  const vector signed int zeros = {0,0,0,0};

  const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31,
						   31,31,31,31,
						   31,31,31,31,
						   31,31,31,31};
  vector unsigned char rev_mask;
  const vector unsigned char rev_left = {12,13,14,15,
					 8,9,10,11,
					 4,5,6,7,
					 0,1,2,3};

  const vector unsigned char rev_right = {28,29,30,31,
					  24,25,26,27,
					  20,21,22,23,
					  16,17,18,19};
  vector signed int *out_head_idx;
  if(mcb[am].local[OUT] < 255){
    int parent_idx = mcb[am].local[OUT];
    int side = (mcb[am].id+1)&1;
    out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD];
  } else {
    out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD];
  }

  vector signed int *left_tail_idx = (vector signed int*) &md[am].idx[LEFT][TAIL];
  vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL];

  vector signed int size_v = {mcb[am].buffer_size[LEFT], mcb[am].buffer_size[RIGHT], mcb[am].buffer_size[OUT], 0};
  vector signed int avail_v = {num_in_buffer(LEFT), num_in_buffer(RIGHT), num_free_in_buffer(OUT), 1};
  vector signed int avail_before = { spu_extract(avail_v, 0), spu_extract(avail_v, 1), 0, 0 };
  vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) ); // avail = 0x0F if all avail_v > zeros

  vector signed int *left, *right, *out;
  left = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ];
  right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ];
  out = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ];

  #ifdef TRACE_TIME
    dec_val2 = spu_read_decrementer();
  #endif

  while(spu_extract(avail,0) == 0x0F){
    // cmp left and right to determine who gets eaten
    cmp_v = spu_cmpgt(*left,*right);
    cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask);
    // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3]

    *out = spu_sel(*left,*right,cmp_v);
    rev_mask = spu_sel(rev_right,rev_left,(vector unsigned char)cmp_v);
    *left = spu_shuffle(*left,*right,rev_mask);
    // data to be sorted is now in out and left, left in descending order

    sort_vectors(out,left);

    // update index of the used side
    if( spu_extract(cmp_v,0) ){
      // left[3] > right[3]
      *right_tail_idx = spu_add(*right_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_1);
      right++;
      // modulus hack
      cmp_v2 = spu_cmpeq(*right_tail_idx, size_v);
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){
	*right_tail_idx = zeros;
	right = (vector signed int*) &md[am].buffer[RIGHT][0];
      }
    } else {
      *right = *left;
      *left_tail_idx = spu_add(*left_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_0);
      left++;
      // modulus hack
      cmp_v2 = spu_cmpeq(*left_tail_idx, size_v);      
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){	
	*left_tail_idx = zeros;
	left = (vector signed int*) &md[am].buffer[LEFT][0];
      }
    }

    // update out head idx
    *out_head_idx = spu_add(*out_head_idx,ones);
    avail_v = spu_sub(avail_v, one_at_2);
    out++;
    // modulus hack
    cmp_v2 = spu_cmpeq(*out_head_idx, size_v);
    if( __builtin_expect(spu_extract(cmp_v2,0),0) ){
      out = (vector signed int*) &md[am].buffer[OUT][0];
      *out_head_idx = zeros;
    }

    // is there data still available?
    avail = spu_gather(spu_cmpgt(avail_v, zeros));
  }

  #ifdef TRACE_TIME
  merge_loop_ticks += -(spu_read_decrementer() - dec_val2);
  #endif

  // how much got produced?
  vector signed int consumed = spu_sub(avail_before, avail_v);
  int consumed_left = spu_extract(consumed, 0);
  int consumed_right = spu_extract(consumed, 1);

  if(consumed_left)
    update_tail(LEFT);

  if(consumed_right)
    update_tail(RIGHT);

  md[am].consumed[LEFT] += consumed_left;
  md[am].consumed[RIGHT] += consumed_right;
    
  if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT])
    md[am].depleted[LEFT] = 1;
  
  if(md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT])
    md[am].depleted[RIGHT] = 1;

  if(mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]){
    md[am].done = 1;
    --num_active_mergers;
  }
}
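
/* The "modulus hack" used three times above, in scalar form (a sketch):
 * instead of computing idx % size, the ring index is compared against the
 * buffer size and reset to zero on wrap; the SPU has no hardware integer
 * divide, so avoiding % here is a real saving: */
static inline int ring_advance(int idx, int size)
{
  ++idx;
  return (idx == size) ? 0 : idx;
}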