static inline vec_float4 vec_dot3(vec_float4 vec0, vec_float4 vec1)
{
  vec_float4 result;

  result = spu_mul(vec0, vec1);
  result = spu_madd(spu_rlqwbyte(vec0, 4), spu_rlqwbyte(vec1, 4), result);
  return spu_madd(spu_rlqwbyte(vec0, 8), spu_rlqwbyte(vec1, 8), result);
}
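The quadword byte rotations align the y and z components under the x slot, so element 0 of the result holds the 3-component dot product. A hypothetical scalar reference (not part of the original source), for what the preferred slot computes:

#include <stdio.h>

/* Scalar reference for vec_dot3: element 0 of the SIMD result equals
 * x0*x1 + y0*y1 + z0*z1 (the w components never contribute to slot 0). */
static float dot3_ref(const float v0[4], const float v1[4])
{
    return v0[0] * v1[0] + v0[1] * v1[1] + v0[2] * v1[2];
}

int main(void)
{
    const float a[4] = {1.0f, 2.0f, 3.0f, 99.0f};
    const float b[4] = {4.0f, 5.0f, 6.0f, 99.0f};
    printf("%f\n", dot3_ref(a, b)); /* prints 32.0 */
    return 0;
}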
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id;
  vector float dt_v, dt_inv_mass_v;

  // Reserve a tag ID
  tag_id = mfc_tag_reserve();
  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle parameter context.
  // Fetch the context, waiting for it to complete.
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context),
               tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time = 0; time < END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i = 0; i < ctx.particles; i += PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity and inverse_mass. Wait for the DMA
      // to complete before performing computation.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_GETB_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass + i),
                   cnt * sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j = 0; j < cnt; j++) {
        pos[j] = spu_madd(vel[j], dt_v, pos[j]);
        dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
        vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into system memory
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
    }
  }

  // Wait for final DMAs to complete before terminating SPU thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  return (0);
}
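The snippet references a parm_context structure and local-store buffers (ctx, pos, vel, inv_mass) defined elsewhere. A plausible layout, inferred purely from how the fields are accessed above and assumed here for illustration:

/* Assumed context and buffer declarations; the real definitions live in a
 * shared header that this snippet does not show. Field names come from the
 * accesses above, the ordering and alignment are guesses. */
typedef struct {
    vector float force_v;   /* constant force applied to every particle */
    vector float *pos_v;    /* effective address of position vectors */
    vector float *vel_v;    /* effective address of velocity vectors */
    float *inv_mass;        /* effective address of inverse masses */
    float dt;               /* simulation time step */
    int particles;          /* total particle count */
} parm_context;

static parm_context ctx;
/* One local-store vector per particle; DMA targets should be at least
 * 16-byte aligned (128 preferred for performance). */
static volatile vector float pos[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile vector float vel[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile float inv_mass[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));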
void triad()
{
    int i, j, n;
    vector float s = spu_splats(args.scalar);

    n = SIZE * sizeof(float);

    for (i = 0; (i + SIZE) < args.N; i += SIZE) {
        mfc_get((void *)&ls1[0], (unsigned int)&args.b[i], n, TAG, 0, 0);
        mfc_get((void *)&ls2[0], (unsigned int)&args.c[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();

        for (j = 0; j < (SIZE / 4); ++j)
            ls3[j] = spu_madd(s, ls2[j], ls1[j]);

        mfc_put((void *)&ls3[0], (unsigned int)&args.a[i], n, TAG, 0, 0);
    }
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();

    if (unlikely(i < args.N)) {
        /*
         * args.N - i will be smaller than SIZE at this point, so
         * it is safe to do a DMA transfer.
         * The size is rounded down to a multiple of 128 bytes, which
         * also satisfies the DMA requirement of a multiple of 16.
         */
        n = ((args.N - i) * sizeof(float)) & (~127);

        mfc_get((void *)&ls1[0], (unsigned int)&args.b[i], n, TAG, 0, 0);
        mfc_get((void *)&ls2[0], (unsigned int)&args.c[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();

        /* n is a multiple of 16 bytes, so whole vectors are processed. */
        for (j = 0; j < ((args.N - i) / 4); ++j)
            ls3[j] = spu_madd(s, ls2[j], ls1[j]);

        mfc_put((void *)&ls3[0], (unsigned int)&args.a[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();
    }
    /*
     * At this point it may be that i is still smaller than args.N if the length
     * was not divisible by the number of SPUs times 16.
     */
}
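The operation being blocked and vectorized here is the STREAM triad; a plain scalar version, added for reference only:

/* Scalar reference for the triad kernel: a[i] = b[i] + scalar * c[i].
 * The SPU loop computes the same thing four floats at a time with
 * spu_madd(s, ls2[j], ls1[j]). */
void triad_ref(float *a, const float *b, const float *c,
               float scalar, unsigned long n)
{
    unsigned long i;
    for (i = 0; i < n; ++i)
        a[i] = scalar * c[i] + b[i];
}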
int kernel(lwp_functions* pf, void* params, void* inout,
           unsigned int iter, unsigned int n)
{
  Ternary_params* p = (Ternary_params*)params;

  switch (p->cmd)
  {
    case AM:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
        *a = spu_mul(spu_add(*a, *b), *c);
      return 0;
    }
    case MA:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
        *a = spu_madd(*a, *b, *c);
      return 0;
    }
    case CAM:
    {
      static vector unsigned char lo = (vector unsigned char)
        { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
      static vector unsigned char hi = (vector unsigned char)
        { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // (a + b) * c:
      //   r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      //   r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length; ++i, a += 8, b += 8, c += 8)
      {
        vector float av = {*a, *(a+2), *(a+4), *(a+6)};       // a.r
        vector float bv = {*b, *(b+2), *(b+4), *(b+6)};       // b.r
        vector float cv = {*c, *(c+2), *(c+4), *(c+6)};       // c.r
        vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};   // a.i
        vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};   // b.i
        vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};   // c.i
        vector float trv = spu_add(av, bv);                   // a.r+b.r
        vector float tiv = spu_add(dv, ev);                   // a.i+b.i
        vector float sv = spu_mul(trv, cv);                   // (a.r+b.r)*c.r
        vector float tv = spu_mul(trv, fv);                   // (a.r+b.r)*c.i
        vector float real = spu_nmsub(tiv, fv, sv);           // r.r
        vector float imag = spu_madd(tiv, cv, tv);            // r.i
        // interleave result
        *(vector float *)a = spu_shuffle(real, imag, lo);
        *(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case CMA:
    {
      static vector unsigned char lo = (vector unsigned char)
        { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
      static vector unsigned char hi = (vector unsigned char)
        { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // a * b + c:
      //   r.r = a.r*b.r + c.r - a.i*b.i
      //   r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length; ++i, a += 8, b += 8, c += 8)
      {
        vector float av = {*a, *(a+2), *(a+4), *(a+6)};       // a.r
        vector float bv = {*b, *(b+2), *(b+4), *(b+6)};       // b.r
        vector float cv = {*c, *(c+2), *(c+4), *(c+6)};       // c.r
        vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};   // a.i
        vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};   // b.i
        vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};   // c.i
        vector float real = spu_nmsub(dv, ev, spu_madd(av, bv, cv)); // r.r
        vector float imag = spu_madd(dv, bv, spu_madd(av, ev, fv));  // r.i
        // interleave result
        *(vector float *)a = spu_shuffle(real, imag, lo);
        *(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case ZAM:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // (a + b) * c:
      //   r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      //   r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length;
           ++i, a_re += 4, b_re += 4, c_re += 4, a_im += 4, b_im += 4, c_im += 4)
      {
        vector float *av = (vector float *)a_re;
        vector float *bv = (vector float *)b_re;
        vector float *cv = (vector float *)c_re;
        vector float *dv = (vector float *)a_im;
        vector float *ev = (vector float *)b_im;
        vector float *fv = (vector float *)c_im;
        vector float trv = spu_add(*av, *bv);  // a.r+b.r
        vector float tiv = spu_add(*dv, *ev);  // a.i+b.i
        vector float sv = spu_mul(trv, *cv);   // (a.r+b.r)*c.r
        vector float tv = spu_mul(trv, *fv);   // (a.r+b.r)*c.i
        *av = spu_nmsub(tiv, *fv, sv);         // r.r
        *dv = spu_madd(tiv, *cv, tv);          // r.i
      }
      return 0;
    }
    case ZMA:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // a * b + c:
      //   r.r = a.r*b.r + c.r - a.i*b.i
      //   r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length;
           ++i, a_re += 4, b_re += 4, c_re += 4, a_im += 4, b_im += 4, c_im += 4)
      {
        vector float *av = (vector float *)a_re;
        vector float *bv = (vector float *)b_re;
        vector float *cv = (vector float *)c_re;
        vector float *dv = (vector float *)a_im;
        vector float *ev = (vector float *)b_im;
        vector float *fv = (vector float *)c_im;
        vector float tmp = spu_nmsub(*dv, *ev, spu_madd(*av, *bv, *cv)); // r.r
        *dv = spu_madd(*dv, *bv, spu_madd(*av, *ev, *fv));               // r.i
        *av = tmp;
      }
      return 0;
    }
  }
  return 1;
}
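The CMA and ZMA cases both evaluate a*b + c over complex data (interleaved and split layouts respectively). A scalar reference using standard C complex arithmetic, added to make the spu_madd/spu_nmsub pairing explicit; it is an illustration, not part of the original kernel:

#include <complex.h>

/* Scalar reference for the CMA/ZMA cases: r = a * b + c.
 *   r.r = a.r*b.r + c.r - a.i*b.i  maps to spu_nmsub(a.i, b.i, spu_madd(a.r, b.r, c.r))
 *   r.i = a.r*b.i + c.i + a.i*b.r  maps to spu_madd(a.i, b.r, spu_madd(a.r, b.i, c.i)) */
float complex cma_ref(float complex a, float complex b, float complex c)
{
    return a * b + c;
}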
/**
 * Setup fragment shader inputs by evaluating triangle's vertex
 * attribute coefficient info.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns quad Z values
 * \param fragInputs  returns fragment program inputs
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
   static const vector float deltaY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + deltaX;
   const vector float fragY = spu_splats(y) + deltaY;
   vector float fragW, wInv;
   uint i;

   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
   wInv = spu_re(fragW);  /* 1 / w */

   /* loop over fragment program inputs */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      uint attr = i + 1;
      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;

      /* constant term */
      vector float a0 = setup.coef[attr].a0;
      vector float r0 = splatx(a0);
      vector float r1 = splaty(a0);
      vector float r2 = splatz(a0);
      vector float r3 = splatw(a0);

      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
         /* linear term */
         vector float dadx = setup.coef[attr].dadx;
         vector float dady = setup.coef[attr].dady;
         /* Use SPU intrinsics here to get slightly better code.
          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
          */
         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
         if (interp == INTERP_PERSPECTIVE) {
            /* perspective term */
            r0 *= wInv;
            r1 *= wInv;
            r2 *= wInv;
            r3 *= wInv;
         }
      }
      fragInputs[CHAN0] = r0;
      fragInputs[CHAN1] = r1;
      fragInputs[CHAN2] = r2;
      fragInputs[CHAN3] = r3;
      fragInputs += 4;
   }
}


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad(int x, int y, mask_t mask)
{
   /* If any bits in mask are set... */
   if (spu_extract(spu_orx(mask), 0)) {
      const int ix = x - setup.cliprect_minx;
      const int iy = y - setup.cliprect_miny;

      spu.cur_ctile_status = TILE_STATUS_DIRTY;
      spu.cur_ztile_status = TILE_STATUS_DIRTY;

      {
         /*
          * Run fragment shader, execute per-fragment ops, update fb/tile.
          */
         vector float inputs[4*4], outputs[2*4];
         vector unsigned int kill_mask;
         vector float fragZ;

         eval_inputs((float) x, (float) y, &fragZ, inputs);

         ASSERT(spu.fragment_program);
         ASSERT(spu.fragment_ops);

         /* Execute the current fragment program */
         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);

         mask = spu_andc(mask, kill_mask);

         /* Execute per-fragment/quad operations, including:
          * alpha test, z test, stencil test, blend and framebuffer writing.
          * Note that there are two different fragment operations functions
          * that can be called, one for front-facing fragments, and one
          * for back-facing fragments.  (Often the two are the same;
          * but in some cases, like two-sided stenciling, they can be
          * very different.)  So choose the correct function depending
          * on the calculated facing.
          */
         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                                        fragZ,
                                        outputs[0*4+0],
                                        outputs[0*4+1],
                                        outputs[0*4+2],
                                        outputs[0*4+3],
                                        mask);
      }
   }
}


/**
 * Given an X or Y coordinate, return the block/quad coordinate that it
 * belongs to.
 */
static INLINE int
block(int x)
{
   return x & ~1;
}


/**
 * Render a horizontal span of quads
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * clear or finish waiting if needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */

   /* Setup for mask calculation */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */

      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);

      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}


#if DEBUG_VERTS
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "  Vertex: (%p)\n", v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "    %d: %f %f %f %f\n", i,
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
#endif /* DEBUG_VERTS */
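The spu_nand of a compare result with itself is simply a vector NOT, which turns the available "greater than" compare into "greater than or equal". A hypothetical scalar sketch of the per-pixel coverage test the loop builds:

/* Scalar equivalent of the quad coverage mask: a pixel at column x on
 * quad row k is covered when l[k] <= x < r[k]. The SIMD version evaluates
 * this for {x, x+1} on both rows at once, using ~(l > x) for (x >= l). */
static int pixel_covered(int x, int l, int r)
{
    return !(l > x) && (r > x);   /* i.e. l <= x && x < r */
}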
void discretize(const uint32_t n,
                volatile vector real_t *conc_in,
                volatile vector real_t *wind,
                volatile vector real_t *diff,
                vector real_t *concbound,
                vector real_t *windbound,
                vector real_t *diffbound,
                vector real_t cell_size,
                vector real_t dt,
                volatile vector real_t *conc_out)
{
    uint32_t i, x;
    vector real_t acc;
    vector real_t c[n];
    vector real_t dcdx[n];

    /* Copy original values */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        --x;
    }

    space_advec_diff_v(n, conc_in, wind, diff, concbound, windbound,
                       diffbound, cell_size, dcdx);

    /* First advection/diffusion step: c += dt * dcdx */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        --x;
    }

    space_advec_diff_v(n, c, wind, diff, concbound, windbound,
                       diffbound, cell_size, dcdx);

    /* Second advection/diffusion step: c += dt * dcdx */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        --x;
    }

    /* Average with the original concentration and clamp negatives to zero */
#define UNROLL_ELEMENT                                   \
    acc = spu_add(conc_out[i], c[i]);                    \
    conc_out[i] = spu_mul(HALF, acc);                    \
    acc = spu_splats((real_t)0.0);                       \
    acc = (vector real_t)spu_cmpgt(conc_out[i], acc);    \
    conc_out[i] = spu_and(conc_out[i], acc)

    i = 0;
    x = n;
    while (x > 8) {
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        x -= 8;
    }
    while (x > 4) {
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        x -= 4;
    }
    while (x > 0) {
        UNROLL_ELEMENT; ++i;
        --x;
    }
#undef UNROLL_ELEMENT
}
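Each element goes through the same scalar recipe in the final pass; a hypothetical scalar reference for what the UNROLL_ELEMENT macro computes per lane:

/* Scalar reference for UNROLL_ELEMENT: average the twice-advanced value
 * with the original concentration, then clamp negative results to zero
 * (the SIMD code performs the clamp with a compare mask and spu_and). */
static double discretize_final(double original, double advanced)
{
    double avg = 0.5 * (original + advanced);
    return avg > 0.0 ? avg : 0.0;
}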
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
  /* Variables */
  vec_int4    exp, exp_bias;
  vec_uint4   no_underflow, overflow;
  vec_float4  mant_bf, inv_bf;
  vec_ullong2 exp_a, exp_b;
  vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
  vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
  vec_ullong2 nan;
  vec_uint4   a_exp, b_exp;
  vec_ullong2 a_mant_0, b_mant_0;
  vec_ullong2 a_exp_1s, b_exp_1s;
  vec_ullong2 sign_exp_mask;
  vec_double2 a, b;
  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

  /* Constants */
  vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11 };
  vec_uchar16 swap_32  = (vec_uchar16) { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
  vec_ullong2 exp_mask  = spu_splats(0x7FF0000000000000ULL);
  vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
  vec_float4  onef = spu_splats(1.0f);
  vec_double2 one  = spu_splats(1.0);
  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

  sign_exp_mask = spu_or(sign_mask, exp_mask);

  /* Extract the floating point components from each of the operands including
   * exponent and mantissa.
   */
  a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
  a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
  b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
  b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

  a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
  a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

  b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
  b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

  a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
  b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

  /* Identify all possible special values that must be accommodated including:
   * +-denorm, +-0, +-infinity, and NaNs.
   */
  a_denorm0 = (vec_ullong2)spu_cmpeq(a_exp, 0);
  a_nan     = spu_andc(a_exp_1s, a_mant_0);
  a_zero    = spu_and (a_denorm0, a_mant_0);
  a_inf     = spu_and (a_exp_1s, a_mant_0);
  a_denorm  = spu_andc(a_denorm0, a_zero);

  b_denorm0 = (vec_ullong2)spu_cmpeq(b_exp, 0);
  b_nan     = spu_andc(b_exp_1s, b_mant_0);
  b_zero    = spu_and (b_denorm0, b_mant_0);
  b_inf     = spu_and (b_exp_1s, b_mant_0);
  b_denorm  = spu_andc(b_denorm0, b_zero);

  /* Scale denorm inputs into normalized numbers by conditionally scaling the
   * input parameters.
   */
  a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
  a = spu_sel(a_in, a, a_denorm);

  b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
  b = spu_sel(b_in, b, b_denorm);

  /* Extract the divisor and dividend exponent and force parameters into the signed
   * range [1.0,2.0) or [-1.0,2.0).
   */
  exp_a = spu_and((vec_ullong2)a, exp_mask);
  exp_b = spu_and((vec_ullong2)b, exp_mask);

  mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
  mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

  /* Approximate the single reciprocal of b by using
   * the single precision reciprocal estimate followed by one
   * single precision iteration of Newton-Raphson.
   */
  mant_bf = spu_roundtf(mant_b);
  inv_bf = spu_re(mant_bf);
  inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

  /* Perform 2 more Newton-Raphson iterations in double precision.  The
   * result (q1) is in the range (0.5, 2.0).
   */
  inv_b = spu_extend(inv_bf);
  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
  q0 = spu_mul(mant_a, inv_b);
  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

  /* Determine the exponent correction factor that must be applied
   * to q1 by taking into account the exponent of the normalized inputs
   * and the scale factors that were applied to normalize them.
   */
  exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
  exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34),
                                       spu_and((vec_int4)b_denorm, 0x34)));

  /* Bias the quotient exponent depending on the sign of the exponent correction
   * factor so that a single multiplier will ensure the entire double precision
   * domain (including denorms) can be achieved.
   *
   *    exp       bias q1     adjust exp
   *   =====      ========    ==========
   *   positive   2^+65       -65
   *   negative   2^-64       +64
   */
  exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
  exp = spu_sub(exp, exp_bias);

  q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

  /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
   * expected result.  On overflow, clamp the multiplier to the maximum non-infinite
   * number in case the rounding mode is not round-to-nearest.
   */
  exp = spu_add(exp, 0x3FF);
  no_underflow = spu_cmpgt(exp, 0);
  overflow = spu_cmpgt(exp, 0x7FE);
  exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
  exp = spu_and(exp, (vec_int4)exp_mask);

  mult = spu_sel((vec_double2)exp,
                 (vec_double2)(spu_add((vec_uint4)exp_mask, -1)),
                 (vec_ullong2)overflow);

  /* Handle special value conditions.  These include:
   *
   * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
   *    results.
   * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
   * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
   */
  mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
  mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

  nan = spu_or(a_nan, b_nan);
  nan = spu_or(nan, spu_and(a_zero, b_zero));
  nan = spu_or(nan, spu_and(a_inf, b_inf));

  mult = spu_or(mult, (vec_double2)nan);

  /* Scale the final quotient */
  q2 = spu_mul(q1, mult);

  return (q2);
}
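The reciprocal refinement above is textbook Newton-Raphson: given an estimate r of 1/b, the update r' = r + r*(1 - b*r) roughly doubles the number of correct bits per step. A scalar illustration of the step that spu_madd(spu_nmsub(b, r, one), r, r) performs per lane (hypothetical, for clarity only):

#include <stdio.h>

/* One Newton-Raphson refinement of a reciprocal estimate:
 *   r' = (1 - b*r)*r + r */
static double refine_recip(double b, double r)
{
    return (1.0 - b * r) * r + r;
}

int main(void)
{
    double b = 3.0, r = 0.3;   /* crude initial estimate of 1/3 */
    r = refine_recip(b, r);    /* ~0.33     */
    r = refine_recip(b, r);    /* ~0.3333   */
    printf("%.12f\n", r);
    return 0;
}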
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  int i;
  volatile vector float *p_inv_mass_v;
  vector float force_v, inv_mass_v;
  vector float pos0, pos1, pos2, pos3;
  vector float vel0, vel1, vel2, vel3;
  vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1,
               dt_inv_mass_v_2, dt_inv_mass_v_3;
  vector unsigned char splat_word_0 = (vector unsigned char){ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  vector unsigned char splat_word_1 = (vector unsigned char){ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  vector unsigned char splat_word_2 = (vector unsigned char){ 8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0];
  force_v = ctx.force_v;

  // Compute the step in time for the block of particles, four
  // particles at a time.
  for (i = 0; i < cnt; i += 4) {
    inv_mass_v = *p_inv_mass_v++;

    pos0 = pos[buffer][i+0];
    pos1 = pos[buffer][i+1];
    pos2 = pos[buffer][i+2];
    pos3 = pos[buffer][i+3];

    vel0 = vel[buffer][i+0];
    vel1 = vel[buffer][i+1];
    vel2 = vel[buffer][i+2];
    vel3 = vel[buffer][i+3];

    dt_inv_mass_v = spu_mul(dt_v, inv_mass_v);

    pos0 = spu_madd(vel0, dt_v, pos0);
    pos1 = spu_madd(vel1, dt_v, pos1);
    pos2 = spu_madd(vel2, dt_v, pos2);
    pos3 = spu_madd(vel3, dt_v, pos3);

    dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0);
    dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1);
    dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2);
    dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3);

    vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0);
    vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1);
    vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2);
    vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3);

    pos[buffer][i+0] = pos0;
    pos[buffer][i+1] = pos1;
    pos[buffer][i+2] = pos2;
    pos[buffer][i+3] = pos3;

    vel[buffer][i+0] = vel0;
    vel[buffer][i+1] = vel1;
    vel[buffer][i+2] = vel2;
    vel[buffer][i+3] = vel3;
  }
}

int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();

  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context),
               tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time = 0; time < END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;
    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v),
                 cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v),
                 cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass),
                 cnt * sizeof(float), tags[0], MFC_GET_CMD);

    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on the
      // next loop iteration. The first DMA is barriered so that we don't GET
      // data before the previous iteration's data is PUT.
      next_buffer = buffer ^ 1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v),
                   next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v),
                   next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass),
                   next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);

      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v),
                   cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v),
                   cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;
    }

    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v),
                 cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v),
                 cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
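The loop above realizes the classic double-buffering schedule: the GET for the next block is issued before waiting on the current one, so DMA overlaps computation. A stripped-down, self-contained sketch of that schedule, with hypothetical stub helpers standing in for the MFC calls:

/* Stubs standing in for the MFC operations (hypothetical, illustration only). */
static void issue_get(int buf, int blk) { (void)buf; (void)blk; }
static void issue_put(int buf, int blk) { (void)buf; (void)blk; }
static void wait_for(int buf)           { (void)buf; }
static void compute(int buf)            { (void)buf; }
static void wait_all(void)              { }

/* Outline of the double-buffering schedule used above: prefetch block blk+1
 * into buffer buf^1 before waiting on buffer buf, so one block's transfer
 * overlaps another block's computation. */
void double_buffer_outline(int nblocks)
{
    int buf = 0;
    issue_get(buf, 0);                   /* prefetch first block   */
    for (int blk = 0; blk < nblocks; ++blk) {
        if (blk + 1 < nblocks)
            issue_get(buf ^ 1, blk + 1); /* prefetch next block    */
        wait_for(buf);                   /* wait for current block */
        compute(buf);
        issue_put(buf, blk);             /* write results back     */
        buf ^= 1;
    }
    wait_all();
}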
inline void calc(ELEM_TYPE* A, ELEM_TYPE* B, ELEM_TYPE* C) {

  //register WRRecord* wrRecord = (WRRecord*)C;
  //register int startRow = wrRecord->startRow;
  //register int startCol = wrRecord->startCol;

  // DEBUG
  //printf("SPE_%d :: startRow = %d, startCol = %d...\n", (int)getSPEID(), startRow, startCol);
  // DEBUG
  //printf("SPE_%d :: A = %p, B = %p, C = %p...\n", (int)getSPEID(), A, B, C);

  register int r, c;

  // Fill in C
  for (r = 0; r < NUM_ROWS_PER_WR; r++) {
    for (c = 0; c < NUM_COLS_PER_WR; c++) {

      // Init the pointers
      register vector ELEM_TYPE* APtr = (vector ELEM_TYPE*)(A + (r * MATRIX_A_COLS));
      register vector ELEM_TYPE* BPtr = (vector ELEM_TYPE*)(B + (c * MATRIX_B_ROWS));

      #if USE_DOUBLE == 0
        register vector ELEM_TYPE sumV = { 0.0f, 0.0f, 0.0f, 0.0f };
      #else
        register vector ELEM_TYPE sumV = { 0.0, 0.0 };
      #endif

      //// DEBUG
      //printf("SPE_%d :: Start C value [%d x %d]... APtr = %p, BPtr = %p\n", (int)getSPEID(), r, c, APtr, BPtr);
      //{
      //  register int i;
      //  printf("SPE_%d :: A's Row = { ", (int)getSPEID());
      //  for (i = 0; i < MATRIX_A_COLS; i++) printf("%f ", (double)*(((float*)(APtr)) + i));
      //  printf("}...\n");
      //  printf("SPE_%d :: B's Column = { ", (int)getSPEID());
      //  for (i = 0; i < MATRIX_B_ROWS; i++) printf("%f ", (double)*(((float*)(BPtr)) + i));
      //  printf("}...\n");
      //}

      register int i;
      for (i = 0; i < MATRIX_A_COLS; i += (16 / sizeof(ELEM_TYPE))) {

        register vector ELEM_TYPE aV = *APtr;
        register vector ELEM_TYPE bV = *BPtr;

        // DEBUG
        //printf("SPE :: aV = { %f, %f, %f, %f }\n", spu_extract(aV, 0), spu_extract(aV, 1), spu_extract(aV, 2), spu_extract(aV, 3));
        //printf("SPE :: bV = { %f, %f, %f, %f }\n", spu_extract(bV, 0), spu_extract(bV, 1), spu_extract(bV, 2), spu_extract(bV, 3));

        APtr += 1;
        BPtr += 1;

        sumV = spu_madd(aV, bV, sumV);

        // DEBUG
        //printf("SPE :: sumV = { %f, %f, %f, %f }\n", spu_extract(sumV, 0), spu_extract(sumV, 1), spu_extract(sumV, 2), spu_extract(sumV, 3));
      }

      // Add the elements of the sumV vector together
      #if USE_DOUBLE == 0
        register ELEM_TYPE sum = 0.0f;
        sum += spu_extract(sumV, 0);
        sum += spu_extract(sumV, 1);
        sum += spu_extract(sumV, 2);
        sum += spu_extract(sumV, 3);
      #else
        register ELEM_TYPE sum = 0.0;
        sum += spu_extract(sumV, 0);
        sum += spu_extract(sumV, 1);
      #endif

      // Store in C
      C[c + (r * NUM_COLS_PER_WR)] = sum;

      // DEBUG
      //printf("SPE_%d :: C value [%d x %d] = %f\n", (int)getSPEID(), r, c, sum);
    }
  }
}
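Because BPtr walks B + c * MATRIX_B_ROWS contiguously, the code implicitly assumes B is stored column-major (each column laid out consecutively), so each spu_madd accumulates a slice of row r of A against a slice of column c of B. A hypothetical scalar reference of the same product under that layout:

/* Scalar reference for calc(): C[r][c] = dot(row r of A, column c of B),
 * with B stored column-major as the vector code assumes. */
void matmul_ref(const float *A, const float *B, float *C,
                int rows, int cols, int inner)
{
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c) {
            float sum = 0.0f;
            for (int i = 0; i < inner; ++i)
                sum += A[r * inner + i] * B[c * inner + i];
            C[r * cols + c] = sum;
        }
}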
int main( unsigned long long spe_id,
          unsigned long long ppu_vector_a,
          unsigned long long ppu_vector_b)
{
    int i, iter, buf_idx, vec_idx;
    unsigned long long ppu_vector_bases[2] _ALIG(128);
    vector float *pchunk_a, *pchunk_b;
    vector float g_vec = {0, 0, 0, 0};

    ppu_vector_bases[0] = ppu_vector_a;
    ppu_vector_bases[1] = ppu_vector_b;

    const unsigned int spu_num = spu_read_in_mbox();
    unsigned long long get_edge_bytes = spu_num * SUBVEC_SZ_BYTES;

    float buffers[NBUFFERS * BUF_SZ_FLOATS] _ALIG(128);
    int buffer_tags[NBUFFERS][2] _ALIG(128);
    //int buffer_tags[NBUFFERS];

    for (iter = 0; iter < NBUFFERS; ++iter) {
        buffer_tags[iter][0] = mfc_tag_reserve();
        buffer_tags[iter][1] = mfc_tag_reserve();
    }

    // first mfc_get for all
    for (buf_idx = 0; buf_idx < NBUFFERS; ++buf_idx) {
        for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
            mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                    ppu_vector_bases[vec_idx] + get_edge_bytes,
                    CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0);
        }
    }
    get_edge_bytes += CHUNK_SZ_BYTES;

    //printf("subvec_sz-chunks: %d\n", SUBVEC_SZ_CHUNKS);
    //printf("%d==%d\n", MAXITER*NBUFFERS*CHUNK_SZ_FLOATS, SUBVEC_SZ_FLOATS);

    int chunksleft = SUBVEC_SZ_CHUNKS;
    while (chunksleft != 0) {
        for (buf_idx = 0; chunksleft != 0 && buf_idx < NBUFFERS; ++buf_idx) {
            const int tag_mask = (1 << buffer_tags[buf_idx][0]) |
                                 (1 << buffer_tags[buf_idx][1]);
            mfc_write_tag_mask(tag_mask);
            mfc_read_tag_status_all();

            pchunk_a = buf_ptr_vecfloat(buffers, buf_idx, 0);
            pchunk_b = buf_ptr_vecfloat(buffers, buf_idx, 1);
            for (i = 0; i < CHUNK_SZ_FLOATVECS; ++i) {
                g_vec = spu_madd(pchunk_a[i], pchunk_b[i], g_vec);
            }

            // move this mfc_get to end of loop, check get_edge_bytes variable dynamics
            if (likely(iter != MAXITER - 1)) {
                for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
                    mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                            ppu_vector_bases[vec_idx] + get_edge_bytes,
                            CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0);
                }
            }
            get_edge_bytes += CHUNK_SZ_BYTES;
            --chunksleft;
        }
    }

    for (iter = 0; iter < NBUFFERS; ++iter) {
        mfc_tag_release(buffer_tags[iter][0]);
        mfc_tag_release(buffer_tags[iter][1]);
    }

    float_uint_t retval;
    retval.f = spu_extract(g_vec, 0) + spu_extract(g_vec, 1) +
               spu_extract(g_vec, 2) + spu_extract(g_vec, 3);
    //printf("retval: %f\n", retval.f);
    spu_write_out_mbox(retval.i);
    return 0;
}
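The partial dot product is reduced horizontally and pushed through the outbound mailbox as raw bits. The float_uint_t type is not shown in the snippet; a conventional definition, assumed here, is a simple type-punning union:

/* Assumed definition of float_uint_t: the SPU mailbox carries 32-bit words,
 * so the float sum is reinterpreted as an unsigned int via a union. */
typedef union {
    float f;
    unsigned int i;
} float_uint_t;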