Example #1
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    int i;
    int tag = 1;

    /* DMA Transfer 1 : GET input/output parameters */
    spu_mfcdma64(&abs_params, mfc_ea2h(argp), mfc_ea2l(argp),
                 sizeof(abs_params_t), tag, MFC_GET_CMD);
    spu_writech(MFC_WrTagMask, 1 << tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    /* DMA Transfer 2 : GET input data */
    spu_mfcdma64(in_spe, mfc_ea2h(abs_params.ea_in), mfc_ea2l(abs_params.ea_in),
                 abs_params.size * sizeof(float), tag, MFC_GET_CMD);
    spu_writech(MFC_WrTagMask, 1 << tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    /* Calculate absolute values */
    for (i = 0; i < abs_params.size; i++) {
        if (in_spe[i] > 0) {
                out_spe[i] = in_spe[i];
        } else {
                out_spe[i] = in_spe[i] * -1;
        }
    }

    /* DMA Transfer 3 : PUT output data */
    spu_mfcdma64(out_spe, mfc_ea2h(abs_params.ea_out), mfc_ea2l(abs_params.ea_out),
                 abs_params.size * sizeof(float), tag, MFC_PUT_CMD);
    spu_writech(MFC_WrTagMask, 1 << tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    return 0;
}
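
The abs_params, in_spe and out_spe objects referenced above are globals from the rest of the file, which is not shown here. A minimal sketch of what this excerpt assumes, with MAX_SIZE and the exact abs_params_t layout being assumptions rather than the original definitions:

/* Sketch only: local-store objects assumed by Example #1 (not the original
 * definitions). DMA buffers are 128-byte aligned and the parameter block is
 * padded so its size is a multiple of 16 bytes, as MFC DMA requires. */
#define MAX_SIZE 1024                         /* hypothetical maximum element count  */

typedef struct {
    unsigned long long ea_in;                 /* effective address of input floats   */
    unsigned long long ea_out;                /* effective address of output floats  */
    unsigned int size;                        /* number of elements to process       */
    unsigned char pad[12];                    /* pad struct size to a multiple of 16 */
} abs_params_t;

static abs_params_t abs_params __attribute__((aligned(16)));
static float in_spe[MAX_SIZE]  __attribute__((aligned(128)));
static float out_spe[MAX_SIZE] __attribute__((aligned(128)));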
Example #2
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id;
  vector float dt_v, dt_inv_mass_v;

  // Reserve a tag ID
  tag_id = mfc_tag_reserve();

  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle parameter context.
  // Fetch the context, waiting for it to complete.
  
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity and inverse_mass. Wait for the DMA to complete 
      // before performing computation.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_GETB_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i), cnt * sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j=0; j<cnt; j++) {
	pos[j] = spu_madd(vel[j], dt_v, pos[j]);
	dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
	vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into system memory
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
    }
  }
  // Wait for final DMAs to complete before terminating SPU thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  return (0);
}
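
This example relies on a parameter context (ctx) and local-store buffers declared elsewhere in the file. A sketch of what those declarations presumably look like, inferred from how the fields are used above; the names follow the code, but the types, ordering and the two constants are assumptions:

/* Sketch only: context and buffers implied by the code above, not the original
 * declarations. END_OF_TIME and PARTICLES_PER_BLOCK are placeholders. */
#define END_OF_TIME         10.0f
#define PARTICLES_PER_BLOCK 1024

typedef struct {
    vector float *pos_v;       /* effective address of the position array     */
    vector float *vel_v;       /* effective address of the velocity array     */
    float        *inv_mass;    /* effective address of the inverse-mass array */
    vector float  force_v;     /* constant force applied to all particles     */
    float         dt;          /* simulation time step                        */
    int           particles;   /* total number of particles                   */
} parm_context;

static parm_context ctx __attribute__((aligned(16)));
static volatile vector float pos[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile vector float vel[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile float inv_mass[PARTICLES_PER_BLOCK]   __attribute__((aligned(128)));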
Example #3
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    int i;

    for (i = 0; i < 10000; i++) {
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(int), 0, MFC_GET_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        counter++;

        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(int), 0, MFC_PUT_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    return 0;
}
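
A note on this example: counter is assumed to be a 4-byte, naturally aligned global in local store, along the lines of the sketch below (the volatile qualifier and alignment are assumptions); DMA transfers smaller than 16 bytes must be naturally aligned, with the local-store address and the effective address sharing the same low-order address bits.

/* Sketch only: local-store copy of the counter assumed by Example #3
 * (not the original declaration). */
static volatile int counter __attribute__((aligned(16)));

Also note that the GET/increment/PUT sequence is an ordinary DMA round trip, so several SPEs running this loop against the same effective address would race; a correct shared counter would need the PPE to serialize access or the MFC atomic commands to be used instead.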
Example #4
void GetSPEAddr( unsigned int ea, unsigned int *PPE_addr )
{

//#ifdef _DEBUG
//#if _DBGLVL > 1
//	printf( "Getting SPE address @ %#x to %#x\n", ea, (unsigned int)PPE_addr );
//#endif
//#endif
	Printf1( "SPE[%u]: Getting SPE address @ %#x to %#x\n", SPE_id, ea, (unsigned int)PPE_addr );
	// Get STRUCTURE
	spu_mfcdma32( PPE_addr, ea, 16, 30, MFC_GET_CMD );
	spu_writech( MFC_WrTagMask, 1 << 30 );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
Example #5
void GetShader( unsigned int EA, unsigned int size, unsigned int *shader )
{
//#ifdef _DEBUG
//#if _DBGLVL > 1
//	printf( "Getting shader @ %#x to %#x(%u)\n", EA, (unsigned int)shader, size	 );
//	printf( "Shader size is %u\n", size );
//#endif
//#endif
	Printf1( "SPE[%u]: Getting shader @ %#x to %#x(%u)\n", SPE_id, EA, (unsigned int)shader, size	 );
	// Get shader
	size = ( size + 15 ) &~ 15;

	spu_mfcdma32( shader, EA, size, 29, MFC_GET_CMD );
	spu_writech( MFC_WrTagMask, 1 << 29 );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
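
The size = ( size + 15 ) &~ 15; line rounds the shader size up to the next multiple of 16 bytes, since MFC DMA transfers of 16 bytes or more must be multiples of 16; the destination buffer passed in as shader must therefore have room for the rounded-up size. The same idiom as a standalone helper, purely as a sketch:

/* Sketch only: 16-byte round-up used above, e.g. 100 -> 112, 112 -> 112, 1 -> 16. */
static inline unsigned int round_up_16(unsigned int n)
{
    return (n + 15) & ~15u;
}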
Example #6
void GetOperation( unsigned int ea, Operation_t *data )
{

//#ifdef _DEBUG
//#if _DBGLVL > 1
//	printf( "Getting operation @ %#x to %#x(%u)\n", ea, (unsigned int)data, sizeof( Operation_t ) );
//#endif
//#endif
	Printf1( "SPE[%u]: Getting operation @ %#x to %#x(%u)\n", SPE_id, ea, (unsigned int)data, sizeof( Operation_t ) );
	// Get STRUCTURE
	spu_mfcdma32( data, ea, 32, 30, MFC_GET_CMD );
	// Waiting
	spu_writech( MFC_WrTagMask, 1 << 30 );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
//	printf( "---->%#x\n", (unsigned int)data->EA_shader );
//	printf( "---->%#x\n", (unsigned int)data->shaderSize );
//	printf( "---->%#x\n", (unsigned int)data->obj[0] );
//	printf( "---->%#x\n", (unsigned int)data->obj[1] );
//	printf( "---->%#x\n", (unsigned int)data->obj[2] );
//	printf( "---->%#x\n", (unsigned int)data->scalars[0] );
//	printf( "---->%#x\n", (unsigned int)data->scalars[1] );
//	printf( "---->%#x\n", (unsigned int)data->scalars[2] );
}
Example #7
void fluidx(data_type_t *fluidx_u, data_type_t *fluidx_b, int *fluidx_nx, int *fluidx_ny, int *fluidx_nz, data_type_t *fluidx_dt, int *fluidx_Y_location, int *fluidx_Z_location, int fluidx_tag_id)
{

volatile data_type_t fluidx_s_u[5*(*fluidx_nx)] __attribute__ ((aligned (16)));
volatile data_type_t fluidx_s_b3x[3*(*fluidx_nx)] __attribute__ ((aligned (16)));
volatile data_type_t fluidx_s_b2_jp[3*(*fluidx_nx)] __attribute__ ((aligned (16)));
volatile data_type_t fluidx_s_b3_kp[3*(*fluidx_nx)] __attribute__ ((aligned (16)));

data_type_t fluidx_tmp1[(*fluidx_nx)] __attribute__ ((aligned (16)));
data_type_t fluidx_tmp2[(*fluidx_nx)] __attribute__ ((aligned (16)));

int i,j,k; 
int j_global,k_global;
int fluidx_jp,fluidx_kp; 

int Y_inter=box_ny/num_SPE_Y;
int Z_inter=box_nz/num_SPE_Z;

int CELL_PER_BLOCK; 
CELL_PER_BLOCK = (*fluidx_nx);
int u_memSize_PER_BLOCK; 
u_memSize_PER_BLOCK = 5* CELL_PER_BLOCK; 
int b_memSize_PER_BLOCK; 
b_memSize_PER_BLOCK = 3* CELL_PER_BLOCK;

  for (k=0;k<(*fluidx_nz);k++)
  {
        for (j=0;j<(*fluidx_ny);j++)
        {
//      assign global indices
		k_global=k+(*fluidx_Z_location)*Z_inter;
		j_global=j+(*fluidx_Y_location)*Y_inter;
//      get u
                spu_mfcdma32((void *)(fluidx_s_u),
                (unsigned int)(fluidx_u+matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                u_memSize_PER_BLOCK * sizeof(data_type_t), fluidx_tag_id, MFC_GETB_CMD);
//      get b3x
                spu_mfcdma32((void *)(fluidx_s_b3x),
                (unsigned int)(fluidx_b+matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                b_memSize_PER_BLOCK * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);
//	get jp & kp
                fluidx_jp=(j_global+1)%(box_ny);
                fluidx_kp=(k_global+1)%(box_nz);
//	get b(2,:,jp,k) from update data matrix
                spu_mfcdma32((void *)(fluidx_s_b2_jp),
                (unsigned int)(fluidx_b+matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(fluidx_jp),(k_global))),
                b_memSize_PER_BLOCK * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);
//	get b(3,:,j,kp) from update data matrix

                spu_mfcdma32((void *)(fluidx_s_b3_kp),
                (unsigned int)(fluidx_b+matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(j_global),(fluidx_kp))),
                b_memSize_PER_BLOCK * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);
//      Wait for the GET transfers to complete before computing.
		(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
//	b3x
		multiple_itself_matrix2D(fluidx_s_b3x,3,(*fluidx_nx),0.5E0);
//      get cshift(fluidx_s_b3x(1,:),1)
		extract_matrix2Dto1D(fluidx_s_b3x,3,(*fluidx_nx),(1-1),fluidx_tmp1);
	 	cshift_matrix1D(fluidx_tmp1,(*fluidx_nx),1,fluidx_tmp2);	
//	b3x
		for (i=0;i<(*fluidx_nx);i++)
		{
			fluidx_s_b3x[matrix2D(3,(*fluidx_nx),0,i)]=fluidx_s_b3x[matrix2D(3,(*fluidx_nx),0,i)]+fluidx_tmp2[i];
			fluidx_s_b3x[matrix2D(3,(*fluidx_nx),1,i)]=fluidx_s_b3x[matrix2D(3,(*fluidx_nx),1,i)]+fluidx_s_b2_jp[matrix2D(3,(*fluidx_nx),1,i)]/2E0;
			fluidx_s_b3x[matrix2D(3,(*fluidx_nx),2,i)]=fluidx_s_b3x[matrix2D(3,(*fluidx_nx),2,i)]+fluidx_s_b3_kp[matrix2D(3,(*fluidx_nx),2,i)]/2E0;
		}
//	tvd1
		tvd1(fluidx_s_u,fluidx_s_b3x,fluidx_nx,fluidx_dt);
//	send u
                spu_mfcdma32((void *)(fluidx_s_u),
                (unsigned int)(fluidx_u+matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                u_memSize_PER_BLOCK * sizeof(data_type_t), fluidx_tag_id, MFC_PUT_CMD);
        }
  }
//      Wait for final DMAs to complete before terminating SPU thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

return;
}
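
The matrix2D/matrix3D/matrix4D helpers and the box_nx/box_ny/box_nz, num_SPE_Y/num_SPE_Z globals come from the rest of this fluid code and are not shown. Judging from how the code subscripts u(5,nx,ny,nz) and b(3,nx,ny,nz), the index helpers are presumably component-first (Fortran-style) linear-index macros along the lines of this sketch, which is an assumption rather than the original definitions:

/* Sketch only: plausible column-major index helpers consistent with the DMA
 * offsets computed above. Not the original definitions. */
#define matrix2D(n1, n2, i1, i2)                 ((i1) + (n1) * (i2))
#define matrix3D(n1, n2, n3, i1, i2, i3)         ((i1) + (n1) * ((i2) + (n2) * (i3)))
#define matrix4D(n1, n2, n3, n4, i1, i2, i3, i4) \
        ((i1) + (n1) * ((i2) + (n2) * ((i3) + (n3) * (i4))))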
Example #8
/* wait for dma transfer to be finished */
static void wait_for_completion(int tag) {
    mfc_write_tag_mask(1<<tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);
}
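
A sketch of how such a helper is typically paired with a transfer; mfc_get, mfc_tag_reserve and mfc_tag_release are standard spu_mfcio.h calls, while get_block, buf and BLOCK_SIZE are placeholders rather than names from this source:

/* Sketch only: queue a GET into local store and block until it completes. */
#include <spu_mfcio.h>

#define BLOCK_SIZE 4096
static volatile char buf[BLOCK_SIZE] __attribute__((aligned(128)));

static void get_block(unsigned long long ea)
{
    unsigned int tag = mfc_tag_reserve();      /* reserve a free DMA tag ID    */
    mfc_get(buf, ea, BLOCK_SIZE, tag, 0, 0);   /* enqueue the GET on that tag  */
    wait_for_completion(tag);                  /* mask the tag and block on it */
    mfc_tag_release(tag);                      /* return the tag to the pool   */
}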
Example #9
/**
 * Performs transport in a layer-block of the concentration matrix
 */
void discretize_z_block(uint64_t argvp)
{
    /* Iterators */
    uint32_t i, f, p, w;

    /* Arguments */
    real_t dt;
    real_t size;
    uint32_t block;

    timer_start(&metrics.z_discret);

    /* Get arguments from main memory synchronously */
    get_transport_argv(argvp, &dt, &size, &block);

    allocate_dma_list(&clist[0]);
    allocate_dma_list(&clist[1]);
    allocate_dma_list(&clist[2]);
    allocate_dma_list(&wlist[0]);
    allocate_dma_list(&wlist[1]);
    allocate_dma_list(&wlist[2]);
    allocate_dma_list(&dlist[0]);
    allocate_dma_list(&dlist[1]);
    allocate_dma_list(&dlist[2]);

    allocate_buffer(&conc[0]);
    allocate_buffer(&conc[1]);
    allocate_buffer(&conc[2]);
    allocate_buffer(&wind[0]);
    allocate_buffer(&wind[1]);
    allocate_buffer(&wind[2]);
    allocate_buffer(&diff[0]);
    allocate_buffer(&diff[1]);
    allocate_buffer(&diff[2]);
    allocate_buffer(&buff[0]);
    allocate_buffer(&buff[1]);
    allocate_buffer(&buff[2]);

    if(block == 1)
    {
        /* Start in_0 transfer */
        fetch_z_buffer(0, 0);

        /* Process in_0 */
        transport_buffer(0, size, dt);

        /* Write out_0 back to main memory */
        write_z_buffer(0);

        wait_for_dma(0);
    }
    else
    {
        /* Start in_0 transfer */
        fetch_z_buffer(0, 0);

        /* Start in_1 transfer */
        fetch_z_buffer(1, 16);

        /* Process in_0 */
        transport_buffer(0, size, dt);

        for(i=0; i<block-2; i++)
        {
            w = i % 3;
            p = (i+1) % 3;
            f = (i+2) % 3;

            /* Write buffer back to main memory */
            write_z_buffer(w);

            /* Start buffer transfer */
            fetch_z_buffer(f, (i+2)*16);

            /* Process buffer */
            transport_buffer(p, size, dt);
        }

        /* Discretize final column vector */
        w = i % 3;
        p = (i+1) % 3;

        /* Write out_b back to main memory */
        write_z_buffer(w);

        /* Process in_(b+1) */
        transport_buffer(p, size, dt);

        /* Write out_(b+1) back to main memory */
        write_z_buffer(p);

        /* Make sure DMA is complete before we exit */
        mfc_write_tag_mask( (1<<w) | (1<<p) );
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    free_dma_list(&clist[0]);
    free_dma_list(&clist[1]);
    free_dma_list(&clist[2]);
    free_dma_list(&wlist[0]);
    free_dma_list(&wlist[1]);
    free_dma_list(&wlist[2]);
    free_dma_list(&dlist[0]);
    free_dma_list(&dlist[1]);
    free_dma_list(&dlist[2]);

    free_buffer(&conc[0]);
    free_buffer(&conc[1]);
    free_buffer(&conc[2]);
    free_buffer(&wind[0]);
    free_buffer(&wind[1]);
    free_buffer(&wind[2]);
    free_buffer(&diff[0]);
    free_buffer(&diff[1]);
    free_buffer(&diff[2]);
    free_buffer(&buff[0]);
    free_buffer(&buff[1]);
    free_buffer(&buff[2]);

    timer_stop(&metrics.z_discret);

    set_status(SPE_STATUS_WAITING);
}
Example #10
/**
 * Performs transport in a row-block of the concentration matrix
 */
void discretize_x_block(uint64_t argvp)
{
    uint32_t i, f, p, w;

    /* Arguments */
    real_t dt;
    real_t size;
    uint32_t block;

    timer_start(&metrics.x_discret);

    /* Get arguments from main memory synchronously */
    get_transport_argv(argvp, &dt, &size, &block);

    allocate_buffer(&conc[0]);
    allocate_buffer(&conc[1]);
    allocate_buffer(&conc[2]);
    allocate_buffer(&wind[0]);
    allocate_buffer(&wind[1]);
    allocate_buffer(&wind[2]);
    allocate_buffer(&diff[0]);
    allocate_buffer(&diff[1]);
    allocate_buffer(&diff[2]);
    allocate_buffer(&buff[0]);
    allocate_buffer(&buff[1]);
    allocate_buffer(&buff[2]);
    allocate_buffer(&shuffle[0]);
    allocate_buffer(&shuffle[1]);
    allocate_buffer(&shuffle[2]);
    allocate_buffer(&shuffle[3]);

    if(block == 1)
    {
        /* Start in_0 transfer */
        fetch_x_buffer(0, 0);

        /* Process in_0 */
        transport_buffer(0, size, dt);

        /* Write out_0 back to main memory */
        write_x_buffer(0, 0);
    }
    else
    {
        /* Start in_0 transfer */
        fetch_x_buffer(0, 0);

        /* Start in_1 transfer */
        fetch_x_buffer(1, NX_ALIGNED_SIZE);

        /* Process in_0 */
        transport_buffer(0, size, dt);

        /* Loop over rows in this block */
        for(i=0; i<block-2; i++)
        {
            w = i % 3;
            p = (i+1) % 3;
            f = (i+2) % 3;

            /* Write out_b back to main memory */
            write_x_buffer(w, i*NX_ALIGNED_SIZE);

            /* Start in_b transfer */
            fetch_x_buffer(f, (i+2)*NX_ALIGNED_SIZE);

            /* Process in_(b+1) */
            transport_buffer(p, size, dt);
        }

        /* Discretize final row */
        w = i % 3;
        p = (i+1) % 3;

        /* Write out_b back to main memory */
        write_x_buffer(w, i*NX_ALIGNED_SIZE);

        /* Process in_(b+1) */
        transport_buffer(p, size, dt);

        /* Write out_(b+1) back to main memory */
        write_x_buffer(p, (i+1)*NX_ALIGNED_SIZE);

        /* Make sure DMA is complete before we exit */
        mfc_write_tag_mask( (1<<w) | (1<<p) );
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    free_buffer(&conc[0]);
    free_buffer(&conc[1]);
    free_buffer(&conc[2]);
    free_buffer(&wind[0]);
    free_buffer(&wind[1]);
    free_buffer(&wind[2]);
    free_buffer(&diff[0]);
    free_buffer(&diff[1]);
    free_buffer(&diff[2]);
    free_buffer(&buff[0]);
    free_buffer(&buff[1]);
    free_buffer(&buff[2]);
    free_buffer(&shuffle[0]);
    free_buffer(&shuffle[1]);
    free_buffer(&shuffle[2]);
    free_buffer(&shuffle[3]);

    timer_stop(&metrics.x_discret);

    /* Signal PPE */
    set_status(SPE_STATUS_WAITING);
}
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  int i;
  volatile vector float *p_inv_mass_v;
  vector float force_v, inv_mass_v;
  vector float pos0, pos1, pos2, pos3;
  vector float vel0, vel1, vel2, vel3;
  vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3;
  vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; 
  force_v = ctx.force_v;

  // Compute the step in time for the block of particles, four
  // particles at a time.
  for (i=0; i<cnt; i+=4) {
    inv_mass_v = *p_inv_mass_v++;
    
    pos0 = pos[buffer][i+0];
    pos1 = pos[buffer][i+1];
    pos2 = pos[buffer][i+2];
    pos3 = pos[buffer][i+3];

    vel0 = vel[buffer][i+0];
    vel1 = vel[buffer][i+1];
    vel2 = vel[buffer][i+2];
    vel3 = vel[buffer][i+3];

    dt_inv_mass_v = spu_mul(dt_v, inv_mass_v);

    pos0 = spu_madd(vel0, dt_v, pos0);
    pos1 = spu_madd(vel1, dt_v, pos1);
    pos2 = spu_madd(vel2, dt_v, pos2);
    pos3 = spu_madd(vel3, dt_v, pos3);

    dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0);
    dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1);
    dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2);
    dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3);

    vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0);
    vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1);
    vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2);
    vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3);

    pos[buffer][i+0] = pos0;
    pos[buffer][i+1] = pos1;
    pos[buffer][i+2] = pos2;
    pos[buffer][i+3] = pos3;

    vel[buffer][i+0] = vel0;
    vel[buffer][i+1] = vel1;
    vel[buffer][i+2] = vel2;
    vel[buffer][i+3] = vel3;
  }
}


int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();
  
  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;

    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD);

    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on next loop iteration.
      // The first DMA is barriered so that we don't GET data before the previous iteration's
      // data is PUT.
      next_buffer = buffer^1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);
      
      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      
      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;		  
    }

    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
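
process_buffer() and the main() above assume double-buffered local-store arrays indexed by the buffer variable, plus the same parameter context as the earlier particle examples. A sketch of the declarations they imply; the block-size constant and the alignments are assumptions:

/* Sketch only: double-buffered local-store arrays implied by the code above,
 * not the original declarations. PARTICLES_PER_BLOCK is a placeholder. */
static parm_context ctx __attribute__((aligned(16)));
static volatile vector float pos[2][PARTICLES_PER_BLOCK]      __attribute__((aligned(128)));
static volatile vector float vel[2][PARTICLES_PER_BLOCK]      __attribute__((aligned(128)));
static volatile float        inv_mass[2][PARTICLES_PER_BLOCK] __attribute__((aligned(128)));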
void advectbyzxA1(data_type_t *advectbyzxA1_u, data_type_t *advectbyzxA1_b, int *advectbyzxA1_nx, int *advectbyzxA1_ny, int *advectbyzxA1_nz, data_type_t *advectbyzxA1_dt, data_type_t *advectbyzxA1_adv_tmp, int *advectbyzxA1_Y_location, int *advectbyzxA1_Z_location, int advectbyzxA1_tag_id)
{

volatile data_type_t advA1_s_u_jm[5*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
volatile data_type_t advA1_s_u[5*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
volatile data_type_t advA1_s_b[3*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));

data_type_t fluxbx[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
data_type_t b1x[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
data_type_t vx[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));

data_type_t advA1_tmp1[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
data_type_t advA1_tmp2[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));

int advA1_jm; 
int i,j,k; 
int j_global,k_global;

int Y_inter=box_ny/num_SPE_Y;
int Z_inter=box_nz/num_SPE_Z;

int CELL_PER_BLOCK; 
CELL_PER_BLOCK = (*advectbyzxA1_nx);
int u_memSize_PER_BLOCK; 
u_memSize_PER_BLOCK = 5* CELL_PER_BLOCK;
int b_memSize_PER_BLOCK; 
b_memSize_PER_BLOCK = 3* CELL_PER_BLOCK;

for (k=0;k<(*advectbyzxA1_nz);k++)
{
	for (j=0;j<(*advectbyzxA1_ny);j++)
        {
//      assign global index
                k_global=k+(*advectbyzxA1_Z_location)*Z_inter;
                j_global=j+(*advectbyzxA1_Y_location)*Y_inter;
//      jm=mod(j+ny-2,ny)+1
		advA1_jm=(j_global+(box_ny)-1)%(box_ny);
//	get u(:,:,j,k)
		spu_mfcdma32((void *)(advA1_s_u),
                (unsigned int)(advectbyzxA1_u+matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                u_memSize_PER_BLOCK * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GETB_CMD);
//	get u(:,:,jm,k)
                spu_mfcdma32((void *)(advA1_s_u_jm),
                (unsigned int)(advectbyzxA1_u+matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(advA1_jm),(k_global))),
                u_memSize_PER_BLOCK * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GET_CMD);
//	get b(:,:,j,k)
                spu_mfcdma32((void *)(advA1_s_b),
                (unsigned int)(advectbyzxA1_b+matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                b_memSize_PER_BLOCK * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GET_CMD);
//      Wait for the GET transfers to complete before computing.
                (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
//      vx=(u(2,:,jm,k)+u(2,:,j,k))/(u(1,:,jm,k)+u(1,:,j,k))
		for (i=0;i<(*advectbyzxA1_nx);i++)
		{
			vx[i]=(advA1_s_u_jm[matrix2D(5,(*advectbyzxA1_nx),(2-1),i)]+advA1_s_u[matrix2D(5,(*advectbyzxA1_nx),(2-1),i)])/(advA1_s_u_jm[matrix2D(5,(*advectbyzxA1_nx),(1-1),i)]+advA1_s_u[matrix2D(5,(*advectbyzxA1_nx),(1-1),i)]);
		}
//      vx=(cshift(vx,-1)+cshift(vx,1)+2*vx)/4
                cshift_matrix1D(vx, (*advectbyzxA1_nx), (-1), advA1_tmp1);
                cshift_matrix1D(vx, (*advectbyzxA1_nx), (1), advA1_tmp2);
//	vx=(cshift(vx,-1)+cshift(vx,1)+2.0d0*vx)/4.0d0
                for (i=0;i<(*advectbyzxA1_nx);i++)
                {
                        vx[i]=(advA1_tmp1[i]+advA1_tmp2[i]+2.0E0*vx[i])/4.0E0;
			b1x[i]=advA1_s_b[matrix2D(3,(*advectbyzxA1_nx),(2-1),i)];
		}
//      call tvdb(fluxbx,b1x,vx,nx,dt)
                tvdb(fluxbx,b1x,vx,advectbyzxA1_nx,advectbyzxA1_dt);
//      cshift(fluxbx,-1)
                cshift_matrix1D(fluxbx,(*advectbyzxA1_nx), (-1), advA1_tmp1);
//      b(2,:,j,k)=b1x
//      b(1,:,j,k)=b(1,:,j,k)-cshift(fluxbx,-1)
		for (i=0;i<(*advectbyzxA1_nx);i++)
		{
			advA1_s_b[matrix2D(3,(*advectbyzxA1_nx),(2-1),i)]=b1x[i];
			advA1_s_b[matrix2D(3,(*advectbyzxA1_nx),(1-1),i)]=advA1_s_b[matrix2D(3,(*advectbyzxA1_nx),(1-1),i)]-advA1_tmp1[i];
		}
//	send b(2,:,j,k) & b(1,:,j,k)
                spu_mfcdma32((void *)(advA1_s_b),
                (unsigned int)(advectbyzxA1_b+matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(j_global),(k_global))),
                b_memSize_PER_BLOCK * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_PUTB_CMD);
//	send advA1_tmp1 out
                spu_mfcdma32((void *)(advA1_tmp1),
                (unsigned int)(advectbyzxA1_adv_tmp+matrix3D((box_nx),(box_ny),(box_nz),0,(j_global),(k_global))),
                CELL_PER_BLOCK * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_PUT_CMD);
	}
}
//      Wait for final DMAs to complete before terminating SPU thread.
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
return;
}