int main(unsigned long long spe, unsigned long long argp, unsigned long long envp) { int i; int tag = 1; /* DMA Transfer 1 : GET input/output parameters */ spu_mfcdma64(&abs_params, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(abs_params_t), tag, MFC_GET_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); /* DMA Transfer 2 : GET input data */ spu_mfcdma64(in_spe, mfc_ea2h(abs_params.ea_in), mfc_ea2l(abs_params.ea_in), abs_params.size * sizeof(float), tag, MFC_GET_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); /* Calculate absolute values */ for (i = 0; i < abs_params.size; i++) { if (in_spe[i] > 0) { out_spe[i] = in_spe[i]; } else { out_spe[i] = in_spe[i] * -1; } } /* DMA Transfer 3 : PUT output data */ spu_mfcdma64(out_spe, mfc_ea2h(abs_params.ea_out), mfc_ea2l(abs_params.ea_out), abs_params.size * sizeof(float), tag, MFC_PUT_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); return 0; }
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm) { int i, j; int left, cnt; float time; unsigned int tag_id; vector float dt_v, dt_inv_mass_v; // Reserve a tag ID tag_id = mfc_tag_reserve(); spu_writech(MFC_WrTagMask, -1); // Input parameter parm is a pointer to the particle parameter context. // Fetch the context, waiting for it to complete. spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); dt_v = spu_splats(ctx.dt); // For each step in time for (time=0; time<END_OF_TIME; time += ctx.dt) { // For each block of particles for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) { // Determine the number of particles in this block. left = ctx.particles - i; cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; // Fetch the data - position, velocity and inverse_mass. Wait for the DMA to complete // before performing computation. spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_GETB_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_GET_CMD); spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i), cnt * sizeof(float), tag_id, MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); // Compute the step in time for the block of particles for (j=0; j<cnt; j++) { pos[j] = spu_madd(vel[j], dt_v, pos[j]); dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j])); vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]); } // Put the position and velocity data back into system memory spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD); } } // Wait for final DMAs to complete before terminating SPU thread. (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); return (0); }
/*
 * SPE program entry: increments a counter in main memory 10000 times.
 *
 * argp is the effective address of an int.  Each pass fetches it into the
 * externally defined counter, adds one, and writes it back, waiting for each
 * DMA on tag 0.  The fetch/increment/store sequence is not atomic.
 * spe and envp are not used here.
 *
 * Always returns 0.
 */
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    int pass;

    for (pass = 0; pass != 10000; ++pass) {
        /* Pull the current value in and wait for it. */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp),
                     sizeof(int), 0, MFC_GET_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        counter += 1;

        /* Push the new value out and wait for it. */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp),
                     sizeof(int), 0, MFC_PUT_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    return 0;
}
void GetSPEAddr( unsigned int ea, unsigned int *PPE_addr ) { //#ifdef _DEBUG //#if _DBGLVL > 1 // printf( "Getting SPE address @ %#x to %#x\n", ea, (unsigned int)PPE_addr ); //#endif //#endif Printf1( "SPE[%u]: Getting SPE address @ %#x to %#x\n", SPE_id, ea, (unsigned int)PPE_addr ); // Get STRUCTURE spu_mfcdma32( PPE_addr, ea, 16, 30, MFC_GET_CMD ); spu_writech( MFC_WrTagMask, 1 << 30 ); spu_mfcstat( MFC_TAG_UPDATE_ALL ); }
void GetShader( unsigned int EA, unsigned int size, unsigned int *shader ) { //#ifdef _DEBUG //#if _DBGLVL > 1 // printf( "Getting shader @ %#x to %#x(%u)\n", EA, (unsigned int)shader, size ); // printf( "Shader size is %u\n", size ); //#endif //#endif Printf1( "SPE[%u]: Getting shader @ %#x to %#x(%u)\n", SPE_id, EA, (unsigned int)shader, size ); // Get shader size = ( size + 15 ) &~ 15; spu_mfcdma32( shader, EA, size, 29, MFC_GET_CMD ); spu_writech( MFC_WrTagMask, 1 << 29 ); spu_mfcstat( MFC_TAG_UPDATE_ALL ); }
/*
 * Fetches an operation descriptor by DMA and waits for the transfer.
 *
 * ea   - source address handed to the DMA GET.
 * data - destination Operation_t receiving the descriptor.
 *
 * The transfer uses DMA tag 30; the function returns only after that tag
 * group has completed.
 *
 * NOTE(review): the transfer length is hard-coded to 32 bytes; presumably
 * this equals sizeof( Operation_t ) - confirm against the type definition.
 */
void GetOperation( unsigned int ea, Operation_t *data )
{
    /* sizeof yields size_t; cast so the argument matches the %u conversion. */
    Printf1( "SPE[%u]: Getting operation @ %#x to %#x(%u)\n", SPE_id, ea, (unsigned int)data, (unsigned int)sizeof( Operation_t ) );

    /* Get the structure. */
    spu_mfcdma32( data, ea, 32, 30, MFC_GET_CMD );

    /* Wait for the tag group to complete. */
    spu_writech( MFC_WrTagMask, 1 << 30 );
    spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
/*
 * Processes every (j, k) row of the local sub-domain: fetches the u and b
 * rows plus the b rows at j+1 and k+1 (periodic in box_ny / box_nz), builds
 * the averaged b3x row, calls tvd1 on the row and writes u back.
 *
 * fluidx_u, fluidx_b      - base addresses of the u (5 components) and
 *                           b (3 components) arrays used as DMA
 *                           source/target.
 * fluidx_nx/ny/nz         - pointers to the local extents; *fluidx_nx also
 *                           sizes the stack buffers.
 * fluidx_dt               - passed through to tvd1.
 * fluidx_Y_location,
 * fluidx_Z_location       - pointers to this SPE's position in Y and Z, used
 *                           to turn local (j, k) into global indices.
 * fluidx_tag_id           - DMA tag used for every transfer.
 *
 * The function never writes the tag mask itself; the waits rely on the mask
 * that is current when it is called.  All DMA buffers live on this
 * function's stack, so the last wait must stay before the return.
 */
void fluidx(data_type_t *fluidx_u, data_type_t *fluidx_b, int *fluidx_nx, int *fluidx_ny, int *fluidx_nz, data_type_t *fluidx_dt, int *fluidx_Y_location, int *fluidx_Z_location, int fluidx_tag_id)
{
    /* Local-store staging buffers, one row each. */
    volatile data_type_t row_u[5*(*fluidx_nx)] __attribute__ ((aligned (16)));
    volatile data_type_t row_b3x[3*(*fluidx_nx)] __attribute__ ((aligned (16)));
    volatile data_type_t row_b_jp[3*(*fluidx_nx)] __attribute__ ((aligned (16)));
    volatile data_type_t row_b_kp[3*(*fluidx_nx)] __attribute__ ((aligned (16)));
    data_type_t scratch_a[(*fluidx_nx)] __attribute__ ((aligned (16)));
    data_type_t scratch_b[(*fluidx_nx)] __attribute__ ((aligned (16)));

    int i, j, k;
    int gj, gk;       /* global row indices */
    int gj_next, gk_next; /* periodic successors of gj / gk */

    const int rows_per_spe_y = box_ny/num_SPE_Y;
    const int rows_per_spe_z = box_nz/num_SPE_Z;

    const int cells_per_row = (*fluidx_nx);
    const int u_elems = 5 * cells_per_row;
    const int b_elems = 3 * cells_per_row;

    data_type_t *u_row_ea;

    for (k = 0; k < (*fluidx_nz); k++) {
        for (j = 0; j < (*fluidx_ny); j++) {
            /* Translate the local indices into global ones. */
            gk = k + (*fluidx_Z_location) * rows_per_spe_z;
            gj = j + (*fluidx_Y_location) * rows_per_spe_y;

            /* Fetch u(:,:,j,k); barrier form, so it follows the previous row's PUT. */
            u_row_ea = fluidx_u + matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk));
            spu_mfcdma32((void *)(row_u), (unsigned int)(u_row_ea),
                         u_elems * sizeof(data_type_t), fluidx_tag_id, MFC_GETB_CMD);

            /* Fetch b(:,:,j,k). */
            spu_mfcdma32((void *)(row_b3x),
                         (unsigned int)(fluidx_b + matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk))),
                         b_elems * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);

            /* Periodic neighbours in j and k. */
            gj_next = (gj + 1) % (box_ny);
            gk_next = (gk + 1) % (box_nz);

            /* Fetch b(:,:,jp,k). */
            spu_mfcdma32((void *)(row_b_jp),
                         (unsigned int)(fluidx_b + matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(gj_next),(gk))),
                         b_elems * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);

            /* Fetch b(:,:,j,kp). */
            spu_mfcdma32((void *)(row_b_kp),
                         (unsigned int)(fluidx_b + matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk_next))),
                         b_elems * sizeof(data_type_t), fluidx_tag_id, MFC_GET_CMD);

            /* All four rows must be in local store before they are used. */
            (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

            /* Scale b3x by 0.5. */
            multiple_itself_matrix2D(row_b3x, 3, (*fluidx_nx), 0.5E0);

            /* Component 0 of b3x, circularly shifted by 1, ends up in scratch_b. */
            extract_matrix2Dto1D(row_b3x, 3, (*fluidx_nx), 0, scratch_a);
            cshift_matrix1D(scratch_a, (*fluidx_nx), 1, scratch_b);

            /* Add the shifted / neighbouring halves component by component. */
            for (i = 0; i < (*fluidx_nx); i++) {
                row_b3x[matrix2D(3,(*fluidx_nx),0,i)] += scratch_b[i];
                row_b3x[matrix2D(3,(*fluidx_nx),1,i)] += row_b_jp[matrix2D(3,(*fluidx_nx),1,i)]/2E0;
                row_b3x[matrix2D(3,(*fluidx_nx),2,i)] += row_b_kp[matrix2D(3,(*fluidx_nx),2,i)]/2E0;
            }

            tvd1(row_u, row_b3x, fluidx_nx, fluidx_dt);

            /* Write u(:,:,j,k) back. */
            spu_mfcdma32((void *)(row_u), (unsigned int)(u_row_ea),
                         u_elems * sizeof(data_type_t), fluidx_tag_id, MFC_PUT_CMD);
        }
    }

    /* The last PUT reads from a stack buffer: wait for it before returning. */
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
    return;
}
/*
 * Blocks until every DMA command queued under the given tag has finished.
 *
 * tag - MFC tag group to wait for; must be in the range 0..31.
 *
 * The mask is built with an unsigned shift so that tag 31 does not shift a
 * one into the sign bit of an int (undefined behaviour for a signed shift).
 */
static void wait_for_completion(int tag)
{
    mfc_write_tag_mask(1U << tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);
}
/** * Performs transport in a layer-block of the concentration matrix */ void discretize_z_block(uint64_t argvp) { /* Iterators */ uint32_t i, f, p, w; /* Arguments */ real_t dt; real_t size; uint32_t block; timer_start(&metrics.z_discret); /* Get arguments from main memory synchronously */ get_transport_argv(argvp, &dt, &size, &block); allocate_dma_list(&clist[0]); allocate_dma_list(&clist[1]); allocate_dma_list(&clist[2]); allocate_dma_list(&wlist[0]); allocate_dma_list(&wlist[1]); allocate_dma_list(&wlist[2]); allocate_dma_list(&dlist[0]); allocate_dma_list(&dlist[1]); allocate_dma_list(&dlist[2]); allocate_buffer(&conc[0]); allocate_buffer(&conc[1]); allocate_buffer(&conc[2]); allocate_buffer(&wind[0]); allocate_buffer(&wind[1]); allocate_buffer(&wind[2]); allocate_buffer(&diff[0]); allocate_buffer(&diff[1]); allocate_buffer(&diff[2]); allocate_buffer(&buff[0]); allocate_buffer(&buff[1]); allocate_buffer(&buff[2]); if(block == 1) { /* Start in_0 transfer */ fetch_z_buffer(0, 0); /* Process in_0 */ transport_buffer(0, size, dt); /* Write out_0 back to main memory */ write_z_buffer(0); wait_for_dma(0); } else { /* Start in_0 transfer */ fetch_z_buffer(0, 0); /* Start in_1 transfer */ fetch_z_buffer(1, 16); /* Process in_0 */ transport_buffer(0, size, dt); for(i=0; i<block-2; i++) { w = i % 3; p = (i+1) % 3; f = (i+2) % 3; /* Write buffer back to main memory */ write_z_buffer(w); /* Start buffer transfer */ fetch_z_buffer(f, (i+2)*16); /* Process buffer */ transport_buffer(p, size, dt); } /* Discretize final column vector */ w = i % 3; p = (i+1) % 3; /* Write out_b back to main memory */ write_z_buffer(w); /* Process in_(b+1) */ transport_buffer(p, size, dt); /* Write out_(b+1) back to main memory */ write_z_buffer(p); /* Make sure DMA is complete before we exit */ mfc_write_tag_mask( (1<<w) | (1<<p) ); spu_mfcstat(MFC_TAG_UPDATE_ALL); } free_dma_list(&clist[0]); free_dma_list(&clist[1]); free_dma_list(&clist[2]); free_dma_list(&wlist[0]); 
free_dma_list(&wlist[1]); free_dma_list(&wlist[2]); free_dma_list(&dlist[0]); free_dma_list(&dlist[1]); free_dma_list(&dlist[2]); free_buffer(&conc[0]); free_buffer(&conc[1]); free_buffer(&conc[2]); free_buffer(&wind[0]); free_buffer(&wind[1]); free_buffer(&wind[2]); free_buffer(&diff[0]); free_buffer(&diff[1]); free_buffer(&diff[2]); free_buffer(&buff[0]); free_buffer(&buff[1]); free_buffer(&buff[2]); timer_stop(&metrics.z_discret); set_status(SPE_STATUS_WAITING); }
/** * Performs transport in a row-block of the concentration matrix */ void discretize_x_block(uint64_t argvp) { uint32_t i, f, p, w; /* Arguments */ real_t dt; real_t size; uint32_t block; timer_start(&metrics.x_discret); /* Get arguments from main memory synchronously */ get_transport_argv(argvp, &dt, &size, &block); allocate_buffer(&conc[0]); allocate_buffer(&conc[1]); allocate_buffer(&conc[2]); allocate_buffer(&wind[0]); allocate_buffer(&wind[1]); allocate_buffer(&wind[2]); allocate_buffer(&diff[0]); allocate_buffer(&diff[1]); allocate_buffer(&diff[2]); allocate_buffer(&buff[0]); allocate_buffer(&buff[1]); allocate_buffer(&buff[2]); allocate_buffer(&shuffle[0]); allocate_buffer(&shuffle[1]); allocate_buffer(&shuffle[2]); allocate_buffer(&shuffle[3]); if(block == 1) { /* Start in_0 transfer */ fetch_x_buffer(0, 0); /* Process in_0 */ transport_buffer(0, size, dt); /* Write out_0 back to main memory */ write_x_buffer(0, 0); } else { /* Start in_0 transfer */ fetch_x_buffer(0, 0); /* Start in_1 transfer */ fetch_x_buffer(1, NX_ALIGNED_SIZE); /* Process in_0 */ transport_buffer(0, size, dt); /* Loop over rows in this block */ for(i=0; i<block-2; i++) { w = i % 3; p = (i+1) % 3; f = (i+2) % 3; /* Write out_b back to main memory */ write_x_buffer(w, i*NX_ALIGNED_SIZE); /* Start in_b transfer */ fetch_x_buffer(f, (i+2)*NX_ALIGNED_SIZE); /* Process in_(b+1) */ transport_buffer(p, size, dt); } /* Discretize final row */ w = i % 3; p = (i+1) % 3; /* Write out_b back to main memory */ write_x_buffer(w, i*NX_ALIGNED_SIZE); /* Process in_(b+1) */ transport_buffer(p, size, dt); /* Write out_(b+1) back to main memory */ write_x_buffer(p, (i+1)*NX_ALIGNED_SIZE); /* Make sure DMA is complete before we exit */ mfc_write_tag_mask( (1<<w) | (1<<p) ); spu_mfcstat(MFC_TAG_UPDATE_ALL); } free_buffer(&conc[0]); free_buffer(&conc[1]); free_buffer(&conc[2]); free_buffer(&wind[0]); free_buffer(&wind[1]); free_buffer(&wind[2]); free_buffer(&diff[0]); free_buffer(&diff[1]); 
free_buffer(&diff[2]); free_buffer(&buff[0]); free_buffer(&buff[1]); free_buffer(&buff[2]); free_buffer(&shuffle[0]); free_buffer(&shuffle[1]); free_buffer(&shuffle[2]); free_buffer(&shuffle[3]); timer_stop(&metrics.x_discret); /* Signal PPE */ set_status(SPE_STATUS_WAITING); }
void process_buffer(int buffer, int cnt, vector float dt_v) { int i; volatile vector float *p_inv_mass_v; vector float force_v, inv_mass_v; vector float pos0, pos1, pos2, pos3; vector float vel0, vel1, vel2, vel3; vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3; vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7}; vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11}; vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15}; p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; force_v = ctx.force_v; // Compute the step in time for the block of particles, four // particle at a time. for (i=0; i<cnt; i+=4) { inv_mass_v = *p_inv_mass_v++; pos0 = pos[buffer][i+0]; pos1 = pos[buffer][i+1]; pos2 = pos[buffer][i+2]; pos3 = pos[buffer][i+3]; vel0 = vel[buffer][i+0]; vel1 = vel[buffer][i+1]; vel2 = vel[buffer][i+2]; vel3 = vel[buffer][i+3]; dt_inv_mass_v = spu_mul(dt_v, inv_mass_v); pos0 = spu_madd(vel0, dt_v, pos0); pos1 = spu_madd(vel1, dt_v, pos1); pos2 = spu_madd(vel2, dt_v, pos2); pos3 = spu_madd(vel3, dt_v, pos3); dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0); dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1); dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2); dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3); vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0); vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1); vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2); vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3); pos[buffer][i+0] = pos0; pos[buffer][i+1] = pos1; pos[buffer][i+2] = pos2; pos[buffer][i+3] = pos3; vel[buffer][i+0] = vel0; vel[buffer][i+1] = vel1; 
vel[buffer][i+2] = vel2; vel[buffer][i+3] = vel3; } } int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv) { int buffer, next_buffer; int cnt, next_cnt, left; float time, dt; vector float dt_v; volatile vector float *ctx_pos_v, *ctx_vel_v; volatile vector float *next_ctx_pos_v, *next_ctx_vel_v; volatile float *ctx_inv_mass, *next_ctx_inv_mass; unsigned int tags[2]; // Reserve a pair of DMA tag IDs tags[0] = mfc_tag_reserve(); tags[1] = mfc_tag_reserve(); // Input parameter argv is a pointer to the particle context. // Fetch the parameter context, waiting for it to complete. spu_writech(MFC_WrTagMask, 1 << tags[0]); spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); dt = ctx.dt; dt_v = spu_splats(dt); // For each step in time for (time=0; time<END_OF_TIME; time += dt) { // For each double buffered block of particles left = ctx.particles; cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; ctx_pos_v = ctx.pos_v; ctx_vel_v = ctx.vel_v; ctx_inv_mass = ctx.inv_mass; // Prefetch first buffer of input data. buffer = 0; spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD); spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD); while (cnt < left) { left -= cnt; next_ctx_pos_v = ctx_pos_v + cnt; next_ctx_vel_v = ctx_vel_v + cnt; next_ctx_inv_mass = ctx_inv_mass + cnt; next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; // Prefetch next buffer so the data is available for computation on next loop iteration. // The first DMA is barriered so that we don't GET data before the previous iteration's // data is PUT. 
next_buffer = buffer^1; spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD); spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD); spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD); // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); ctx_pos_v = next_ctx_pos_v; ctx_vel_v = next_ctx_vel_v; ctx_inv_mass = next_ctx_inv_mass; buffer = next_buffer; cnt = next_cnt; } // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); // Wait for DMAs to complete before starting the next step in time. spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); } return (0); }
/*
 * Processes every (j, k) row of the local sub-domain: fetches u at j and at
 * the periodic predecessor jm, fetches b at j, derives a smoothed velocity
 * row vx, calls tvdb, updates components 0 and 1 of b and writes b plus the
 * shifted flux row back to main memory.
 *
 * advectbyzxA1_u, advectbyzxA1_b - base addresses of the u (5 components)
 *                                  and b (3 components) arrays.
 * advectbyzxA1_nx/ny/nz          - pointers to the local extents;
 *                                  *advectbyzxA1_nx also sizes the stack
 *                                  buffers.
 * advectbyzxA1_dt                - passed through to tvdb.
 * advectbyzxA1_adv_tmp           - base address receiving the shifted flux
 *                                  row for each (j, k).
 * advectbyzxA1_Y_location,
 * advectbyzxA1_Z_location        - pointers to this SPE's position in Y and
 *                                  Z, used to form global indices.
 * advectbyzxA1_tag_id            - DMA tag used for every transfer.
 *
 * The function never writes the tag mask itself; the waits rely on the mask
 * that is current when it is called.  All DMA buffers live on this
 * function's stack, so the last wait must stay before the return.
 */
void advectbyzxA1(data_type_t *advectbyzxA1_u, data_type_t *advectbyzxA1_b, int *advectbyzxA1_nx, int *advectbyzxA1_ny, int *advectbyzxA1_nz, data_type_t *advectbyzxA1_dt, data_type_t *advectbyzxA1_adv_tmp, int *advectbyzxA1_Y_location, int *advectbyzxA1_Z_location, int advectbyzxA1_tag_id)
{
    /* Local-store staging buffers, one row each. */
    volatile data_type_t row_u_jm[5*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    volatile data_type_t row_u[5*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    volatile data_type_t row_b[3*(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    data_type_t fluxbx[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    data_type_t b1x[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    data_type_t vx[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    data_type_t shift_a[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));
    data_type_t shift_b[(*advectbyzxA1_nx)] __attribute__ ((aligned (16)));

    int i, j, k;
    int gj, gk;   /* global row indices */
    int gj_prev;  /* periodic predecessor of gj */

    const int rows_per_spe_y = box_ny/num_SPE_Y;
    const int rows_per_spe_z = box_nz/num_SPE_Z;

    const int cells_per_row = (*advectbyzxA1_nx);
    const int u_elems = 5 * cells_per_row;
    const int b_elems = 3 * cells_per_row;

    for (k = 0; k < (*advectbyzxA1_nz); k++) {
        for (j = 0; j < (*advectbyzxA1_ny); j++) {
            /* Translate the local indices into global ones. */
            gk = k + (*advectbyzxA1_Z_location) * rows_per_spe_z;
            gj = j + (*advectbyzxA1_Y_location) * rows_per_spe_y;

            /* Periodic predecessor of gj. */
            gj_prev = (gj + (box_ny) - 1) % (box_ny);

            /* Fetch u(:,:,j,k); barrier form, so it follows the previous row's PUTs. */
            spu_mfcdma32((void *)(row_u),
                         (unsigned int)(advectbyzxA1_u + matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk))),
                         u_elems * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GETB_CMD);

            /* Fetch u(:,:,jm,k). */
            spu_mfcdma32((void *)(row_u_jm),
                         (unsigned int)(advectbyzxA1_u + matrix4D(5,(box_nx),(box_ny),(box_nz),0,0,(gj_prev),(gk))),
                         u_elems * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GET_CMD);

            /* Fetch b(:,:,j,k). */
            spu_mfcdma32((void *)(row_b),
                         (unsigned int)(advectbyzxA1_b + matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk))),
                         b_elems * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_GET_CMD);

            /* All three rows must be in local store before they are used. */
            (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

            /* vx = (u(2,:,jm,k) + u(2,:,j,k)) / (u(1,:,jm,k) + u(1,:,j,k)) */
            for (i = 0; i < (*advectbyzxA1_nx); i++) {
                vx[i] = (row_u_jm[matrix2D(5,(*advectbyzxA1_nx),1,i)] + row_u[matrix2D(5,(*advectbyzxA1_nx),1,i)])
                      / (row_u_jm[matrix2D(5,(*advectbyzxA1_nx),0,i)] + row_u[matrix2D(5,(*advectbyzxA1_nx),0,i)]);
            }

            /* Circular shifts of vx by -1 and +1. */
            cshift_matrix1D(vx, (*advectbyzxA1_nx), (-1), shift_a);
            cshift_matrix1D(vx, (*advectbyzxA1_nx), (1), shift_b);

            /* vx = (cshift(vx,-1) + cshift(vx,1) + 2*vx) / 4, and b1x = b(2,:,j,k) */
            for (i = 0; i < (*advectbyzxA1_nx); i++) {
                vx[i] = (shift_a[i] + shift_b[i] + 2.0E0*vx[i])/4.0E0;
                b1x[i] = row_b[matrix2D(3,(*advectbyzxA1_nx),1,i)];
            }

            tvdb(fluxbx, b1x, vx, advectbyzxA1_nx, advectbyzxA1_dt);

            /* shift_a = cshift(fluxbx, -1) */
            cshift_matrix1D(fluxbx, (*advectbyzxA1_nx), (-1), shift_a);

            /* b(2,:,j,k) = b1x ; b(1,:,j,k) -= cshift(fluxbx,-1) */
            for (i = 0; i < (*advectbyzxA1_nx); i++) {
                row_b[matrix2D(3,(*advectbyzxA1_nx),1,i)] = b1x[i];
                row_b[matrix2D(3,(*advectbyzxA1_nx),0,i)] -= shift_a[i];
            }

            /* Write b(:,:,j,k) back (barrier form). */
            spu_mfcdma32((void *)(row_b),
                         (unsigned int)(advectbyzxA1_b + matrix4D(3,(box_nx),(box_ny),(box_nz),0,0,(gj),(gk))),
                         b_elems * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_PUTB_CMD);

            /* Write the shifted flux row out. */
            spu_mfcdma32((void *)(shift_a),
                         (unsigned int)(advectbyzxA1_adv_tmp + matrix3D((box_nx),(box_ny),(box_nz),0,(gj),(gk))),
                         cells_per_row * sizeof(data_type_t), advectbyzxA1_tag_id, MFC_PUT_CMD);
        }
    }

    /* The last PUTs read from stack buffers: wait for them before returning. */
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
    return;
}