/* Appends two batches of particles to the end of a growable particle buffer.
 *
 *   dst_buffer        in/out: pointer to the destination buffer; replaced if
 *                     the buffer has to be reallocated
 *   position          in/out: number of particles currently stored; advanced
 *                     by n_src_particles + n_src_particles2
 *   buffer_size       in/out: capacity of the destination buffer, in particles
 *   src_buffer        first batch to append (n_src_particles entries)
 *   src_buffer2       second batch to append (n_src_particles2 entries)
 *
 * If both batches do not fit, a new buffer with room for the old capacity
 * plus twice the incoming count is allocated, the stored particles are copied
 * over, and the old buffer is released. An allocation failure aborts the job. */
void attach_received_particles(particle_t **dst_buffer, uint64_t *position, uint64_t *buffer_size,
                               particle_t *src_buffer, uint64_t n_src_particles,
                               particle_t *src_buffer2, uint64_t n_src_particles2)
{
  const uint64_t used     = *position;
  const uint64_t capacity = *buffer_size;
  const uint64_t incoming = n_src_particles + n_src_particles2;
  particle_t    *dest     = *dst_buffer;

  if (used + incoming > capacity) {
    /* Not enough room: move the stored particles into a larger buffer */
    const uint64_t grown_capacity = capacity + 2*incoming;
    particle_t *grown = (particle_t*) prk_malloc(grown_capacity * sizeof(particle_t));

    if (!grown) {
      printf("Could not increase particle buffer size\n");
      /* do not attempt graceful exit; just allow code to abort */
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    memcpy(grown, dest, used*sizeof(particle_t));
    prk_free(dest);

    dest           = grown;
    *dst_buffer    = grown;
    *buffer_size   = grown_capacity;
  }

  /* First batch goes right after the particles already stored ... */
  memcpy(&dest[used], src_buffer, n_src_particles * sizeof(particle_t));
  (*position) += n_src_particles;

  /* ... and the second batch right after the first */
  memcpy(&dest[*position], src_buffer2, n_src_particles2 * sizeof(particle_t));
  (*position) += n_src_particles2;
}
/* Adds a particle to a buffer. Resizes buffer if need be.
 *
 *   p            particle to store (copied by value)
 *   buffer       in/out: pointer to the buffer; replaced if it is reallocated
 *   position     in/out: index at which p is stored; incremented on return
 *   buffer_size  in/out: capacity of the buffer, in particles
 *
 * The buffer grows whenever the write index is not inside the current
 * capacity. Capacity is normally doubled, but is never set below
 * position + 1, so a buffer whose capacity is 0 also grows correctly
 * (doubling 0 would request no storage and the store below would then
 * write out of bounds). An allocation failure aborts the job. */
void add_particle_to_buffer(particle_t p, particle_t **buffer, uint64_t *position, uint64_t *buffer_size)
{
  uint64_t cur_pos = (*position);
  uint64_t cur_buf_size = (*buffer_size);
  particle_t *cur_buffer = (*buffer);

  if (cur_pos >= cur_buf_size) {
    /* Have to resize buffer */
    uint64_t new_buf_size = 2 * cur_buf_size;
    particle_t *temp_buf;

    /* doubling is not enough when the capacity is 0 (or the index is
       already past the end): guarantee room for slot cur_pos            */
    if (new_buf_size <= cur_pos) new_buf_size = cur_pos + 1;

    temp_buf = (particle_t*) prk_malloc(new_buf_size * sizeof(particle_t));
    if (!temp_buf) {
      printf("Could not increase particle buffer size\n");
      /* do not attempt graceful exit; just allow code to abort */
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    /* nothing to carry over or release if there was no buffer yet */
    if (cur_buffer) {
      memcpy(temp_buf, cur_buffer, cur_buf_size*sizeof(particle_t));
      prk_free(cur_buffer);
    }

    cur_buffer = temp_buf;
    (*buffer) = temp_buf;
    (*buffer_size) = new_buf_size;
  }

  cur_buffer[cur_pos] = p;
  (*position)++;
}
/* Ensures a particle buffer can hold at least new_size particles.
 *
 *   buffer    in/out: pointer to the buffer; replaced when it is reallocated
 *   size      in/out: capacity of the buffer, in particles
 *   new_size  required number of particles
 *
 * Nothing happens when the current capacity already suffices. Otherwise the
 * old buffer is released and a fresh one with capacity 2*new_size is
 * allocated; the previous contents are NOT copied into the new buffer.
 * An allocation failure aborts the job. */
void resize_buffer(particle_t **buffer, uint64_t *size, uint64_t new_size)
{
  /* current capacity is sufficient: leave the buffer untouched */
  if (new_size <= (*size)) return;

  /* over-allocate by a factor of two */
  const uint64_t grown_capacity = 2*new_size;

  prk_free(*buffer);
  *buffer = (particle_t*) prk_malloc(grown_capacity*sizeof(particle_t));

  if (*buffer == NULL) {
    printf("Could not increase particle buffer size\n");
    /* do not attempt graceful exit; just allow code to abort */
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
  }

  *size = grown_capacity;
}
int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ int Block_size; /* size of a single block */ int Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ int order; /* order of overall matrix */ int bufferCount; /* number of input buffers */ int targetBuffer; /* buffer with which to communicate */ int send_to, recv_from; /* ranks with which to communicate */ long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ long i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int error; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double **Work_in_p; /* workspace for the transpose function */ double *Work_out_p; /* workspace for the transpose function */ double epsilon = 1.e-8; /* error tolerance */ double avgtime; /* timing parameters */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk; /* work space for SHMEM collectives */ double *local_trans_time, *trans_time; /* timing parameters */ double *abserr, *abserr_tot; /* local and aggregate error */ int *send_flag, *recv_flag; /* synchronization flags */ int *arguments; /* command line arguments */ /********************************************************************* ** Initialize the SHMEM environment *********************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM matrix transpose: B = A^T\n"); } // initialize sync variables for error checks 
pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE); local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); arguments = (int *) prk_shmem_align(prk_get_alignment(),4*sizeof(int)); abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double)); abserr_tot = abserr + 1; if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time || !trans_time || !arguments || !abserr) { printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID); error = 1; goto ENDOFTESTS; } for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { if (argc != 4 && argc != 5){ printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); arguments[1]=order; if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } bufferCount = atoi(*++argv); arguments[2]=bufferCount; if (Num_procs > 1) { if ((bufferCount < 1) || (bufferCount >= Num_procs)) { 
printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount); error = 1; goto ENDOFTESTS; } } if (argc == 5) Tile_order = atoi(*++argv); arguments[3]=Tile_order; ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); printf("Number of buffers = %d\n", bufferCount); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); } shmem_barrier_all(); /* Broadcast input data to all ranks */ shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; order=arguments[1]; bufferCount=arguments[2]; Tile_order=arguments[3]; shmem_barrier_all(); prk_shmem_free(arguments); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. 
*********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_in_p = (double**)prk_malloc(bufferCount*sizeof(double)); Work_out_p = (double *) prk_malloc(Block_size*sizeof(double)); recv_flag = (int*) prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int)); if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){ printf(" Error allocating space for work or flags on node %d\n",my_ID); error = 1; } if (bufferCount < (Num_procs - 1)) { send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int)); if (send_flag == NULL) { printf("Error allocating space for flags on node %d\n", my_ID); error = 1; } } bail_out(error); for(i=0;i<bufferCount;i++) { Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double)); if (Work_in_p[i] == NULL) { printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); } if (bufferCount < (Num_procs - 1)) { for(i=0;i<(Num_procs-1);i++) send_flag[i]=0; } for(i=0;i<bufferCount;i++) recv_flag[i]=0; } /* Fill the original column matrices */ istart = 0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = 
0.0; } shmem_barrier_all(); if (bufferCount < (Num_procs - 1)) { if (Num_procs > 1) { for ( i = 0; i < bufferCount; i++) { recv_from = (my_ID + i + 1)%Num_procs; shmem_int_inc(&send_flag[i], recv_from); } } } shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_trans_time[0] = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } for (phase=1; phase<Num_procs; phase++){ recv_from = (my_ID + phase )%Num_procs; send_to = (my_ID - phase + Num_procs)%Num_procs; targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount; istart = send_to*Block_order; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); A(it,jt) += 1.0; } } if (bufferCount < (Num_procs - 1)) shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1); shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to); shmem_fence(); shmem_int_inc(&recv_flag[targetBuffer], send_to); i = (iter * (Num_procs - 1) + phase) / bufferCount; if ((iter * (Num_procs - 1) + phase) % bufferCount) i++; shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=0; j<Block_order; j++) for (i=0; i<Block_order; i++) B(i,j) += Work_in(targetBuffer, i,j); if 
(bufferCount < (Num_procs - 1)) { if ((phase + bufferCount) < Num_procs) recv_from = (my_ID + phase + bufferCount) % Num_procs; else recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs; shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from); } } /* end of phase loop */ } /* end of iterations */ local_trans_time[0] = wtime() - local_trans_time[0]; shmem_barrier_all(); shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce); abserr[0] = 0.0; istart = 0; double addit = ((double)(iterations+1) * (double) (iterations))/2.0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit)); } shmem_barrier_all(); shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce); if (my_ID == root) { if (abserr_tot[0] <= epsilon) { printf("Solution validates\n"); avgtime = trans_time[0]/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr[0]); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon); error = 1; } } bail_out(error); if (Num_procs>1) { if (bufferCount < (Num_procs - 1)) prk_shmem_free(send_flag); prk_shmem_free(recv_flag); prk_free(Work_out_p); for(i=0;i<bufferCount;i++) prk_shmem_free(Work_in_p[i]); prk_free(Work_in_p); } prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk); prk_shmem_finalize(); exit(EXIT_SUCCESS); } /* end of main */