/*inline*/ void globalMax_double(LSMSCommunication &comm, double &a)
{
  // Reduction source buffer. It is static so that it outlives the call;
  // presumably this is also what makes it a valid (symmetric) SHMEM source
  // object -- confirm against the SHMEM implementation in use.
  static double local_contrib;

  const int pe_start  = comm.comm.start_pe;
  const int pe_stride = comm.comm.logPE_stride;
  const int pe_count  = comm.comm.size;

  // Synchronize the active set first, so the static buffer is not overwritten
  // while another PE may still be reading it from a previous reduction.
  shmem_barrier(pe_start, pe_stride, pe_count, pSync1);

  local_contrib = a;

  // Maximum over the active set; the result is written back into `a`.
  shmem_double_max_to_all(&a, &local_contrib, 1,
                          pe_start, pe_stride, pe_count,
                          pWrk_d, pSync1);
}
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* SHMEM rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in[2]; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in[2];/* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in[2]; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in[2]; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ int stencil_size; /* number of points in the stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double avgtime, /* timing parameters */ *local_stencil_time, *stencil_time; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int *arguments; /* command line parameters */ int count_case=4; /* number of neighbors of a rank */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk_time; /* work space for collectives */ DTYPE *pWrk_norm; /* work space for 
collectives */ int *iterflag; /* synchronization flags */ int sw; /* double buffering switch */ DTYPE *local_norm, *norm; /* local and global error norms */ /******************************************************************************* ** Initialize the SHMEM environment ********************************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); pSync_bcast = (long *) prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk_time = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double)); pWrk_norm = (DTYPE *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE)); local_stencil_time = (double *) prk_shmem_malloc(sizeof(double)); stencil_time = (double *) prk_shmem_malloc(sizeof(double)); local_norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); iterflag = (int *) prk_shmem_malloc(2*sizeof(int)); if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag && local_stencil_time && stencil_time && local_norm && norm)) { printf("Could not allocate scalar variables on rank %d\n", my_ID); error = 1; } bail_out(error); for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; arguments=(int*)prk_shmem_malloc(2*sizeof(int)); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 3){ printf("Usage: %s <# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if 
(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); arguments[1]=n; long nsquare = (long)n * (long)n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; iterflag[0] = iterflag[1] = 0; if(my_IDx==0) count_case--; if(my_IDx==Num_procsx-1) count_case--; if(my_IDy==0) count_case--; if(my_IDy==Num_procsy-1) count_case--; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM stencil execution on 2D grid\n"); printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #ifdef DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif #if SPLITFENCE printf("Split fence = ON\n"); #else printf("Split fence = OFF\n"); #endif printf("Number of iterations = %d\n", iterations); 
} shmem_barrier_all(); shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; n=arguments[1]; shmem_barrier_all(); prk_shmem_free(arguments); /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS); total_length_in *= (height+2*RADIUS); total_length_in *= sizeof(DTYPE); total_length_out = width; total_length_out *= height; total_length_out *= sizeof(DTYPE); in = (DTYPE *) malloc(total_length_in); out = (DTYPE *) malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm[0] = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and 
output arrays */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); bottom_buf_out = top_buf_out+RADIUS*width; top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width); if(!top_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID); error=1; } bail_out(error); top_buf_in[1] = top_buf_in[0] + RADIUS*width; bottom_buf_in[0] = top_buf_in[1] + RADIUS*width; bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width; right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); left_buf_out=right_buf_out+RADIUS*height; right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height); if(!right_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID); error=1; } bail_out(error); right_buf_in[1] = right_buf_in[0] + RADIUS*height; left_buf_in[0] = right_buf_in[1] + RADIUS*height; left_buf_in[1] = left_buf_in[0] + RADIUS*height; /* make sure all symmetric heaps are allocated before being used */ shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_stencil_time[0] = wtime(); } /* sw determines which incoming buffer to select */ sw = iter%2; /* need to fetch ghost point data from neighbors */ if (my_IDy < Num_procsy-1) { for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr); #if SPLITFENCE shmem_fence(); 
shmem_int_inc(&iterflag[sw], top_nbr); #endif } if (my_IDy > 0) { for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], bottom_nbr); #endif } if(my_IDx < Num_procsx-1) { for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) { right_buf_out[kk++]=IN(i,j); } shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], right_nbr); #endif } if(my_IDx>0) { for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) { left_buf_out[kk++]=IN(i,j); } shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], left_nbr); #endif } #if SPLITFENCE == 0 shmem_fence(); if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr); if(my_IDy>0) shmem_int_inc(&iterflag[sw], bottom_nbr); if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr); if(my_IDx>0) shmem_int_inc(&iterflag[sw], left_nbr); #endif shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1)); if (my_IDy < Num_procsy-1) { for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[sw][kk++]; } } if (my_IDy > 0) { for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[sw][kk++]; } } if (my_IDx < Num_procsx-1) { for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) { IN(i,j) = right_buf_in[sw][kk++]; } } if (my_IDx > 0) { for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[sw][kk++]; } } /* Apply the stencil operator */ for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" 
#else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0; } local_stencil_time[0] = wtime() - local_stencil_time[0]; shmem_barrier_all(); shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0, Num_procs, pWrk_time, pSync_reduce); /* compute L1 norm in parallel */ local_norm[0] = (DTYPE) 0.0; for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) { for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) { local_norm[0] += (DTYPE)ABS(OUT(i,j)); } } shmem_barrier_all(); #ifdef DOUBLE shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #else shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #endif /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm[0] /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm[0]-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm[0], reference_norm); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm[0]); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time[0]/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } prk_shmem_free(top_buf_in); prk_shmem_free(right_buf_in); free(top_buf_out); free(right_buf_out); prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk_time); prk_shmem_free(pWrk_norm); prk_shmem_finalize(); exit(EXIT_SUCCESS); }
int max_to_all(int me, int npes) { int i, j, pass=0; memset(ok,0,sizeof(ok)); for (i = 0; i < N; i++) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i; } shmem_barrier_all(); shmem_short_max_to_all( dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_max_to_all( dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_max_to_all( dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_float_max_to_all( dst3, src3, N, 0, 0, npes, pWrk3, pSync1); shmem_double_max_to_all( dst4, src4, N, 0, 0, npes, pWrk4, pSync); shmem_longdouble_max_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1); shmem_longlong_max_to_all( dst6, src6, N, 0, 0, npes, pWrk6, pSync); if (me == 0) { for (i = 0,j=-1; i < N; i++,j++) { if(dst0[i] != npes+j) ok[0] = 1; if(dst1[i] != npes+j) ok[1] = 1; if(dst2[i] != npes+j) ok[2] = 1; if(dst3[i] != npes+j) ok[3] = 1; if(dst4[i] != npes+j) ok[4] = 1; if(dst5[i] != npes+j) ok[5] = 1; if(dst6[i] != npes+j) ok[6] = 1; } if(ok[0]==1){ printf("Reduction operation shmem_short_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_short_max_to_all: Passed\n"); pass++; } if(ok[1]==1){ printf("Reduction operation shmem_int_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_int_max_to_all: Passed\n"); pass++; } if(ok[2]==1){ printf("Reduction operation shmem_long_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_long_max_to_all: Passed\n"); pass++; } if(ok[3]==1){ printf("Reduction operation shmem_float_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_float_max_to_all: Passed\n"); pass++; } if(ok[4]==1){ printf("Reduction operation shmem_double_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_double_max_to_all: Passed\n"); pass++; } if(ok[5]==1){ printf("Reduction operation shmem_longdouble_max_to_all: Failed\n"); } else{ Vprintf("Reduction operation shmem_longdouble_max_to_all: Passed\n"); pass++; } if(ok[6]==1){ printf("Reduction operation shmem_longlong_max_to_all: 
Failed\n"); } else{ Vprintf("Reduction operation shmem_longlong_max_to_all: Passed\n"); pass++; } Vprintf("\n"); } if (Serialize) shmem_barrier_all(); return (pass == 7 ? 1 : 0); }
int main() { int i,j; int me, npes; int success0, success1, success2, success3, success4, success5, success6; success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0; start_pes(0); me = _my_pe(); npes = _num_pes(); for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1) { pSync[i] = _SHMEM_SYNC_VALUE; pSync1[i] = _SHMEM_SYNC_VALUE; } for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i; } /*Test MAX: shmem_double_max_to_all, shmem_float_max_to_all, shmem_int_max_to_all, shmem_long_max_to_all, shmem_longdouble_max_to_all, shmem_longlong_max_to_all, shmem_short_max_to_all */ shmem_barrier_all(); shmem_short_max_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_max_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_max_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_float_max_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1); shmem_double_max_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync); shmem_longdouble_max_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1); shmem_longlong_max_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync); if(me == 0){ for (i = 0,j=-1; i < N; i++,j++) { if(dst0[i] != npes+j) success0 =1; if(dst1[i] != npes+j) success1 =1; if(dst2[i] != npes+j) success2 =1; if(dst3[i] != npes+j) success3 =1; if(dst4[i] != npes+j) success4 =1; if(dst5[i] != npes+j) success5 =1; if(dst6[i] != npes+j) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_max_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_max_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_max_to_all: Passed\n"); } if(success3==1){ printf("Reduction operation shmem_float_max_to_all: Failed\n"); } else{ printf("Reduction 
operation shmem_float_max_to_all: Passed\n"); } if(success4==1){ printf("Reduction operation shmem_double_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_double_max_to_all: Passed\n"); } if(success5==1){ printf("Reduction operation shmem_longdouble_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longdouble_max_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_max_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_max_to_all: Passed\n"); } } /*Test MIN: shmem_double_min_to_all, shmem_float_min_to_all, shmem_int_min_to_all, shmem_long_min_to_all, shmem_longdouble_min_to_all, shmem_longlong_min_to_all, shmem_short_min_to_all*/ success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i; } for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst3[i] = -9; dst4[i] = -9; dst5[i] = -9; dst6[i] = -9; } shmem_barrier_all(); shmem_short_min_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_min_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_min_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_float_min_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1); shmem_double_min_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync); shmem_longdouble_min_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1); shmem_longlong_min_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync); if(me == 0){ for (i = 0; i < N; i++) { if(dst0[i] != i) success0 =1; if(dst1[i] != i) success1 =1; if(dst2[i] != i) success2 =1; if(dst3[i] != i) success3 =1; if(dst4[i] != i) success4 =1; if(dst5[i] != i) success5 =1; if(dst6[i] != i) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_min_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_min_to_all: 
Failed\n"); } else{ printf("Reduction operation shmem_int_min_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_min_to_all: Passed\n"); } if(success3==1){ printf("Reduction operation shmem_float_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_float_min_to_all: Passed\n"); } if(success4==1){ printf("Reduction operation shmem_double_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_double_min_to_all: Passed\n"); } if(success5==1){ printf("Reduction operation shmem_longdouble_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longdouble_min_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_min_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_min_to_all: Passed\n"); } } /*Test SUM: shmem_double_sum_to_all, shmem_float_sum_to_all, shmem_int_sum_to_all, shmem_long_sum_to_all, shmem_longdouble_sum_to_all, shmem_longlong_sum_to_all, shmem_short_sum_to_all*/ success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me; } for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst3[i] = -9; dst4[i] = -9; dst5[i] = -9; dst6[i] = -9; } shmem_barrier_all(); shmem_short_sum_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_sum_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_sum_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_float_sum_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1); shmem_double_sum_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync); shmem_longdouble_sum_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1); shmem_longlong_sum_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync); if(me == 0){ for (i = 0; i < N; i++) { if(dst0[i] != (npes * (npes-1)/2)) success0 =1; if(dst1[i] != (npes * (npes-1)/2)) success1 =1; 
if(dst2[i] != (npes * (npes-1)/2)) success2 =1; if(dst3[i] != (npes * (npes-1)/2)) success3 =1; if(dst4[i] != (npes * (npes-1)/2)) success4 =1; if(dst5[i] != (npes * (npes-1)/2)) success5 =1; if(dst6[i] != (npes * (npes-1)/2)) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_sum_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_sum_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_sum_to_all: Passed\n"); } if(success3==1){ printf("Reduction operation shmem_float_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_float_sum_to_all: Passed\n"); } if(success4==1){ printf("Reduction operation shmem_double_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_double_sum_to_all: Passed\n"); } if(success5==1){ printf("Reduction operation shmem_longdouble_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longdouble_sum_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_sum_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_sum_to_all: Passed\n"); } } /*Test AND: shmem_int_and_to_all, shmem_long_and_to_all, shmem_longlong_and_to_all, shmem_short_and_to_all,*/ success0 = success1 = success2 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src6[i] = me; } for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst6[i] = -9; } shmem_barrier_all(); shmem_short_and_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_and_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_and_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_longlong_and_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1); if(me==0){ for (i = 0; i < N; i++) { if(dst0[i] != 0) 
success0 =1; if(dst1[i] != 0) success1 =1; if(dst2[i] != 0) success2 =1; if(dst6[i] != 0) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_and_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_and_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_and_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_and_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_and_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_and_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_and_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_and_to_all: Passed\n"); } } /*Test PROD: shmem_double_prod_to_all, shmem_float_prod_to_all, shmem_int_prod_to_all, shmem_long_prod_to_all, shmem_longdouble_prod_to_all, shmem_longlong_prod_to_all, shmem_short_prod_to_all, */ success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + 1; } for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst3[i] = -9; dst4[i] = -9; dst5[i] = -9; dst6[i] = -9; } expected_result0 = expected_result1 = expected_result2 = expected_result3 = expected_result4 = expected_result5 = expected_result6 =1; for(i=1;i<=npes;i++){ expected_result0 = expected_result0 * i; expected_result1 = expected_result1 * i; expected_result2 = expected_result2 * i; expected_result3 = expected_result3 * i; expected_result4 = expected_result4 * i; expected_result5 = expected_result5 * i; expected_result6 = expected_result6 * i; } shmem_barrier_all(); shmem_short_prod_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_prod_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_prod_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_float_prod_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1); 
shmem_double_prod_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync); shmem_longdouble_prod_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1); shmem_longlong_prod_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync); if(me == 0){ for (i = 0; i < N; i++) { /*printf("dst2[%d]: %ld, expected val: %ld\n",i, dst2[i], (long)expected_result2);*/ if(dst0[i] != expected_result0) success0 =1; if(dst1[i] != expected_result1) success1 =1; if(dst2[i] != expected_result2) success2 =1; if(dst3[i] != expected_result3) success3 =1; if(dst4[i] != expected_result4) success4 =1; if(dst5[i] != expected_result5) success5 =1; if(dst6[i] != expected_result6) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_prod_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_prod_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_prod_to_all: Passed\n"); } if(success3==1){ printf("Reduction operation shmem_float_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_float_prod_to_all: Passed\n"); } if(success4==1){ printf("Reduction operation shmem_double_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_double_prod_to_all: Passed\n"); } if(success5==1){ printf("Reduction operation shmem_longdouble_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longdouble_prod_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_prod_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_prod_to_all: Passed\n"); } } /*Test OR: shmem_int_or_to_all, shmem_long_or_to_all, shmem_longlong_or_to_all, shmem_short_or_to_all,*/ success0 = success1 = success2 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src6[i] = (me + 1)%4; } 
for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst6[i] = -9; } shmem_barrier_all(); shmem_short_or_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_or_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_or_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_longlong_or_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1); if(me==0){ for (i = 0; i < N; i++) { if(dst0[i] != 3) success0 =1; if(dst1[i] != 3) success1 =1; if(dst2[i] != 3) success2 =1; if(dst6[i] != 3) success6 =1; } if(success0==1){ printf("Reduction operation shmem_short_or_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_or_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_or_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_or_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_or_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_or_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_or_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_or_to_all: Passed\n"); } } /*Test XOR: shmem_int_xor_to_all, shmem_long_xor_to_all, shmem_longlong_xor_to_all, shmem_short_xor_to_all*/ success0 = success1 = success2 = success6 = 0; for (i = 0; i < N; i += 1) { src0[i] = src1[i] = src2[i] = src6[i] = me%2; } for (i = 0; i < N; i += 1) { dst0[i] = -9; dst1[i] = -9; dst2[i] = -9; dst6[i] = -9; } int expected_result = ((int)(npes/2) % 2); shmem_barrier_all(); shmem_short_xor_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync); shmem_int_xor_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1); shmem_long_xor_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync); shmem_longlong_xor_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1); if(me==0){ for (i = 0; i < N; i++) { if(dst0[i] != expected_result) success0 =1; if(dst1[i] != expected_result) success1 =1; if(dst2[i] != expected_result) success2 =1; if(dst6[i] != expected_result) success6 =1; } 
if(success0==1){ printf("Reduction operation shmem_short_xor_to_all: Failed\n"); } else{ printf("Reduction operation shmem_short_xor_to_all: Passed\n"); } if(success1==1){ printf("Reduction operation shmem_int_xor_to_all: Failed\n"); } else{ printf("Reduction operation shmem_int_xor_to_all: Passed\n"); } if(success2==1){ printf("Reduction operation shmem_long_xor_to_all: Failed\n"); } else{ printf("Reduction operation shmem_long_xor_to_all: Passed\n"); } if(success6==1){ printf("Reduction operation shmem_longlong_xor_to_all: Failed\n"); } else{ printf("Reduction operation shmem_longlong_xor_to_all: Passed\n"); } } return 0; }
int main(int argc, char *argv[]) { int i = 0, rank, size; int skip, numprocs; static double avg_time = 0.0, max_time = 0.0, min_time = 0.0; static double latency = 0.0; int64_t t_start = 0, t_stop = 0, timer=0; char *buffer=NULL; int max_msg_size = 1048576, full = 0; int t; for ( t = 0; t < _SHMEM_BCAST_SYNC_SIZE; t += 1) pSyncBcast1[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_BCAST_SYNC_SIZE; t += 1) pSyncBcast2[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_REDUCE_SYNC_SIZE; t += 1) pSyncRed1[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_REDUCE_SYNC_SIZE; t += 1) pSyncRed2[t] = _SHMEM_SYNC_VALUE; start_pes(0); rank = _my_pe(); numprocs = _num_pes(); if (process_args(argc, argv, rank, &max_msg_size, &full)) { return 0; } if(numprocs < 2) { if(rank == 0) { fprintf(stderr, "This test requires at least two processes\n"); } return -1; } print_header(rank, full); buffer = shmalloc(max_msg_size * sizeof(char)); if(NULL == buffer) { fprintf(stderr, "malloc failed.\n"); exit(1); } memset(buffer,1, max_msg_size); for(size=1; size <=max_msg_size/sizeof(uint32_t); size *= 2) { if(size > LARGE_MESSAGE_SIZE) { skip = SKIP_LARGE; iterations = iterations_large; } else { skip = SKIP; } timer=0; for(i=0; i < iterations + skip ; i++) { t_start = TIME(); if(i%2) shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast1); else shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast2); t_stop = TIME(); if(i>=skip){ timer+=t_stop-t_start; } shmem_barrier_all(); } shmem_barrier_all(); latency = (1.0 * timer) / iterations; shmem_double_min_to_all(&min_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1); shmem_double_max_to_all(&max_time, &latency, 1, 0, 0, numprocs, pWrk2, pSyncRed2); shmem_double_sum_to_all(&avg_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1); avg_time = avg_time/numprocs; print_data(rank, full, size*sizeof(uint32_t), avg_time, min_time, max_time, iterations); } shfree(buffer); return EXIT_SUCCESS; }
int main () { int quantum = -1, checktick (); int BytesPerWord; int k; ssize_t j, i; STREAM_TYPE scalar; // process local counters int count_p = 0, next_p = 0; gcounter = 0; /* --- SETUP --- determine precision and check timing --- */ printf (HLINE); printf ("STREAM version $Revision: 5.10 $\n"); printf (HLINE); BytesPerWord = sizeof (STREAM_TYPE); printf ("This system uses %d bytes per array element.\n", BytesPerWord); /* SHMEM initialize */ start_pes (0); _world_size = _num_pes (); _world_rank = _my_pe (); /* wait for user to input runtime params */ for (int j = 0; j < _SHMEM_BARRIER_SYNC_SIZE; j++) { pSync0[j] = pSync1[j] = pSync2[j] = _SHMEM_SYNC_VALUE; } if (_world_rank == 0) { printf (HLINE); #ifdef N printf ("***** WARNING: ******\n"); printf (" It appears that you set the preprocessor variable N when compiling this code.\n"); printf (" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf (" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n", (unsigned long long) STREAM_ARRAY_SIZE); printf ("***** WARNING: ******\n"); #endif printf ("Array size = %llu (elements), Offset = %d (elements)\n", (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf ("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0), BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 / 1024.0)); printf ("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.), (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024. 
/ 1024.)); printf ("Each kernel will be executed %d times.\n", NTIMES); printf (" The *best* time for each kernel (excluding the first iteration)\n"); printf (" will be used to compute the reported bandwidth.\n"); printf ("Number of SHMEM PEs requested = %i\n", _world_size); } int blocksize = 10000; assert (STREAM_ARRAY_SIZE % blocksize == 0); // do something really minor /* Get initial value for system clock. */ for (j = 0; j < STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf (HLINE); if (_world_rank == 0) { if ((quantum = checktick ()) >= 1) printf ("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf ("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } } // assign fixed iterations per PE // since we know default STREAM array size // we are hardcoding this, but if the value // changes, then this blocking factor must // also change // basically, each PE works on this block // size at a time time_start = mysecond (); /* Initialize */ next_p = shmem_int_fadd (&gcounter, 1, ROOT); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; } time_end = mysecond (); clock_time_PE = time_end - time_start; shmem_double_sum_to_all (&total_clock_time, &clock_time_PE, 1, 0, 0, _world_size, pWrk0, pSync0); if (_world_rank == 0) { printf ("Each test below will take on the order" " of %d microseconds.\n", (int) (total_clock_time * 1.0E6)); printf (" (= %d clock ticks)\n", (int) ((1.0E6 * total_clock_time) / quantum)); printf ("Increase the size of the arrays if this shows that\n"); printf ("you are not getting at least 20 clock ticks per test.\n"); printf (HLINE); printf ("WARNING -- The above is only a rough guideline.\n"); printf ("For best results, please be sure you know the\n"); printf ("precision of your system timer.\n"); printf 
(HLINE); } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ // reduction required, as each PE only fills a,b,c partially scalar = 3.0; for (k = 0; k < NTIMES; k++) { // this is required for correctness // for NTIMES > 1 which is typically // the case for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 1.0; b[i] = 2.0; c[i] = 0.0; a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (a + j, a + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (c + j, c + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_max_to_all (×[0][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { b[i] = scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (b + j, b + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[1][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i] + b[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (c + j, c + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[2][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 
0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = b[i] + scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (a + j, a + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[3][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); } shmem_barrier_all (); /* --- SUMMARY --- */ for (k = 1; k < NTIMES; k++) /* note -- skip first iteration */ { for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN (mintime[j], times[j][k]); maxtime[j] = MAX (maxtime[j], times[j][k]); } } if (_world_rank == 0) { printf ("Function Best Rate MB/s Avg time Min time Max time\n"); for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] / (double) (NTIMES - 1); printf ("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j] / mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf (HLINE); } /* --- Check Results --- */ if (_world_rank == 0) { checkSTREAMresults (); printf (HLINE); } return 0; }
int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ int Block_size; /* size of a single block */ int Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ int order; /* order of overall matrix */ int bufferCount; /* number of input buffers */ int targetBuffer; /* buffer with which to communicate */ int send_to, recv_from; /* ranks with which to communicate */ long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ long i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int error; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double **Work_in_p; /* workspace for the transpose function */ double *Work_out_p; /* workspace for the transpose function */ double epsilon = 1.e-8; /* error tolerance */ double avgtime; /* timing parameters */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk; /* work space for SHMEM collectives */ double *local_trans_time, *trans_time; /* timing parameters */ double *abserr, *abserr_tot; /* local and aggregate error */ int *send_flag, *recv_flag; /* synchronization flags */ int *arguments; /* command line arguments */ /********************************************************************* ** Initialize the SHMEM environment *********************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM matrix transpose: B = A^T\n"); } // initialize sync variables for error checks 
pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE); local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); arguments = (int *) prk_shmem_align(prk_get_alignment(),4*sizeof(int)); abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double)); abserr_tot = abserr + 1; if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time || !trans_time || !arguments || !abserr) { printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID); error = 1; goto ENDOFTESTS; } for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { if (argc != 4 && argc != 5){ printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); arguments[1]=order; if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } bufferCount = atoi(*++argv); arguments[2]=bufferCount; if (Num_procs > 1) { if ((bufferCount < 1) || (bufferCount >= Num_procs)) { 
printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount); error = 1; goto ENDOFTESTS; } } if (argc == 5) Tile_order = atoi(*++argv); arguments[3]=Tile_order; ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); printf("Number of buffers = %d\n", bufferCount); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); } shmem_barrier_all(); /* Broadcast input data to all ranks */ shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; order=arguments[1]; bufferCount=arguments[2]; Tile_order=arguments[3]; shmem_barrier_all(); prk_shmem_free(arguments); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. 
*********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_in_p = (double**)prk_malloc(bufferCount*sizeof(double)); Work_out_p = (double *) prk_malloc(Block_size*sizeof(double)); recv_flag = (int*) prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int)); if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){ printf(" Error allocating space for work or flags on node %d\n",my_ID); error = 1; } if (bufferCount < (Num_procs - 1)) { send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int)); if (send_flag == NULL) { printf("Error allocating space for flags on node %d\n", my_ID); error = 1; } } bail_out(error); for(i=0;i<bufferCount;i++) { Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double)); if (Work_in_p[i] == NULL) { printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); } if (bufferCount < (Num_procs - 1)) { for(i=0;i<(Num_procs-1);i++) send_flag[i]=0; } for(i=0;i<bufferCount;i++) recv_flag[i]=0; } /* Fill the original column matrices */ istart = 0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = 
0.0; } shmem_barrier_all(); if (bufferCount < (Num_procs - 1)) { if (Num_procs > 1) { for ( i = 0; i < bufferCount; i++) { recv_from = (my_ID + i + 1)%Num_procs; shmem_int_inc(&send_flag[i], recv_from); } } } shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_trans_time[0] = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } for (phase=1; phase<Num_procs; phase++){ recv_from = (my_ID + phase )%Num_procs; send_to = (my_ID - phase + Num_procs)%Num_procs; targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount; istart = send_to*Block_order; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); A(it,jt) += 1.0; } } if (bufferCount < (Num_procs - 1)) shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1); shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to); shmem_fence(); shmem_int_inc(&recv_flag[targetBuffer], send_to); i = (iter * (Num_procs - 1) + phase) / bufferCount; if ((iter * (Num_procs - 1) + phase) % bufferCount) i++; shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=0; j<Block_order; j++) for (i=0; i<Block_order; i++) B(i,j) += Work_in(targetBuffer, i,j); if 
(bufferCount < (Num_procs - 1)) { if ((phase + bufferCount) < Num_procs) recv_from = (my_ID + phase + bufferCount) % Num_procs; else recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs; shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from); } } /* end of phase loop */ } /* end of iterations */ local_trans_time[0] = wtime() - local_trans_time[0]; shmem_barrier_all(); shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce); abserr[0] = 0.0; istart = 0; double addit = ((double)(iterations+1) * (double) (iterations))/2.0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit)); } shmem_barrier_all(); shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce); if (my_ID == root) { if (abserr_tot[0] <= epsilon) { printf("Solution validates\n"); avgtime = trans_time[0]/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr[0]); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon); error = 1; } } bail_out(error); if (Num_procs>1) { if (bufferCount < (Num_procs - 1)) prk_shmem_free(send_flag); prk_shmem_free(recv_flag); prk_free(Work_out_p); for(i=0;i<bufferCount;i++) prk_shmem_free(Work_in_p[i]); prk_free(Work_in_p); } prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk); prk_shmem_finalize(); exit(EXIT_SUCCESS); } /* end of main */