int main(int argc, char ** argv) { long m, n; /* grid dimensions */ int i, j, iter; /* dummies */ int iterations; /* number of times to run the pipeline algorithm */ double pipeline_time, /* timing parameters */ avgtime, max_time; double epsilon = 1.e-8; /* error tolerance */ double corner_val; /* verification value at top right corner of grid */ double *vector;/* array holding grid values */ long total_length; /* total required length to store grid values */ /******************************************************************************* ** process and test input parameters ********************************************************************************/ if(MYTHREAD == THREADS-1){ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC pipeline execution on 2D grid\n"); } if (argc != 4){ if(MYTHREAD == THREADS-1){ printf("Usage: %s <# iterations> <first array dimension> ", *argv); printf("<second array dimension>\n"); } upc_global_exit(EXIT_FAILURE); } iterations = atoi(*++argv); if (iterations < 1){ if(MYTHREAD == THREADS-1) printf("ERROR: iterations must be >= 1 : %d \n",iterations); upc_global_exit(EXIT_FAILURE); } m = atol(*++argv); n = atol(*++argv); if (m < 1 || n < 1){ if(MYTHREAD == THREADS-1) printf("ERROR: grid dimensions must be positive: %d, %d \n", m, n); upc_global_exit(EXIT_FAILURE); } if(MYTHREAD == THREADS-1){ printf("Number of threads = %d\n", THREADS); printf("Grid sizes = %ld, %ld\n", m, n); printf("Number of iterations = %d\n", iterations); #if USE_BUPC_EXT printf("Using Berkeley UPC extensions\n"); #endif } /********************************************************************* ** Allocate memory for input and output matrices *********************************************************************/ #if USE_BUPC_EXT bupc_sem_t *myflag = bupc_sem_alloc(BUPC_SEM_INTEGER | BUPC_SEM_MPRODUCER); upc_barrier; allflags[MYTHREAD] = myflag; upc_barrier; bupc_sem_t *mypeer = allflags[(MYTHREAD+1) % THREADS]; #endif long segment_size = m / 
THREADS; int leftover = m % THREADS; int myoffsetx, sizex; if(MYTHREAD < leftover){ myoffsetx = (segment_size + 1) * MYTHREAD; sizex = segment_size + 1; }else{ myoffsetx = (segment_size + 1) * leftover + segment_size * (MYTHREAD - leftover); sizex = segment_size; } #if USE_BUPC_EXT if(MYTHREAD != 0){ myoffsetx -= 1; sizex += 1; } #endif int sizey = n; int myoffsety = 0; upc_barrier; debug("Allocating arrays (%d, %d), offset (%d, %d)", sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); if(MYTHREAD == 0) current_max_line[MYTHREAD] = sizey; else current_max_line[MYTHREAD] = 0; upc_barrier; /********************************************************************* ** Initialize the matrices *********************************************************************/ /* clear the array */ for (j=0; j<n; j++) for (i=myoffsetx; i<myoffsetx + sizex; i++) ARRAY(i, j) = 0.0; /* set boundary values (bottom and left side of grid */ if(MYTHREAD == 0) for (j=0; j<n; j++) ARRAY(0, j) = (double) j; for (i=myoffsetx; i<myoffsetx + sizex; i++) ARRAY(i, 0) = (double) i; upc_barrier; for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) pipeline_time = wtime(); if(MYTHREAD == 0) debug("start it %d, %f", iter, ARRAY(0, 0)); if(MYTHREAD != THREADS - 1) // Send the element in line 0 in_arrays[MYTHREAD + 1][0][myoffsetx + sizex -1] = ARRAY(myoffsetx + sizex - 1, 0); for (j=1; j<n; j++) { #if USE_BUPC_EXT if(MYTHREAD > 0){ bupc_sem_wait(myflag); } for (i=myoffsetx+1; i<myoffsetx + sizex; i++) ARRAY(i, j) = ARRAY(i-1, j) + ARRAY(i, j-1) - ARRAY(i-1, j-1); if(MYTHREAD != THREADS - 1){ in_arrays[MYTHREAD + 1][j][myoffsetx + sizex -1] = ARRAY(myoffsetx + sizex - 1, j); bupc_sem_post(mypeer); } #else while(j > current_max_line[MYTHREAD]) // Normally 
not necessary: bupc_poll(); ; if(MYTHREAD > 0) ARRAY(myoffsetx, j) = in_arrays[MYTHREAD - 1][j][myoffsetx-1] + ARRAY(myoffsetx, j-1) - in_arrays[MYTHREAD-1][j-1][myoffsetx-1]; for (i=myoffsetx+1; i<myoffsetx + sizex; i++) ARRAY(i, j) = ARRAY(i-1, j) + ARRAY(i, j-1) - ARRAY(i-1, j-1); if(MYTHREAD < THREADS - 1) current_max_line[MYTHREAD+1] = j; #endif } /* copy top right corner value to bottom left corner to create dependency; we need a barrier to make sure the latest value is used. This also guarantees that the flags for the next iteration (if any) are not getting clobbered */ if(MYTHREAD == 0) current_max_line[MYTHREAD] = sizey; else current_max_line[MYTHREAD] = 0; if(MYTHREAD == THREADS - 1){ in_arrays[0][0][0] = -ARRAY(m-1, n-1); } upc_barrier; } pipeline_time = wtime() - pipeline_time; times[MYTHREAD] = pipeline_time; upc_barrier; // Compute max_time if(MYTHREAD == THREADS - 1){ max_time = times[MYTHREAD]; for(i=1; i<THREADS; i++){ if(max_time < times[i]) max_time = times[i]; } } /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness, using top right value; */ if( MYTHREAD == THREADS - 1){ corner_val = (double)((iterations+1)*(n+m-2)); if (fabs(ARRAY(m-1,n-1)-corner_val)/corner_val > epsilon) { printf("ERROR: checksum %lf does not match verification value %lf\n", ARRAY(m-1, n-1), corner_val); exit(EXIT_FAILURE); } #if VERBOSE printf("checksum %lf verification value %lf\n", ARRAY(m-1, n-1), corner_val); printf("Solution validates; verification value = %lf\n", corner_val); #else printf("Solution validates\n"); #endif avgtime = max_time/iterations; printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * 2 * ((double)(m-1)*(double)(n-1))/avgtime, avgtime); exit(EXIT_SUCCESS); } }
int main(int argc, char ** argv) { int N; int tile_size=32; /* default tile size for tiling of local transpose */ int num_iterations;/* number of times to do the transpose */ int tiling; /* boolean: true if tiling is used */ double total_bytes; /* combined size of matrices */ double start_time, /* timing parameters */ end_time, avgtime; /********************************************************************* ** read and test input parameters *********************************************************************/ if(argc != 3 && argc != 4){ if(MYTHREAD == 0) printf("Usage: %s <# iterations> <matrix order> [tile size]\n", *argv); upc_global_exit(EXIT_FAILURE); } num_iterations = atoi(*++argv); if(num_iterations < 1){ if(MYTHREAD == 0) printf("ERROR: iterations must be >= 1 : %d \n", num_iterations); upc_global_exit(EXIT_FAILURE); } N = atoi(*++argv); if(N < 0){ if(MYTHREAD == 0) printf("ERROR: Matrix Order must be greater than 0 : %d \n", N); upc_global_exit(EXIT_FAILURE); } if (argc == 4) tile_size = atoi(*++argv); /*a non-positive tile size means no tiling of the local transpose */ tiling = (tile_size > 0) && (tile_size < N); if(!tiling) tile_size = N; int sizex = N / THREADS; if(N % THREADS != 0) { if(MYTHREAD == 0) printf("N %% THREADS != 0\n"); upc_global_exit(EXIT_FAILURE); } int sizey = N; if(MYTHREAD == 0) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC matrix transpose: B = A^T\n"); printf("Number of threads = %d\n", THREADS); printf("Matrix order = %d\n", N); printf("Number of iterations = %d\n", num_iterations); if (tiling) printf("Tile size = %d\n", tile_size); else printf("Untiled\n"); } /********************************************************************* ** Allocate memory for input and output matrices *********************************************************************/ total_bytes = 2.0 * sizeof(double) * N * N; int myoffsetx = MYTHREAD * sizex; int myoffsety = 0; upc_barrier; debug("Allocating arrays (%d, %d), offset (%d, 
%d)", sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs buf_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; out_arrays[MYTHREAD] = out_array; buf_arrays[MYTHREAD] = buf_array; double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); double **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety); double **buf_array_private = shared_2d_array_to_private(buf_array, sizex, sizey, myoffsetx, myoffsety); upc_barrier; /********************************************************************* ** Initialize the matrices *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ in_array_private[y][x] = (double) (x+N*y); out_array[y][x] = -1.0; } } upc_barrier; for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] !=(double) (x+N*y)) die("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], (x+N*y)); if(out_array_private[y][x] != -1.0) die("out_array_private error"); } } /********************************************************************* ** Transpose *********************************************************************/ int transfer_size = sizex * sizex * sizeof(double); if(MYTHREAD == 0) debug("transfer size = %d", transfer_size); for(int iter=0; iter<=num_iterations; iter++){ /* start timer after a warmup iteration */ if(iter == 1){ upc_barrier; start_time = wtime(); } for(int i=0; i<THREADS; i++){ int local_blk_id = (MYTHREAD + i) % THREADS; int remote_blk_id = MYTHREAD; int remote_thread = local_blk_id; upc_memget(&buf_array_private[local_blk_id * sizex][myoffsetx], 
&in_arrays[remote_thread][remote_blk_id * sizex][remote_thread * sizex], transfer_size); #define OUT_ARRAY(x,y) out_array_private[local_blk_id * sizex + x][myoffsetx + y] #define BUF_ARRAY(x,y) buf_array_private[local_blk_id * sizex + x][myoffsetx + y] if(!tiling){ for(int x=0; x<sizex; x++){ for(int y=0; y<sizex; y++){ OUT_ARRAY(x,y) = BUF_ARRAY(y,x); } } } else{ for(int x=0; x<sizex; x+=tile_size){ for(int y=0; y<sizex; y+=tile_size){ for(int bx=x; bx<MIN(sizex, x+tile_size); bx++){ for(int by=y; by<MIN(sizex, y+tile_size); by++){ OUT_ARRAY(bx,by) = BUF_ARRAY(by,bx); } } } } } } upc_barrier; } upc_barrier; end_time = wtime(); /********************************************************************* ** Analyze and output results. *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] != (double)(x+ N*y)) die("Error in input: x=%d y=%d", x, y); if(out_array_private[y][x] != (double)(y + N*x)) die("x=%d y=%d in_array=%f != %f %d %d", x, y, out_array[y][x], (double)(y + N*x), (int)(out_array[y][x]) % N, (int)(out_array[y][x]) / N); } } if(MYTHREAD == 0){ printf("Solution validates\n"); double transfer_size = 2 * N * N * sizeof(double); avgtime = (end_time - start_time) / num_iterations; double rate = transfer_size / avgtime * 1.0E-06; printf("Rate (MB/s): %lf Avg time (s): %lf\n",rate, avgtime); } }
/*
 * UPC stencil kernel driver.
 *
 * Usage: <prog> <# iterations> <array dimension> [x_tiles]
 *
 * The n x n grid is decomposed into Num_procsx x Num_procsy tiles, one per
 * UPC thread.  Each tile is allocated with a border of RADIUS cells on every
 * side.  In every iteration a thread copies the border cells it needs from
 * its neighbours' input arrays, accumulates the weighted stencil into its
 * output array and then adds 1.0 to its own input cells.  iterations+1
 * iterations are executed; iteration 0 is an untimed warm-up.  Thread 0
 * sums the per-thread norms, verifies the result and prints the report.
 *
 * NOTE(review): IN(), OUT() and WEIGHT() are macros defined outside this
 * block; they presumably expand to in_array_private, out_array_private and
 * weight -- confirm before renaming any local.
 */
int main(int argc, char ** argv) {

  int    n;               /* linear grid dimension */
  int    i, j, ii, jj, it, jt, iter;  /* dummies */
  double norm,            /* L1 norm of solution */
         reference_norm;
  double f_active_points; /* interior of grid with respect to stencil */
  DTYPE  flops;           /* floating point ops per iteration */
  int    iterations;      /* number of times to run the algorithm */
  double stencil_time,    /* timing parameters */
         avgtime, max_time;
  int    stencil_size;    /* number of points in stencil */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */
  int    istart;    /* bounds of grid tile assigned to calling rank */
  int    jstart;    /* bounds of grid tile assigned to calling rank */
  int    Num_procsx, Num_procsy; /* number of tiles in the x and y direction */

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/

  if(MYTHREAD == 0){
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("UPC stencil execution on 2D grid\n");
    fflush(stdout);
  }

  /* NOTE(review): in the checks below only thread 0 calls bail_out(); the
     other threads fall through and keep dereferencing argv.  This is only
     safe if bail_out() terminates every thread -- confirm against its
     definition. */
  if (argc != 4 && argc != 3)
    if(MYTHREAD == 0)
      bail_out("Usage: %s <# iterations> <array dimension> [x_tiles]\n", *argv);

  iterations = atoi(*++argv);
  if (iterations < 1)
    if(MYTHREAD == 0)
      bail_out("iterations must be >= 1 : %d", iterations);

  n = atoi(*++argv);

  if (n < 1)
    if(MYTHREAD == 0)
      bail_out("grid dimension must be positive: %d", n);

  /* optional third argument: number of tiles in x; 0 selects it automatically */
  if (argc == 4)
    Num_procsx = atoi(*++argv);
  else
    Num_procsx = 0;

  if(Num_procsx < 0)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be positive (got: %d)", Num_procsx);

  /* NOTE(review): the test accepts Num_procsx == THREADS although the
     message says "< THREADS". */
  if(Num_procsx > THREADS)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be < THREADS (got: %d)", Num_procsx);

  /* Num_procsx=0 refers to automated calculation of division on each coordinates like MPI code */
  if(Num_procsx == 0){
    /* search downwards from about sqrt(THREADS) for a divisor of THREADS;
       the loop always stops at 1 at the latest */
    for (Num_procsx=(int) (sqrt(THREADS+1)); Num_procsx>0; Num_procsx--) {
      if (!(THREADS%Num_procsx)) {
        Num_procsy = THREADS/Num_procsx;
        break;
      }
    }
  }
  else {
    Num_procsy = THREADS / Num_procsx;
  }

  if(RADIUS < 1)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d should be positive", RADIUS);

  if(2*RADIUS +1 > n)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d exceeds grid size %d", RADIUS, n);

  /* a user-supplied Num_procsx that does not divide THREADS ends up here */
  if(Num_procsx * Num_procsy != THREADS){
    bail_out("Num_procsx * Num_procsy != THREADS");
  }

  /* compute amount of space required for input and solution arrays */

  /* position of this thread's tile in the tile grid */
  int my_IDx = MYTHREAD % Num_procsx;
  int my_IDy = MYTHREAD / Num_procsx;

  /* x extent: the first `leftover` tile columns get one extra grid column */
  int blockx = n / Num_procsx;
  int leftover = n % Num_procsx;
  if (my_IDx < leftover) {
    istart = (blockx + 1) * my_IDx;
    blockx += 1;
  }
  else {
    istart = (blockx+1) * leftover + blockx * (my_IDx-leftover);
  }

  if (blockx == 0)
    bail_out("No work to do on x-direction!");

  /* y extent: same scheme as for x */
  int blocky = n / Num_procsy;
  leftover = n % Num_procsy;
  if (my_IDy < leftover) {
    jstart = (blocky+1) * my_IDy;
    blocky += 1;
  }
  else {
    jstart = (blocky+1) * leftover + blocky * (my_IDy-leftover);
  }

  if (blocky == 0)
    bail_out("No work to do on y-direction!");

  if(blockx < RADIUS || blocky < RADIUS) {
    bail_out("blockx < RADIUS || blocky < RADIUS");
  }

  /* the allocated tile starts RADIUS cells before the owned region ... */
  int myoffsetx = istart - RADIUS;
  int myoffsety = jstart - RADIUS;
  thread_offsetx[MYTHREAD] = myoffsetx;
  thread_offsety[MYTHREAD] = myoffsety;

  /* ... and is 2*RADIUS cells larger in each direction */
  int sizex = blockx + 2*RADIUS;
  int sizey = blocky + 2*RADIUS;
  thread_sizex[MYTHREAD] = sizex;
  thread_sizey[MYTHREAD] = sizey;

  upc_barrier;

  local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);

  /* publish the tiles so that other threads can reach them */
  in_arrays[MYTHREAD] = in_array;
  out_arrays[MYTHREAD] = out_array;

  DTYPE **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety);
  DTYPE **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety);

  /* all tiles must be published before any thread looks them up below */
  upc_barrier;

  /* per-thread tables with one entry per UPC thread, filled by
     partially_privatize() from the published tiles */
  private_in_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_in_arrays == NULL)
    bail_out("Cannot allocate private_in_arrays");

  private_out_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_out_arrays == NULL)
    bail_out("Cannot allocate private_out_arrays");

  for(int thread=0; thread<THREADS; thread++){
    private_in_arrays[thread] = partially_privatize(in_arrays[thread], thread);
    private_out_arrays[thread] = partially_privatize(out_arrays[thread], thread);
  }

  /* initialize the input and output arrays (border cells included) */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      in_array_private[y][x] = COEFX*x + COEFY*y;
      out_array[y][x] = 0.;
    }
  }
  upc_barrier;

  /* read the input back through the private view as a sanity check */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      if(in_array_private[y][x] != COEFX*x + COEFY*y)
        bail_out("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], COEFX*x + COEFY*y);
    }
  }

  /* fill the stencil weights to reflect a discrete divergence operator */
  for (jj=-RADIUS; jj<=RADIUS; jj++)
    for (ii=-RADIUS; ii<=RADIUS; ii++)
      WEIGHT(ii, jj) = (DTYPE)0.0;

  /* only the two axes through the centre receive non-zero weights */
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  if(MYTHREAD == 0){
    printf("Number of threads = %d\n", THREADS);
    printf("Grid size = %d\n", n);
    printf("Radius of stencil = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
#if DOUBLE
    printf("Data type = double precision\n");
#else
    printf("Data type = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations = %d\n", iterations);
  }

  upc_barrier;

  /* [startx,endx) x [starty,endy): cells this thread applies the stencil to */
  int startx = myoffsetx + RADIUS;
  int endx = myoffsetx + sizex - RADIUS;

  int starty = myoffsety + RADIUS;
  int endy = myoffsety + sizey - RADIUS;

  /* tiles on the edge of the tile grid shrink their range by RADIUS on the
     side that has no neighbour */
  if(my_IDx == 0)
    startx += RADIUS;

  if(my_IDx == Num_procsx - 1)
    endx -= RADIUS;

  if(my_IDy == 0)
    starty += RADIUS;

  if(my_IDy == Num_procsy - 1)
    endy -= RADIUS;

  upc_barrier;

  for (iter = 0; iter<=iterations; iter++){
    /* start timer after a warmup iteration */
    if (iter == 1) {
      upc_barrier;
      stencil_time = wtime();
    }

    /* Get ghost zones */
    /* NORTH: the RADIUS rows just below starty, one upc_memget per row */
    if(my_IDy != 0){
      int peer = (my_IDy - 1) * Num_procsx + my_IDx;
      for (int y=starty - RADIUS; y<starty; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* SOUTH: the RADIUS rows starting at endy, one upc_memget per row */
    if(my_IDy != Num_procsy - 1){
      int peer = (my_IDy + 1) * Num_procsx + my_IDx;
      for (int y=endy; y<endy + RADIUS; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* LEFT: the RADIUS columns just before startx, copied cell by cell */
    if(my_IDx != 0){
      int peer = my_IDy * Num_procsx + my_IDx - 1;
      for (int y=starty; y<endy; y++) {
        for (int x=startx - RADIUS; x<startx; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }
    /* RIGHT: the RADIUS columns starting at endx, copied cell by cell */
    if(my_IDx != Num_procsx - 1){
      int peer = my_IDy * Num_procsx + my_IDx + 1;
      for (int y=starty; y<endy; y++) {
        for (int x=endx; x<endx + RADIUS; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }

    /* Apply the stencil operator */
    for (j=starty; j<endy; j++) {
      for (i=startx; i<endx; i++) {
#if LOOPGEN
        #include "loop_body_star.incl"
#else
        /* vertical arm (centre included), then the two horizontal arms */
        for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
        for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
#endif
      }
    }

    upc_barrier; /* <- Necessary barrier: some slow threads could use future data */

    /* add constant to solution to force refresh of neighbor data, if any */
    for(int y=myoffsety + RADIUS; y<myoffsety + sizey - RADIUS; y++)
      for(int x=myoffsetx + RADIUS; x<myoffsetx + sizex - RADIUS; x++)
        in_array_private[y][x] += 1.0;

    upc_barrier; /* <- Necessary barrier: some threads could start on old data */
  } /* end of iterations */

  stencil_time = wtime() - stencil_time;
  times[MYTHREAD] = stencil_time;

  upc_barrier;

  // Compute max_time (thread 0 only: seed with its own time, then scan the rest)
  if(MYTHREAD == 0){
    max_time = times[MYTHREAD];
    for(i=1; i<THREADS; i++){
      if(max_time < times[i])
        max_time = times[i];
    }
  }

  norm = (double) 0.0;
  f_active_points = (double)(n-2*RADIUS) * (double)(n-2*RADIUS);

  /* compute L1 norm in parallel */
  for (int y=starty; y<endy; y++) {
    for (int x=startx; x<endx; x++) {
      norm += (double)ABS(out_array[y][x]);
    }
  }

  /* each thread contributes its partial sum, already divided by the global
     number of active points */
  norm /= f_active_points;
  norms[MYTHREAD] = norm;

  upc_barrier;

  if(MYTHREAD == 0){
    /* add up the per-thread contributions */
    norm = 0.;
    for(int i=0; i<THREADS; i++) norm += norms[i];

    /*******************************************************************************
    ** Analyze and output results.
    ********************************************************************************/

    /* verify correctness */
    reference_norm = (double) (iterations+1) * (COEFX + COEFY);

    if (ABS(norm - reference_norm) > EPSILON)
      bail_out("L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm);
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm);
#endif
    }

    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = max_time/iterations;
    printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime);

    exit(EXIT_SUCCESS);
  }
}