int main(int argc, char ** argv) { int N; int tile_size=32; /* default tile size for tiling of local transpose */ int num_iterations;/* number of times to do the transpose */ int tiling; /* boolean: true if tiling is used */ double total_bytes; /* combined size of matrices */ double start_time, /* timing parameters */ end_time, avgtime; /********************************************************************* ** read and test input parameters *********************************************************************/ if(argc != 3 && argc != 4){ if(MYTHREAD == 0) printf("Usage: %s <# iterations> <matrix order> [tile size]\n", *argv); upc_global_exit(EXIT_FAILURE); } num_iterations = atoi(*++argv); if(num_iterations < 1){ if(MYTHREAD == 0) printf("ERROR: iterations must be >= 1 : %d \n", num_iterations); upc_global_exit(EXIT_FAILURE); } N = atoi(*++argv); if(N < 0){ if(MYTHREAD == 0) printf("ERROR: Matrix Order must be greater than 0 : %d \n", N); upc_global_exit(EXIT_FAILURE); } if (argc == 4) tile_size = atoi(*++argv); /*a non-positive tile size means no tiling of the local transpose */ tiling = (tile_size > 0) && (tile_size < N); if(!tiling) tile_size = N; int sizex = N / THREADS; if(N % THREADS != 0) { if(MYTHREAD == 0) printf("N %% THREADS != 0\n"); upc_global_exit(EXIT_FAILURE); } int sizey = N; if(MYTHREAD == 0) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC matrix transpose: B = A^T\n"); printf("Number of threads = %d\n", THREADS); printf("Matrix order = %d\n", N); printf("Number of iterations = %d\n", num_iterations); if (tiling) printf("Tile size = %d\n", tile_size); else printf("Untiled\n"); } /********************************************************************* ** Allocate memory for input and output matrices *********************************************************************/ total_bytes = 2.0 * sizeof(double) * N * N; int myoffsetx = MYTHREAD * sizex; int myoffsety = 0; upc_barrier; debug("Allocating arrays (%d, %d), offset (%d, 
%d)", sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs buf_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; out_arrays[MYTHREAD] = out_array; buf_arrays[MYTHREAD] = buf_array; double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); double **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety); double **buf_array_private = shared_2d_array_to_private(buf_array, sizex, sizey, myoffsetx, myoffsety); upc_barrier; /********************************************************************* ** Initialize the matrices *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ in_array_private[y][x] = (double) (x+N*y); out_array[y][x] = -1.0; } } upc_barrier; for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] !=(double) (x+N*y)) die("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], (x+N*y)); if(out_array_private[y][x] != -1.0) die("out_array_private error"); } } /********************************************************************* ** Transpose *********************************************************************/ int transfer_size = sizex * sizex * sizeof(double); if(MYTHREAD == 0) debug("transfer size = %d", transfer_size); for(int iter=0; iter<=num_iterations; iter++){ /* start timer after a warmup iteration */ if(iter == 1){ upc_barrier; start_time = wtime(); } for(int i=0; i<THREADS; i++){ int local_blk_id = (MYTHREAD + i) % THREADS; int remote_blk_id = MYTHREAD; int remote_thread = local_blk_id; upc_memget(&buf_array_private[local_blk_id * sizex][myoffsetx], 
&in_arrays[remote_thread][remote_blk_id * sizex][remote_thread * sizex], transfer_size); #define OUT_ARRAY(x,y) out_array_private[local_blk_id * sizex + x][myoffsetx + y] #define BUF_ARRAY(x,y) buf_array_private[local_blk_id * sizex + x][myoffsetx + y] if(!tiling){ for(int x=0; x<sizex; x++){ for(int y=0; y<sizex; y++){ OUT_ARRAY(x,y) = BUF_ARRAY(y,x); } } } else{ for(int x=0; x<sizex; x+=tile_size){ for(int y=0; y<sizex; y+=tile_size){ for(int bx=x; bx<MIN(sizex, x+tile_size); bx++){ for(int by=y; by<MIN(sizex, y+tile_size); by++){ OUT_ARRAY(bx,by) = BUF_ARRAY(by,bx); } } } } } } upc_barrier; } upc_barrier; end_time = wtime(); /********************************************************************* ** Analyze and output results. *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] != (double)(x+ N*y)) die("Error in input: x=%d y=%d", x, y); if(out_array_private[y][x] != (double)(y + N*x)) die("x=%d y=%d in_array=%f != %f %d %d", x, y, out_array[y][x], (double)(y + N*x), (int)(out_array[y][x]) % N, (int)(out_array[y][x]) / N); } } if(MYTHREAD == 0){ printf("Solution validates\n"); double transfer_size = 2 * N * N * sizeof(double); avgtime = (end_time - start_time) / num_iterations; double rate = transfer_size / avgtime * 1.0E-06; printf("Rate (MB/s): %lf Avg time (s): %lf\n",rate, avgtime); } }
int main(int argc, char **argv) { int i, j, ntimes, err, flag, strl; double stim, read_tim, write_tim; double min_read_tim, min_write_tim, read_bw, write_bw; upcio_file_t *fh; upc_flag_t sync = 0; char *filename; shared int *buf; shared char *gfilename; shared int *len; ntimes=1; /* process 0 takes the file name as a command-line argument and broadcasts it to other processes */ len = (shared int *) upc_all_alloc(1, sizeof(int)); upc_barrier; if (!MYTHREAD) { i = 1; while ((i < argc) && strcmp("-fname", *argv)) { i++; argv++; } if (i >= argc) { fprintf(stderr, "\n*# Usage: perf -fname filename\n\n"); upc_global_exit(-1); } argv++; strl = strlen(*argv); upc_memput(len, &strl, sizeof(int)); } upc_barrier; upc_memget(&strl, len, sizeof(int)); upc_barrier; gfilename = (shared char *) upc_all_alloc(1,sizeof(char)*(strl)); if (!MYTHREAD) { upc_memput(gfilename, *argv, strl); fprintf(stderr, "Access size per process = %d bytes, ntimes = %d\n", SIZE, ntimes); } upc_barrier; filename = (char *) malloc(sizeof(char)*(strl+1)); upc_memget(filename, gfilename, strl); filename[strl] = '\0'; /* allocate the shared buf on each thread this is for shared w/r with INDIVIDUAL FP */ buf = (shared int *) upc_global_alloc(1,SIZE); upc_barrier; min_read_tim=0.0; min_write_tim=0.0; upc_barrier; fh = uopen( filename, 0); for (j=0; j<ntimes; j++) { upc_barrier; stim = UPC_Wtime(); upc_all_fseek(fh, MYTHREAD*SIZE + SIZE*THREADS*j, UPC_SEEK_SET); err = upc_all_fwrite_shared(fh, buf, BLOCK, SIZE, sizeof(unsigned char), sync); if( err == -1 ) { fprintf(stderr, "TH%2d: Error in write\n", MYTHREAD); break; } write_tim = UPC_Wtime() - stim; min_write_tim += write_tim; } upc_all_fclose(fh); upc_all_fsync(fh); min_write_tim /= ntimes; upc_barrier; fh = uopen( filename, 1); for (j=0; j<ntimes; j++) { upc_barrier; stim = UPC_Wtime(); upc_all_fseek(fh, MYTHREAD*SIZE + SIZE*THREADS*j, UPC_SEEK_SET); err = upc_all_fread_shared(fh, buf, BLOCK, SIZE, sizeof(unsigned char), sync); if( err == -1 ) { 
fprintf(stderr, "TH%2d: Error in read\n", MYTHREAD); break; } read_tim = UPC_Wtime() - stim; min_read_tim += read_tim; } upc_all_fclose(fh); min_read_tim /= ntimes; upc_barrier; if (!MYTHREAD) { read_bw = (SIZE*THREADS*ntimes)/(min_read_tim*1024.0*1024.0); write_bw = (SIZE*THREADS*ntimes)/(min_write_tim*1024.0*1024.0); printf("TH: %d - Write bandwidth with a prior file sync = %f Mbytes/sec\n", MYTHREAD, write_bw); printf("TH: %d - Read bandwidth with a prior file sync = %f Mbytes/sec\n", MYTHREAD, read_bw); } upc_barrier; /* only thread 0 clean up the single shared buf */ if(!MYTHREAD) { upc_free(buf); upc_free(gfilename); upc_free(len); } free(filename); return 0; }
int main(int argc, char ** argv) { int n; /* linear grid dimension */ int i, j, ii, jj, it, jt, iter; /* dummies */ double norm, /* L1 norm of solution */ reference_norm; double f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double stencil_time, /* timing parameters */ avgtime, max_time; int stencil_size; /* number of points in stencil */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int istart; /* bounds of grid tile assigned to calling rank */ int jstart; /* bounds of grid tile assigned to calling rank */ int Num_procsx, Num_procsy; /******************************************************************************* ** process and test input parameters ********************************************************************************/ if(MYTHREAD == 0){ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC stencil execution on 2D grid\n"); fflush(stdout); } if (argc != 4 && argc != 3) if(MYTHREAD == 0) bail_out("Usage: %s <# iterations> <array dimension> [x_tiles]\n", *argv); iterations = atoi(*++argv); if (iterations < 1) if(MYTHREAD == 0) bail_out("iterations must be >= 1 : %d", iterations); n = atoi(*++argv); if (n < 1) if(MYTHREAD == 0) bail_out("grid dimension must be positive: %d", n); if (argc == 4) Num_procsx = atoi(*++argv); else Num_procsx = 0; if(Num_procsx < 0) if(MYTHREAD == 0) bail_out("Number of tiles in the x-direction should be positive (got: %d)", Num_procsx); if(Num_procsx > THREADS) if(MYTHREAD == 0) bail_out("Number of tiles in the x-direction should be < THREADS (got: %d)", Num_procsx); /* Num_procsx=0 refers to automated calculation of division on each coordinates like MPI code */ if(Num_procsx == 0){ for (Num_procsx=(int) (sqrt(THREADS+1)); Num_procsx>0; Num_procsx--) { if (!(THREADS%Num_procsx)) { Num_procsy = THREADS/Num_procsx; break; } } } else { Num_procsy = 
THREADS / Num_procsx; } if(RADIUS < 1) if(MYTHREAD == 0) bail_out("Stencil radius %d should be positive", RADIUS); if(2*RADIUS +1 > n) if(MYTHREAD == 0) bail_out("Stencil radius %d exceeds grid size %d", RADIUS, n); if(Num_procsx * Num_procsy != THREADS){ bail_out("Num_procsx * Num_procsy != THREADS"); } /* compute amount of space required for input and solution arrays */ int my_IDx = MYTHREAD % Num_procsx; int my_IDy = MYTHREAD / Num_procsx; int blockx = n / Num_procsx; int leftover = n % Num_procsx; if (my_IDx < leftover) { istart = (blockx + 1) * my_IDx; blockx += 1; } else { istart = (blockx+1) * leftover + blockx * (my_IDx-leftover); } if (blockx == 0) bail_out("No work to do on x-direction!"); int blocky = n / Num_procsy; leftover = n % Num_procsy; if (my_IDy < leftover) { jstart = (blocky+1) * my_IDy; blocky += 1; } else { jstart = (blocky+1) * leftover + blocky * (my_IDy-leftover); } if (blocky == 0) bail_out("No work to do on y-direction!"); if(blockx < RADIUS || blocky < RADIUS) { bail_out("blockx < RADIUS || blocky < RADIUS"); } int myoffsetx = istart - RADIUS; int myoffsety = jstart - RADIUS; thread_offsetx[MYTHREAD] = myoffsetx; thread_offsety[MYTHREAD] = myoffsety; int sizex = blockx + 2*RADIUS; int sizey = blocky + 2*RADIUS; thread_sizex[MYTHREAD] = sizex; thread_sizey[MYTHREAD] = sizey; upc_barrier; local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; out_arrays[MYTHREAD] = out_array; DTYPE **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); DTYPE **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety); upc_barrier; private_in_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS); if(private_in_arrays == NULL) bail_out("Cannot allocate private_in_arrays"); private_out_arrays = 
prk_malloc(sizeof(private_shared_block_ptrs) * THREADS); if(private_out_arrays == NULL) bail_out("Cannot allocate private_out_arrays"); for(int thread=0; thread<THREADS; thread++){ private_in_arrays[thread] = partially_privatize(in_arrays[thread], thread); private_out_arrays[thread] = partially_privatize(out_arrays[thread], thread); } /* intialize the input and output arrays */ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ in_array_private[y][x] = COEFX*x + COEFY*y; out_array[y][x] = 0.; } } upc_barrier; for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] != COEFX*x + COEFY*y) bail_out("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], COEFX*x + COEFY*y); } } /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii, jj) = (DTYPE)0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } if(MYTHREAD == 0){ printf("Number of threads = %d\n", THREADS); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } upc_barrier; int startx = myoffsetx + RADIUS; int endx = myoffsetx + sizex - RADIUS; int starty = myoffsety + RADIUS; int endy = myoffsety + sizey - RADIUS; if(my_IDx == 0) startx += RADIUS; if(my_IDx == Num_procsx - 1) endx -= RADIUS; if(my_IDy == 0) starty += RADIUS; if(my_IDy == Num_procsy - 1) endy -= RADIUS; upc_barrier; for (iter = 0; 
iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { upc_barrier; stencil_time = wtime(); } /* Get ghost zones */ /* NORTH */ if(my_IDy != 0){ int peer = (my_IDy - 1) * Num_procsx + my_IDx; for (int y=starty - RADIUS; y<starty; y++) { int transfer_size = (endx - startx) * sizeof(DTYPE); upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size); } } /* SOUTH */ if(my_IDy != Num_procsy - 1){ int peer = (my_IDy + 1) * Num_procsx + my_IDx; for (int y=endy; y<endy + RADIUS; y++) { int transfer_size = (endx - startx) * sizeof(DTYPE); upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size); } } /* LEFT */ if(my_IDx != 0){ int peer = my_IDy * Num_procsx + my_IDx - 1; for (int y=starty; y<endy; y++) { for (int x=startx - RADIUS; x<startx; x++) { in_array_private[y][x] = private_in_arrays[peer][y][x]; } } } /* RIGHT*/ if(my_IDx != Num_procsx - 1){ int peer = my_IDy * Num_procsx + my_IDx + 1; for (int y=starty; y<endy; y++) { for (int x=endx; x<endx + RADIUS; x++) { in_array_private[y][x] = private_in_arrays[peer][y][x]; } } } /* Apply the stencil operator */ for (j=starty; j<endy; j++) { for (i=startx; i<endx; i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } upc_barrier; /* <- Necessary barrier: some slow threads could use future data */ /* add constant to solution to force refresh of neighbor data, if any */ for(int y=myoffsety + RADIUS; y<myoffsety + sizey - RADIUS; y++) for(int x=myoffsetx + RADIUS; x<myoffsetx + sizex - RADIUS; x++) in_array_private[y][x] += 1.0; upc_barrier; /* <- Necessary barrier: some threads could start on old data */ } /* end of iterations */ stencil_time = wtime() - stencil_time; times[MYTHREAD] = stencil_time; upc_barrier; 
// Compute max_time if(MYTHREAD == 0){ max_time = times[MYTHREAD]; for(i=1; i<THREADS; i++){ if(max_time < times[i]) max_time = times[i]; } } norm = (double) 0.0; f_active_points = (double)(n-2*RADIUS) * (double)(n-2*RADIUS); /* compute L1 norm in parallel */ for (int y=starty; y<endy; y++) { for (int x=startx; x<endx; x++) { norm += (double)ABS(out_array[y][x]); } } norm /= f_active_points; norms[MYTHREAD] = norm; upc_barrier; if(MYTHREAD == 0){ norm = 0.; for(int i=0; i<THREADS; i++) norm += norms[i]; /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ reference_norm = (double) (iterations+1) * (COEFX + COEFY); if (ABS(norm - reference_norm) > EPSILON) bail_out("L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = max_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); exit(EXIT_SUCCESS); } }
/*
 * user_main - per-thread entry point of the k-mer graph builder.
 *
 * NOTE(review): this text looks machine-generated from pgen.upc (every
 * statement carries a #line directive and the temporaries are named
 * _bupc_*), so fixes presumably belong in pgen.upc rather than here -
 * TODO confirm.
 *
 * What the code does, phase by phase:
 *   1. Input:  asks getNumKmersInUFX() for the number of k-mers in the file
 *      named by argv[1], gives every thread an equal share (the last thread
 *      also takes the remainder) and reads that share into a malloc'ed
 *      buffer.  Each k-mer record is handled as 23 bytes.
 *   2. Construction: creates a private hash table, inserts the records whose
 *      hashkmer() value equals this thread's id and copies all others into
 *      a staging area obtained from upc_all_alloc; after a barrier every
 *      thread fetches the records staged for it by each other thread and
 *      inserts them as well.
 *   3. Traversal: only a barrier is timed; no work is done.
 * Thread 0 finally prints the three timings.
 *
 * Parameters: argv[0] is printed as program name, argv[1] is the input file.
 * Returns 0.
 *
 * NOTE(review): the results of malloc and fopen are used unchecked, and the
 * seek/read error paths end the program through upcri_do_exit(0), i.e. with
 * status 0.
 */
extern int user_main(
  int argc,
  char ** argv)
#line 14 "pgen.upc"
{
#line 14 "pgen.upc"
  UPCR_BEGIN_FUNCTION();
  /* temporaries holding call results */
  register _IEEE64 _bupc_comma;
  register _INT64 _bupc_comma0;
  register _INT32 _bupc_comma1;
  register _UINT64 _bupc_comma2;
  register _IEEE64 _bupc_comma3;
  register _IEEE64 _bupc_comma4;
  register _INT64 _bupc_comma5;
  register _INT64 _bupc_comma7;
  register _INT64 _bupc_comma6;
  register _IEEE64 _bupc_comma8;
  register _IEEE64 _bupc_comma9;
  register _IEEE64 _bupc_comma10;
  /* accumulated wall-clock time of the three phases */
  _IEEE64 inputTime;
  _IEEE64 constrTime;
  _IEEE64 traversalTime;
  int n_total_kmers;
  int n_kmers_to_process_ideal;
  int start_kmer;
  int end_kmer;
  int n_kmers_to_process;
  int char_start_position;
  int chars_to_read;
  unsigned char * buffer;
  unsigned char * _bupc__casttmp9;
  struct __sFILE * input_file;
  int _bupc__spilleq10;
  unsigned long _bupc__spilleq11;
  struct memory_heap_t private_memory_heap;
  struct hash_table_t * private_hashtable;
  int nints;
  upcr_pshared_ptr_t process_kmer_list_offsets_global;
  int * process_kmer_list_offsets;
  int max_kmers_to_transfer_to_single_process;
  int nchars;
  upcr_pshared_ptr_t kmers_to_transfer_global;
  char * kmers_to_transfer;
  int i;
  int process_owner;
  int _bupc_w2c_i0;
  int n_kmers_char_to_transfer_to_self;
  int j;
  long long _bupc__spilleq12;
  void * _bupc_call5;
  struct __sFILE * _bupc_call6;
  struct hash_table_t * _bupc_call7;
  upcr_shared_ptr_t _bupc_call8;
  upcr_shared_ptr_t _bupc_call9;
  upcr_pshared_ptr_t _bupc_Mstopcvt10;
  upcr_pshared_ptr_t _bupc_Mptra11;
  int * _bupc_Mcvtptr12;
  upcr_pshared_ptr_t _bupc_Mstopcvt13;
  upcr_pshared_ptr_t _bupc_Mptra14;
  char * _bupc_Mcvtptr15;
  upcr_pshared_ptr_t _bupc_Mptra16;
  upcr_pshared_ptr_t _bupc_Mptra17;
  int _bupc_spillld18;
  upcr_pshared_ptr_t _bupc_Mptra19;
  upcr_pshared_ptr_t _bupc_Mptra20;
  upcr_shared_ptr_t _bupc_Mstopcvt21;

#line 17 "pgen.upc"
  inputTime = 0.0;
#line 17 "pgen.upc"
  constrTime = 0.0;
#line 17 "pgen.upc"
  traversalTime = 0.0;

  /* ---- phase 1: read this thread's slice of the input file ----
     Each timer is accumulated as (-start + end). */
#line 20 "pgen.upc"
  upcr_barrier(346153894, 1);
#line 21 "pgen.upc"
  _bupc_comma = gettime();
#line 21 "pgen.upc"
  inputTime = inputTime - _bupc_comma;
#line 24 "pgen.upc"
  _bupc_comma0 = getNumKmersInUFX((const char *) * (argv + 1LL));
#line 24 "pgen.upc"
  n_total_kmers = _bupc_comma0;
#line 25 "pgen.upc"
  n_kmers_to_process_ideal = n_total_kmers / ((int) upcr_threads () );
#line 26 "pgen.upc"
  start_kmer = ((int) upcr_mythread () ) * n_kmers_to_process_ideal;
#line 27 "pgen.upc"
  end_kmer = (((int) upcr_mythread () ) + 1) * n_kmers_to_process_ideal;
  /* the last thread also takes the remainder of the integer division */
#line 28 "pgen.upc"
  if(((int) upcr_mythread () ) == (((int) upcr_threads () ) + -1))
#line 28 "pgen.upc"
  {
#line 28 "pgen.upc"
    end_kmer = n_total_kmers;
  }
#line 29 "pgen.upc"
  n_kmers_to_process = end_kmer - start_kmer;
  /* 23 is the byte length used for one k-mer record throughout this function */
#line 30 "pgen.upc"
  char_start_position = start_kmer * 23;
#line 31 "pgen.upc"
  chars_to_read = n_kmers_to_process * 23;
  /* NOTE(review): the malloc result is not checked before use */
#line 32 "pgen.upc"
  _bupc_call5 = malloc((unsigned long)(_UINT64)(chars_to_read));
#line 32 "pgen.upc"
  _bupc__casttmp9 = _bupc_call5;
#line 32 "pgen.upc"
  buffer = _bupc__casttmp9;
#line 34 "pgen.upc"
  printf("Process %d: Reading and creating graph for K-mers %d - %d\n", ((int) upcr_mythread () ), start_kmer, end_kmer);
  /* NOTE(review): the fopen result is not checked before fseek/fread */
#line 37 "pgen.upc"
  _bupc_call6 = fopen((const char *) * (argv + 1LL), "r");
#line 37 "pgen.upc"
  input_file = _bupc_call6;
  /* whence is the literal 1 - presumably SEEK_CUR on this platform, TODO confirm */
#line 38 "pgen.upc"
  _bupc_comma1 = fseek(input_file, (long) char_start_position, (int) 1);
#line 38 "pgen.upc"
  _bupc__spilleq10 = _bupc_comma1;
#line 38 "pgen.upc"
  if(_bupc__spilleq10 != 0)
#line 38 "pgen.upc"
  {
#line 39 "pgen.upc"
    printf("Error Seeking...");
#line 40 "pgen.upc"
    upcri_do_exit((int) 0);
  }
#line 42 "pgen.upc"
  _bupc_comma2 = fread(buffer, (unsigned long) 1ULL, (unsigned long)(_UINT64)(chars_to_read), input_file);
#line 42 "pgen.upc"
  _bupc__spilleq11 = _bupc_comma2;
  /* a short read is treated as fatal */
#line 42 "pgen.upc"
  if(_bupc__spilleq11 != (_UINT64)(chars_to_read))
#line 42 "pgen.upc"
  {
#line 43 "pgen.upc"
    printf("Error reading...");
#line 44 "pgen.upc"
    upcri_do_exit((int) 0);
  }
#line 46 "pgen.upc"
  fclose(input_file);
#line 47 "pgen.upc"
  upcr_barrier(346153895, 1);
#line 48 "pgen.upc"
  _bupc_comma3 = gettime();
#line 48 "pgen.upc"
  inputTime = inputTime + _bupc_comma3;

  /* ---- phase 2: graph construction ---- */
#line 51 "pgen.upc"
  _bupc_comma4 = gettime();
#line 51 "pgen.upc"
  constrTime = constrTime - _bupc_comma4;
  /* the table is created with twice as many slots as local k-mers */
#line 56 "pgen.upc"
  _bupc_call7 = create_hash_table((long long)(n_kmers_to_process * 2), &private_memory_heap);
#line 56 "pgen.upc"
  private_hashtable = _bupc_call7;
  /* shared array of byte counters: one block of THREADS 4-byte ints per thread */
#line 59 "pgen.upc"
  nints = ((int) upcr_threads () );
#line 60 "pgen.upc"
  _bupc_call8 = upc_all_alloc((unsigned long)(_UINT64)(((int) upcr_threads () )), (unsigned long)((_UINT64)(nints) * 4ULL));
#line 60 "pgen.upc"
  _bupc_Mstopcvt10 = UPCR_SHARED_TO_PSHARED(_bupc_call8);
#line 60 "pgen.upc"
  process_kmer_list_offsets_global = _bupc_Mstopcvt10;
  /* local view of this thread's own block of counters, zeroed below */
#line 61 "pgen.upc"
  _bupc_Mptra11 = UPCR_ADD_PSHARED1(process_kmer_list_offsets_global, 4ULL, ((int) upcr_mythread () ));
#line 61 "pgen.upc"
  _bupc_Mcvtptr12 = (int *) UPCR_PSHARED_TO_LOCAL(_bupc_Mptra11);
#line 61 "pgen.upc"
  process_kmer_list_offsets = _bupc_Mcvtptr12;
#line 62 "pgen.upc"
  memset(process_kmer_list_offsets, (int) 0, (unsigned long)((_UINT64)(((int) upcr_threads () )) * 4ULL));
  /* capacity reserved per destination thread: twice the even share.
     NOTE(review): nothing below checks that a destination's region does not
     overflow this capacity */
#line 64 "pgen.upc"
  max_kmers_to_transfer_to_single_process = (n_kmers_to_process_ideal / ((int) upcr_threads () )) * 2;
#line 65 "pgen.upc"
  nchars = (max_kmers_to_transfer_to_single_process * ((int) upcr_threads () )) * 23;
  /* shared staging area: one block of nchars bytes per thread */
#line 68 "pgen.upc"
  _bupc_call9 = upc_all_alloc((unsigned long)(_UINT64)(((int) upcr_threads () )), (unsigned long)(_UINT64)(nchars));
#line 68 "pgen.upc"
  _bupc_Mstopcvt13 = UPCR_SHARED_TO_PSHARED(_bupc_call9);
#line 68 "pgen.upc"
  kmers_to_transfer_global = _bupc_Mstopcvt13;
#line 70 "pgen.upc"
  _bupc_Mptra14 = UPCR_ADD_PSHARED1(kmers_to_transfer_global, 1ULL, ((int) upcr_mythread () ));
#line 70 "pgen.upc"
  _bupc_Mcvtptr15 = (char *) UPCR_PSHARED_TO_LOCAL(_bupc_Mptra14);
#line 70 "pgen.upc"
  kmers_to_transfer = _bupc_Mcvtptr15;

  /* Walk the local records: keep the ones hashkmer() assigns to this thread,
     stage the others in the region reserved for their owner. */
#line 73 "pgen.upc"
  i = 0;
#line 73 "pgen.upc"
  while(i < (n_kmers_to_process * 23))
#line 73 "pgen.upc"
  {
#line 74 "pgen.upc"
    _bupc_comma5 = hashkmer((long long) ((int) upcr_threads () ), (char *)(buffer + i));
#line 74 "pgen.upc"
    process_owner = _bupc_comma5;
#line 76 "pgen.upc"
    if(process_owner == ((int) upcr_mythread () ))
#line 76 "pgen.upc"
    {
      /* the bytes at record offsets 20 and 21 are passed as the two extra
         char arguments */
#line 77 "pgen.upc"
      add_kmer(private_hashtable, &private_memory_heap, buffer + i, (char)(char) * ((buffer + (i + 19)) + 1LL), (char)(char) * ((buffer + (i + 19)) + 2LL));
    }
    else
#line 77 "pgen.upc"
    {
      /* destination = start of the owner's region + bytes already staged for it */
#line 83 "pgen.upc"
      memcpy(kmers_to_transfer + (*(process_kmer_list_offsets + process_owner) + ((process_owner * max_kmers_to_transfer_to_single_process) * 23)), buffer + i, (unsigned long) 23ULL);
#line 84 "pgen.upc"
      * (process_kmer_list_offsets + process_owner) = *(process_kmer_list_offsets + process_owner) + 23;
    }
#line 86 "pgen.upc"
    _1 :;
#line 86 "pgen.upc"
    i = i + 23;
  }
  /* all staging must be complete before any thread starts fetching */
#line 89 "pgen.upc"
  upcr_barrier(346153896, 1);

  /* Fetch what every other thread staged for this thread and insert it. */
#line 91 "pgen.upc"
  _bupc_w2c_i0 = 0;
#line 91 "pgen.upc"
  while(_bupc_w2c_i0 < ((int) upcr_threads () ))
#line 91 "pgen.upc"
  {
#line 92 "pgen.upc"
    if(_bupc_w2c_i0 != ((int) upcr_mythread () ))
#line 92 "pgen.upc"
    {
      /* number of bytes the peer staged for this thread */
#line 93 "pgen.upc"
      _bupc_Mptra16 = UPCR_ADD_PSHARED1(process_kmer_list_offsets_global, 4ULL, _bupc_w2c_i0);
#line 93 "pgen.upc"
      _bupc_Mptra17 = UPCR_ADD_PSHAREDI(_bupc_Mptra16, 4ULL, ((int) upcr_mythread () ));
#line 93 "pgen.upc"
      UPCR_GET_PSHARED(&_bupc_spillld18, _bupc_Mptra17, 0, 4);
#line 93 "pgen.upc"
      n_kmers_char_to_transfer_to_self = _bupc_spillld18;
      /* this thread's region inside the peer's staging block */
#line 96 "pgen.upc"
      _bupc_Mptra19 = UPCR_ADD_PSHARED1(kmers_to_transfer_global, 1ULL, _bupc_w2c_i0);
#line 96 "pgen.upc"
      _bupc_Mptra20 = UPCR_ADD_PSHAREDI(_bupc_Mptra19, 1ULL, (((int) upcr_mythread () ) * max_kmers_to_transfer_to_single_process) * 23);
#line 96 "pgen.upc"
      _bupc_Mstopcvt21 = UPCR_PSHARED_TO_SHARED(_bupc_Mptra20);
      /* the input buffer is reused as receive buffer.
         NOTE(review): the byte count is not compared with chars_to_read,
         the size buffer was allocated with */
#line 96 "pgen.upc"
      upc_memget(buffer, _bupc_Mstopcvt21, (unsigned long)(_UINT64)(n_kmers_char_to_transfer_to_self));
#line 98 "pgen.upc"
      j = 0;
#line 98 "pgen.upc"
      while(j < n_kmers_char_to_transfer_to_self)
#line 98 "pgen.upc"
      {
        /* diagnostic only: print when a received record does not hash to
           this thread; the record is inserted regardless */
#line 99 "pgen.upc"
        _bupc_comma7 = hashkmer((long long) ((int) upcr_threads () ), (char *)(buffer + j));
#line 99 "pgen.upc"
        _bupc__spilleq12 = _bupc_comma7;
#line 99 "pgen.upc"
        if(_bupc__spilleq12 != (_INT64)(((int) upcr_mythread () )))
#line 99 "pgen.upc"
        {
#line 99 "pgen.upc"
          _bupc_comma6 = hashkmer((long long) ((int) upcr_threads () ), (char *)(buffer + j));
#line 99 "pgen.upc"
          printf("%d %d\n", ((int) upcr_mythread () ), _bupc_comma6);
        }
#line 100 "pgen.upc"
        add_kmer(private_hashtable, &private_memory_heap, buffer + j, (char)(char) * ((buffer + (j + 19)) + 1LL), (char)(char) * ((buffer + (j + 19)) + 2LL));
#line 101 "pgen.upc"
        _3 :;
#line 101 "pgen.upc"
        j = j + 23;
      }
    }
#line 103 "pgen.upc"
    _2 :;
#line 103 "pgen.upc"
    _bupc_w2c_i0 = _bupc_w2c_i0 + 1;
  }
#line 105 "pgen.upc"
  upcr_barrier(346153897, 1);
#line 106 "pgen.upc"
  _bupc_comma8 = gettime();
#line 106 "pgen.upc"
  constrTime = constrTime + _bupc_comma8;

  /* ---- phase 3: traversal - only a barrier lies between the two timer reads ---- */
#line 109 "pgen.upc"
  _bupc_comma9 = gettime();
#line 109 "pgen.upc"
  traversalTime = traversalTime - _bupc_comma9;
#line 114 "pgen.upc"
  upcr_barrier(346153898, 1);
#line 115 "pgen.upc"
  _bupc_comma10 = gettime();
#line 115 "pgen.upc"
  traversalTime = traversalTime + _bupc_comma10;

  /* thread 0 reports the timings */
#line 119 "pgen.upc"
  if(((int) upcr_mythread () ) == 0)
#line 119 "pgen.upc"
  {
#line 120 "pgen.upc"
    printf("%s: Input set: %s\n", *argv, *(argv + 1LL));
#line 121 "pgen.upc"
    printf("Number of UPC threads: %d\n", ((int) upcr_threads () ));
#line 122 "pgen.upc"
    printf("Input reading time: %f seconds\n", inputTime);
#line 123 "pgen.upc"
    printf("Graph construction time: %f seconds\n", constrTime);
#line 124 "pgen.upc"
    printf("Graph traversal time: %f seconds\n", traversalTime);
  }
#line 126 "pgen.upc"
  UPCR_EXIT_FUNCTION();
#line 126 "pgen.upc"
  return 0;
} /* user_main */
/*------------------------------------------------------------------*/
/* upc_all_fwrite_shared_async                                      */
/*                                                                  */
/* Starts an asynchronous write of size*nmemb bytes from a shared   */
/* buffer to the file behind fh_shared and records the pending      */
/* operation in the calling thread's file handle.                   */
/*                                                                  */
/*   fh_shared  file handle; the call is a no-op when it is NULL    */
/*   buffer     shared source buffer                                */
/*   blocksize  block size of the buffer in elements, 0 = the       */
/*              buffer is read as one contiguous region             */
/*   size       size of one element in bytes                        */
/*   nmemb      number of elements                                  */
/*   ret        passed through to the UPC_ADIO_Iwrite* call         */
/*   sync_mode  UPC_IN_* / UPC_OUT_* synchronization flags          */
/*                                                                  */
/* The call returns without doing anything when the file is opened  */
/* read-only or when another asynchronous operation is pending.     */
/*                                                                  */
/* With an individual file pointer every thread gathers the data    */
/* into a private buffer and writes it at its private pointer.      */
/* With the shared file pointer and a blocked buffer every thread   */
/* issues a strided write for its blocks; with an unblocked buffer  */
/* only thread 0 writes.  The file pointer used is advanced by the  */
/* byte count in all cases.                                         */
/*                                                                  */
/* NOTE(review): malloc results are used unchecked, and size == 0   */
/* with blocksize != 0 divides by zero; both left as found because  */
/* the intended recovery in a collective call is not visible here.  */
/*------------------------------------------------------------------*/
void upc_all_fwrite_shared_async( upcio_file_t *fh_shared, shared void *buffer, uint32_t blocksize, upc_off_t size, uint32_t nmemb, int64_t *ret, upc_flag_t sync_mode )
{
	Plfs_fd *fd;
	/* Zero-initialised: in the shared-pointer, unblocked case only thread 0
	   issues a request, yet every thread stores `request` and `local_buf`
	   into its handle below.  They used to be indeterminate there. */
	UPC_ADIO_Request request = {0};
	unsigned char *local_buf = NULL;
	upc_off_t count, blocksize_byte, roundsize;
	upc_off_t off;          /* byte offset into local_buf; 64-bit so that
	                           counts of 4 GiB and more do not wrap */
	uint32_t round, i;
	upc_off_t start_th, my_th;
	int error_code;
	shared unsigned char * buffer_char;
	struct __struct_thread_upc_file_t *fh;
	upc_off_t *dispsize;
	upc_off_t *disparray;
	upc_off_t disp;
	upc_off_t extra;
	uint32_t extra_block;
	upc_off_t mpi_size;
	upc_off_t nblocks;

	/*------------------------------------------------------------------*/
	/* the file handler has to be valid                                 */
	/*------------------------------------------------------------------*/
	if( fh_shared == NULL )
		return;

	/*------------------------------------------------------------------*/
	/* cast the local file handler into private ones                    */
	/* hopefully doing so will increase performance                     */
	/*------------------------------------------------------------------*/
	fh = (struct __struct_thread_upc_file_t *)(fh_shared->th[MYTHREAD]);
	fd = (Plfs_fd *)fh->adio_fd;

	/*------------------------------------------------------------------*/
	/* make sure the file is not opened with read only                  */
	/*------------------------------------------------------------------*/
	if( fh->flags & UPC_RDONLY )
		return;

	/*------------------------------------------------------------------*/
	/* make sure there is no asynchronous ops pending                   */
	/*------------------------------------------------------------------*/
	if( fh->async_flag == 1 )
		return;

	/*------------------------------------------------------------------*/
	/* set the asynchronous ops flag                                    */
	/*------------------------------------------------------------------*/
	fh->async_flag = 1;

	/*------------------------------------------------------------------*/
	/* upc sync mode (entry): MYSYNC is implemented as a full barrier   */
	/*------------------------------------------------------------------*/
	if( !(sync_mode & UPC_IN_NOSYNC) )
		upc_barrier;

	count = size*nmemb;
	blocksize_byte = blocksize*size;

	if( fh->flags & UPC_INDIVIDUAL_FP )
	{
		if( blocksize )
		{
			roundsize = blocksize_byte * THREADS;
			buffer_char = (shared unsigned char *)buffer;
			start_th = upc_threadof( buffer_char );
			local_buf = (unsigned char *)malloc(sizeof(unsigned char)*count);

			/* gather the full blocks one by one, following the buffer
			   from thread to thread and wrapping into the next round */
			my_th = start_th;
			round = 0;
			for(off=0; off<count-(count%blocksize_byte); off+=blocksize_byte)
			{
				upc_memget(&local_buf[off], buffer_char+round*roundsize+my_th-start_th, blocksize_byte);
				my_th++;
				if(my_th == THREADS)
				{
					my_th = 0;
					round++;
				}
			}
			/* trailing partial block (count-off may be 0) */
			upc_memget(&local_buf[off], buffer_char+round*roundsize+my_th-start_th, count-off);

			UPC_ADIO_IwriteContig( fd, local_buf, count, fh->private_pointer, &request, ret, &error_code );
		}
		else
		{
			local_buf = (unsigned char *)malloc(sizeof(unsigned char)*count);
			upc_memget(local_buf, buffer, count);
			UPC_ADIO_IwriteContig( fd, local_buf, count, fh->private_pointer, &request, ret, &error_code );
		}

		/*------------------------------------------------------------------*/
		/* increment the file pointer                                       */
		/*------------------------------------------------------------------*/
		fh->private_pointer += count;
	}
	else
	{
		if( blocksize )
		{
			buffer_char = (shared unsigned char *)buffer;
			start_th = upc_threadof( buffer_char );
			roundsize = blocksize_byte * THREADS;
			round = (uint32_t)(count / roundsize);
			nblocks = (uint32_t)(count / blocksize_byte);
			extra_block = nblocks%THREADS;
			extra = count % blocksize_byte;

			/* bytes contributed by this thread: one block per full
			   round, plus the additions made below */
			mpi_size = round * blocksize_byte;
			round++;

			if( MYTHREAD < start_th ) /* wrap around */
			{
				my_th = MYTHREAD + THREADS - start_th;
				local_buf = (unsigned char *)(buffer_char + roundsize - start_th +MYTHREAD );
			}
			else
			{
				my_th = MYTHREAD - start_th;
				local_buf = (unsigned char *)(buffer_char + my_th);
			}

			if( my_th < nblocks%THREADS )
				mpi_size += blocksize_byte;
			if( my_th == extra_block )
				mpi_size += extra;

			disp = fh->shared_pointer + blocksize_byte * my_th;
			disparray = (upc_off_t *)malloc(round*sizeof(upc_off_t));
			dispsize = (upc_off_t *)malloc(round*sizeof(upc_off_t));
			for( i=0; i<round; i++ )
			{
				disparray[i] = disp + i * roundsize;
				dispsize[i] = blocksize_byte;
			}

			UPC_ADIO_IwriteStrided( fd, 1, &local_buf, &mpi_size, round, disparray, dispsize, &request, ret, &error_code );

			/*------------------------------------------------------------------*/
			/* update the metadata                                              */
			/*------------------------------------------------------------------*/
			fh->disparray = disparray;
			fh->dispsize = dispsize;
		}
		else
		{
			//if( MYTHREAD == upc_threadof(buffer) )
			if( MYTHREAD == 0 )
			{
				local_buf = (unsigned char *)malloc(sizeof(unsigned char)*count);
				upc_memget(local_buf, buffer, count);
				UPC_ADIO_IwriteContig( fd, local_buf, count, fh->shared_pointer, &request, ret, &error_code );
			}
		}

		/*------------------------------------------------------------------*/
		/* increment the file pointer                                       */
		/*------------------------------------------------------------------*/
		fh->shared_pointer += count;
	}

	/*------------------------------------------------------------------*/
	/* update the metadata                                              */
	/*------------------------------------------------------------------*/
	fh->request = request;
	fh->async_op = __REF_UPC_WRITE_SHARED_ASYNC;
	fh->local_ptr = local_buf;
	fh->blocksize = blocksize_byte;
	fh->size = count;

	/*------------------------------------------------------------------*/
	/* upc sync mode (exit): governed by the UPC_OUT_* flags; the       */
	/* original tested the UPC_IN_* flags a second time here            */
	/*------------------------------------------------------------------*/
	if( !(sync_mode & UPC_OUT_NOSYNC) )
		upc_barrier;

	return;
}