int main(int argc, char *argv[]) {
    if (argc < 2) {
        printf("Missing size of array!\n");
        return EXIT_FAILURE;
    }

    int size_array = atoi(argv[1]);
    int *array = (int *) malloc(size_array * sizeof(*array));
    if (array == NULL) {
        printf("Failed to allocate array!\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < size_array; i++) {
        array[i] = getrand(0, 100000);
        // printf("array[%d] = %d ", i, array[i]);
    }

    double time = wtime();

    /* Bubble sort: after pass i the largest remaining element has
       bubbled to position size_array - i - 1. */
    for (int i = 0; i < size_array - 1; i++) {
        for (int j = 0; j < size_array - i - 1; j++) {
            if (array[j] > array[j + 1]) {
                int tmp = array[j];
                array[j] = array[j + 1];
                array[j + 1] = tmp;
            }
        }
    }

    /*
    for (int i = 0; i < size_array; i++) {
        printf("array[%d] = %d ", i, array[i]);
    }
    */

    time = wtime() - time;

    FILE *tb = fopen("bubblesort.dat", "a");
    if (tb != NULL) {
        fprintf(tb, "%d %.6f\n", size_array, time);
        fclose(tb);
    }

    free(array);
    return EXIT_SUCCESS;
}
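The benchmark above relies on two helpers that are not shown here, wtime() and getrand(). A minimal sketch of what they might look like, assuming wtime() returns wall-clock seconds via gettimeofday() and getrand() draws a uniform integer from [low, high] with rand(); both are assumptions, not the original implementations:

#include <stdlib.h>
#include <sys/time.h>

/* Hypothetical helper: wall-clock time in seconds (assumption). */
double wtime(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
}

/* Hypothetical helper: pseudo-random integer in [low, high] (assumption). */
int getrand(int low, int high)
{
    return low + rand() % (high - low + 1);
}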
int main(int argc, char **argv)
{
    int i, me, target;
    unsigned int size;
    double t;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    target = 1 - me;

    init_buf(send_buf, me);
    init_buf(recv_buf, target);

    if (me == 0) print_items();

    for (size = 1; size < MAX_SIZE + 1; size *= 2) {
        MPI_Barrier(MPI_COMM_WORLD);
        for (i = 0; i < LOOP + WARMUP; i++) {
            if (WARMUP == i)
                t = wtime();

            if (me == 0) {
                MPI_Send(send_buf, size, MPI_CHAR, target, 9, MPI_COMM_WORLD);
                MPI_Recv(recv_buf, size, MPI_CHAR, target, 5, MPI_COMM_WORLD, &status);
            }
            else {
                MPI_Recv(recv_buf, size, MPI_CHAR, target, 9, MPI_COMM_WORLD, &status);
                MPI_Send(send_buf, size, MPI_CHAR, target, 5, MPI_COMM_WORLD);
            }
        }
        MPI_Barrier(MPI_COMM_WORLD);
        t = wtime() - t;

        if (me == 0) print_results(size, t);
    }

    MPI_Finalize();
    return 0;
}
// startrun: startup hierarchical N-body code.
// ___________________________________________
// This runs once.

local void startrun(void)
{
    printf("startrun\n");
    startrun_time_0 = wtime();

    bodyptr p1, p2, p;
    stream gravstr;

    define_body(sizeof(body), Precision, NDIM);     // setup phat body struct
    define_body_offset(PosTag,  BodyOffset(Pos));
    define_body_offset(VelTag,  BodyOffset(Vel));
    define_body_offset(MassTag, BodyOffset(Mass));
    define_body_offset(PhiTag,  BodyOffset(Phi));
    define_body_offset(AccTag,  BodyOffset(Acc));

    infile = getparam("in");                        // set I/O file names
    outfile = getparam("out");
    savefile = getparam("save");

    if (strnull(getparam("restore")))               // starting a new run?
        newrun();
    else                                            // else resume old run
        oldrun();

    if (ABS(nstatic) > nbody)                       // check nstatic is OK
        error("%s: absurd value for nstatic\n", getargv0());

    p1 = bodytab + MAX(nstatic, 0);                 // set dynamic body range
    p2 = bodytab + nbody + MIN(nstatic, 0);

    testcalc = TRUE;                                // determine type of calc:
    for (p = p1; p < p2; p++)
        testcalc = testcalc && (Mass(p) == 0);      // look for dynamic masses

    strfile = getparam("stream");
    logfile = getparam("log");

#if defined(EXTGRAV)
    if (! strnull(getparam("gravgsp"))) {           // was GSP file given?
        gravstr = stropen(getparam("gravgsp"), "r");
        get_history(gravstr);
        gravgsp = get_gsprof(gravstr);              // read external field GSP
        strclose(gravstr);
    }
#endif

    startrun_time_1 = wtime();
}
int main(int argc, char **argv)
{
    int n;
    int repeat;
    double dot;
    long start_time, end_time;

    if (argc != 3) {
        printf("Usage: %s <vector size> <repetitions>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    n = atoi(argv[1]);       // vector size
    repeat = atoi(argv[2]);  // number of repetitions (to vary the load)

    // Create vectors
    double *a = (double *) malloc(sizeof(double) * n);
    double *b = (double *) malloc(sizeof(double) * n);
    if (a == NULL || b == NULL) {
        printf("Memory allocation error\n");
        exit(EXIT_FAILURE);
    }

    init_vectors(a, b, n);

    start_time = wtime();
    dot = dot_product(a, b, n, repeat);
    end_time = wtime();

    printf("Dot product = %f\n", dot);
    printf("Computation time = %ld usec\n", (long) (end_time - start_time));

    free((void *) a);
    free((void *) b);
    return EXIT_SUCCESS;
}
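init_vectors and dot_product are not shown. A minimal sketch of what they might look like, assuming repeat simply re-runs the same reduction to lengthen the measured interval; these helpers are hypothetical, not the original source:

/* Hypothetical helpers assumed by the dot-product benchmark above. */
void init_vectors(double *a, double *b, int n)
{
    for (int i = 0; i < n; i++) {
        a[i] = 1.0;   /* any deterministic fill works for timing purposes */
        b[i] = 2.0;
    }
}

double dot_product(double *a, double *b, int n, int repeat)
{
    double dot = 0.0;
    for (int r = 0; r < repeat; r++) {       /* repeat to vary the load */
        dot = 0.0;
        for (int i = 0; i < n; i++)
            dot += a[i] * b[i];
    }
    return dot;
}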
int main(int argc, char **argv)
{
    pthread_t thread1;
    int ret = -1;
    int i = 0;
    double time1, time2;

    ret = pthread_create(&thread1, NULL, thread1_fn, NULL);
    assert(ret == 0);

    time1 = wtime();
    for (i = 0; i < ITERATIONS; i++) {
        wakeywakey();
    }
    time2 = wtime();

    printf("time for %d iterations: %f seconds.\n", ITERATIONS, (time2 - time1));
    printf("per iteration: %f\n", (time2 - time1) / (double) ITERATIONS);

    return 0;
}
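thread1_fn and wakeywakey are defined elsewhere; the loop above times how long it takes to signal the background thread once per iteration. A minimal sketch of what such a pair might look like, assuming a plain condition-variable wakeup; this is an illustration, not the original implementation:

#include <pthread.h>

/* Hypothetical implementation of the wakeup being timed (assumption). */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

void *thread1_fn(void *arg)
{
    (void) arg;
    pthread_mutex_lock(&lock);
    for (;;) {
        pthread_cond_wait(&cond, &lock);  /* sleep until signaled */
    }
    return NULL;                          /* never reached */
}

void wakeywakey(void)
{
    pthread_mutex_lock(&lock);
    pthread_cond_signal(&cond);           /* wake the waiting thread */
    pthread_mutex_unlock(&lock);
}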
void init_synchronization(void)
{
    current_synchronization = form_of_synchronization;
    max_counter = First_max_counter;
    interval = First_interval;
    first_measurement_run = True;

    logging(DBG_SYNC, "starting with max_counter = %d interval = %9.1f\n",
            max_counter, interval * 1.0e6);

    if (current_synchronization == SYNC_REAL) {
        if (!mpi_wtime_is_global)
            determine_time_differences();
        if (lrootproc())
            start_batch = wtime();
        logging(DBG_SYNC, "---- new start_batch ----------------\n");
        MPI_Bcast(&start_batch, 1, MPI_DOUBLE, 0, get_measurement_comm());
    }
}
depth_t bc::bfs_sssp(index_t root)
{
    sa[root] = 0;
    sp_count[root] = 1;
    depth_t level = 0;
    dist[root] = 0;

    while (true) {
        double ltm = wtime();
        index_t front_count = 0;

        for (vertex_t vert_id = 0; vert_id < g->vert_count; vert_id++) {
            if (sa[vert_id] == level) {
                index_t my_beg = g->beg_pos[vert_id];
                index_t my_end = g->beg_pos[vert_id + 1];

                for (; my_beg < my_end; my_beg++) {
                    vertex_t nebr = g->csr[my_beg];
                    path_t weit = g->weight[my_beg];

                    if (dist[nebr] > dist[vert_id] + weit) {
                        dist[nebr] = dist[vert_id] + weit;
                        sp_count[nebr] = 0;      // prior parent is wrong
                        sa[nebr] = level + 1;
                        front_count++;
                    }
                    if (dist[nebr] == dist[vert_id] + weit)
                        sp_count[nebr] += sp_count[vert_id];
                }
            }
        }

        // std::cout << "Level " << (int) level << ": " << front_count << " "
        //           << wtime() - ltm << "\n";

        if (front_count == 0) break;
        level++;
    }

    return level + 1;
}
double stop_synchronization(void)
{
    stop_batch = stop_sync = wtime();

    if (current_synchronization == SYNC_REAL) {
        if (stop_sync - start_sync > interval)
            invalid[counter] = INVALID_TOOK_TOO_LONG;

        logging(DBG_SYNC, "stop_sync = %9.1f ", normalize_time(stop_sync));
        switch (invalid[counter]) {
        case INVALID_TOOK_TOO_LONG:
            logging(DBG_SYNC, "invalid_too_long\n");
            break;
        case INVALID_STARTED_LATE:
            logging(DBG_SYNC, "invalid_started_late\n");
            break;
        default:
            logging(DBG_SYNC, "\n");
        }
    }

    return stop_sync;
}
int main(){ double t; int i, me, target; unsigned int size; me = xmp_node_num(); target = 3 - me; init_buf(local_buf, me); init_buf(target_buf, me); if(me==1) print_items(); for(size=4;size<MAX_SIZE+1;size*=2){ // size must be more than 4 when using Fujitsu RDMA xmp_sync_all(NULL); for(i=0;i<LOOP+WARMUP;i++){ if(WARMUP == i) t = wtime(); if(me == 1){ local_buf[0:size] = target_buf[0:size]:[target]; xmp_sync_memory(NULL); #ifdef DEBUG if(local_buf[0] != '2' && local_buf[size-1] != '2') fprintf(stderr, "Error !\n"); local_buf[0] = '1'; local_buf[size-1] = '1'; #endif xmp_sync_all(NULL); } else{ xmp_sync_all(NULL); local_buf[0:size] = target_buf[0:size]:[target]; #ifdef DEBUG if(local_buf[0] != '1' && local_buf[size-1] != '1') fprintf(stderr, "Error !\n"); local_buf[0] = '2'; local_buf[size-1] = '2'; #endif } xmp_sync_all(NULL); }
int main()
{
    Init();

    double start = wtime();

    double start_linked_list = wtime();
    RunThoughLinkedList();
    double end_linked_list = wtime();

    double start_explicit = wtime();
    RunExplicit();
    double end_explicit = wtime();

    double end = wtime();

    printf("Time through Linked List %7.2f\n"
           "Time through explicit %7.2f\n"
           "Total Time taken %7.2f\n",
           end_linked_list - start_linked_list,
           end_explicit - start_explicit,
           end - start);
}
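RunThoughLinkedList and RunExplicit are not shown; the point of the comparison is pointer-chasing traversal versus direct array indexing over the same data. A minimal sketch of the two traversals under that assumption; the node layout and function bodies here are hypothetical:

#include <stddef.h>

/* Hypothetical data layout for the traversal comparison (assumption). */
typedef struct Node {
    double value;
    struct Node *next;
} Node;

static Node   *head;      /* filled in by Init() in the original program */
static double *values;    /* same data laid out as a flat array          */
static size_t  count;

double RunThoughLinkedList(void)
{
    double sum = 0.0;
    for (Node *n = head; n != NULL; n = n->next)  /* pointer chasing */
        sum += n->value;
    return sum;
}

double RunExplicit(void)
{
    double sum = 0.0;
    for (size_t i = 0; i < count; i++)            /* contiguous, index-based access */
        sum += values[i];
    return sum;
}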
int main(int argc, char **argv) { long int j, iter; /* dummies */ double scalar; /* constant used in Triad operation */ int iterations; /* number of times vector loop gets repeated */ long int length, /* vector length per processor */ total_length, /* total vector length */ offset; /* offset between vectors a and b, and b and c */ double bytes; /* memory IO size */ size_t space; /* memory used for a single vector */ double nstream_time, /* timing parameters */ avgtime = 0.0, maxtime = 0.0, mintime = 366.0*8760.0*3600.0; /* set the minimum time to a large value; one leap year should be enough */ int Num_procs, /* process parameters */ my_ID, /* rank of calling process */ root=0; /* ID of master process */ int error=0; /* error flag for individual process */ /********************************************************************************** * process and test input parameters ***********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&Num_procs); MPI_Comm_rank(MPI_COMM_WORLD,&my_ID); if (my_ID == root) { printf("MPI stream triad: A = B + scalar*C\n"); if (argc != 4) { printf("Usage: %s <# iterations> <vector length> <offset>\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1) { printf("ERROR: Invalid number of iterations: %d\n", iterations); error = 1; goto ENDOFTESTS; } total_length = atol(*++argv); if (total_length < Num_procs) { printf("ERROR: Invalid vector length: %ld\n", total_length); error = 1; goto ENDOFTESTS; } else length = total_length/Num_procs; offset = atol(*++argv); if (offset < 0) { printf("ERROR: Invalid array offset: %ld\n", offset); error = 1; goto ENDOFTESTS; } #ifdef STATIC_ALLOCATION if ((3*length + 2*offset) > N) { printf("ERROR: vector length/offset %ld/%ld too ", total_length, offset); printf("large; increase MAXLENGTH in Makefile or decrease vector length\n"); error = 1; goto ENDOFTESTS; } #endif ENDOFTESTS: ; } bail_out(error); /* broadcast initialization data */ MPI_Bcast(&length,1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&offset,1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&iterations,1, MPI_INT, root, MPI_COMM_WORLD); #ifndef STATIC_ALLOCATION space = (3*length + 2*offset)*sizeof(double); a = (double *) malloc(space); if (!a && my_ID == root) { printf("ERROR: Could not allocate %ld bytes for vectors\n", (long int)space); error = 1; } bail_out(error); #endif b = a + length + offset; c = b + length + offset; bytes = 3.0 * sizeof(double) * length * Num_procs; if (my_ID == root) { printf("Number of processes = %d\n", Num_procs); printf("Vector length = %ld\n", total_length); printf("Offset = %ld\n", offset); printf("Number of iterations = %d\n", iterations); } #pragma vector always for (j=0; j<length; j++) { a[j] = 0.0; b[j] = 2.0; c[j] = 2.0; } /* --- MAIN LOOP --- repeat Triad iterations times --- */ scalar = SCALAR; for (iter=0; iter<iterations; iter++) { MPI_Barrier(MPI_COMM_WORLD); if (my_ID == root) { nstream_time = wtime(); } #pragma vector always for (j=0; j<length; j++) a[j] = b[j]+scalar*c[j]; if (my_ID == root) { if (iter>0 || iterations==1) { /* skip the first iteration */ nstream_time = wtime() - nstream_time; avgtime = avgtime + nstream_time; mintime = MIN(mintime, nstream_time); maxtime = MAX(maxtime, nstream_time); } } /* insert a dependency between iterations to avoid dead-code elimination */ #pragma vector always for (j=0; j<length; j++) b[j] = a[j]; } /********************************************************************* ** 
Analyze and output results. *********************************************************************/ if (my_ID == root) { if (checkTRIADresults(iterations, length)) { avgtime = avgtime/(double)(MAX(iterations-1,1)); printf("Rate (MB/s): %lf, Avg time (s): %lf, Min time (s): %lf", 1.0E-06 * bytes/mintime, avgtime, mintime); printf(", Max time (s): %lf\n", maxtime); } else error = 1; } bail_out(error); MPI_Finalize(); }
int main( int argc, char *argv[] ) { unsigned iter; FILE *infile, *resfile; char *resfilename; // algorithmic parameters algoparam_t param; int np; double runtime, flop; double residual=0.0; // check arguments if( argc < 2 ) { usage( argv[0] ); return 1; } // check input file if( !(infile=fopen(argv[1], "r")) ) { fprintf(stderr, "\nError: Cannot open \"%s\" for reading.\n\n", argv[1]); usage(argv[0]); return 1; } // check result file resfilename= (argc>=3) ? argv[2]:"heat.ppm"; if( !(resfile=fopen(resfilename, "w")) ) { fprintf(stderr, "\nError: Cannot open \"%s\" for writing.\n\n", resfilename); usage(argv[0]); return 1; } // check input if( !read_input(infile, ¶m) ) { fprintf(stderr, "\nError: Error parsing input file.\n\n"); usage(argv[0]); return 1; } print_params(¶m); if( !initialize(¶m) ) { fprintf(stderr, "Error in Solver initialization.\n\n"); usage(argv[0]); return 1; } // full size (param.resolution are only the inner points) np = param.resolution + 2; #if _EXTRAE_ Extrae_init(); #endif // starting time runtime = wtime(); iter = 0; while(1) { switch( param.algorithm ) { case 0: // JACOBI residual = relax_jacobi(param.u, param.uhelp, np, np); // Copy uhelp into u copy_mat(param.uhelp, param.u, np, np); break; case 1: // GAUSS residual = relax_gauss(param.u, np, np); break; } iter++; // solution good enough ? if (residual < 0.00005) break; // max. iteration reached ? (no limit with maxiter=0) if (param.maxiter>0 && iter>=param.maxiter) break; } // Flop count after iter iterations flop = iter * 11.0 * param.resolution * param.resolution; // stopping time runtime = wtime() - runtime; #if _EXTRAE_ Extrae_fini(); #endif fprintf(stdout, "Time: %04.3f \n", runtime); fprintf(stdout, "Flops and Flops per second: (%3.3f GFlop => %6.2f MFlop/s)\n", flop/1000000000.0, flop/runtime/1000000); fprintf(stdout, "Convergence to residual=%f: %d iterations\n", residual, iter); // for plot... coarsen( param.u, np, np, param.uvis, param.visres+2, param.visres+2 ); write_image( resfile, param.uvis, param.visres+2, param.visres+2 ); finalize( ¶m ); return 0; }
int main(int argc, char* argv[]) { double t1, t2, t3, t4, t5; double sum1, sum2, sum3, sum4; int arg = 1, len = 0, iters = 0, verb = 0, run = 1; int do_vcopy = 1, do_vadd = 1, do_vjacobi = 1; while(argc>arg) { if (strcmp(argv[arg],"-v")==0) verb++; else if (strcmp(argv[arg],"-vv")==0) verb+=2; else if (strcmp(argv[arg],"-n")==0) run = 0; else if (strcmp(argv[arg],"-c")==0) do_vadd = 0, do_vjacobi = 0; else if (strcmp(argv[arg],"-a")==0) do_vcopy = 0, do_vjacobi = 0; else if (strcmp(argv[arg],"-j")==0) do_vcopy = 0, do_vadd = 0; else break; arg++; } if (argc>arg) { len = atoi(argv[arg]); arg++; } if (argc>arg) { iters = atoi(argv[arg]); arg++; } if (len == 0) len = 10000; if (iters == 0) iters = 20; len = len * 1000; printf("Alloc/init 3 double arrays of length %d ...\n", len); double* a = (double*) malloc(len * sizeof(double)); double* b = (double*) malloc(len * sizeof(double)); double* c = (double*) malloc(len * sizeof(double)); for(int i = 0; i<len; i++) { a[i] = 1.0; b[i] = (double) (i % 20); c[i] = 3.0; } // Generate vectorized variants & run against naive/original #if __AVX__ bool do32 = true; #else bool do32 = false; #endif // vcopy if (do_vcopy) { vcopy_t vcopy16, vcopy32; Rewriter* rc16 = dbrew_new(); if (verb>1) dbrew_verbose(rc16, true, true, true); dbrew_set_function(rc16, (uint64_t) vcopy); dbrew_config_parcount(rc16, 3); dbrew_config_force_unknown(rc16, 0); dbrew_set_vectorsize(rc16, 16); vcopy16 = (vcopy_t) dbrew_rewrite(rc16, a, b, len); if (verb) decode_func(rc16, "vcopy16"); if (do32) { Rewriter* rc32 = dbrew_new(); if (verb>1) dbrew_verbose(rc32, true, true, true); dbrew_set_function(rc32, (uint64_t) vcopy); dbrew_config_parcount(rc32, 3); dbrew_config_force_unknown(rc32, 0); dbrew_set_vectorsize(rc32, 32); vcopy32 = (vcopy_t) dbrew_rewrite(rc32, a, b, len); if (verb) decode_func(rc32, "vcopy32"); } printf("Running %d iterations of vcopy ...\n", iters); t1 = wtime(); for(int iter = 0; iter < iters; iter++) naive_vcopy(a, b, len); t2 = wtime(); for(int iter = 0; iter < iters; iter++) vcopy(a, b, len); t3 = wtime(); if (run) for(int iter = 0; iter < iters; iter++) vcopy16(a, b, len); t4 = wtime(); if (do32 && run) for(int iter = 0; iter < iters; iter++) vcopy32(a, b, len); t5 = wtime(); printf(" naive: %.3f s, un-rewritten: %.3f s, rewritten-16: %.3f s", t2-t1, t3-t2, t4-t3); if (do32) printf(", rewritten-32: %.3f s", t5-t4); printf("\n"); } // vadd if (do_vadd) { vadd_t vadd16, vadd32; Rewriter* ra16 = dbrew_new(); if (verb>1) dbrew_verbose(ra16, true, true, true); dbrew_set_function(ra16, (uint64_t) vadd); dbrew_config_parcount(ra16, 4); dbrew_config_force_unknown(ra16, 0); dbrew_set_vectorsize(ra16, 16); vadd16 = (vadd_t) dbrew_rewrite(ra16, a, b, c, len); if (verb) decode_func(ra16, "vadd16"); if (do32) { Rewriter* ra32 = dbrew_new(); if (verb>1) dbrew_verbose(ra32, true, true, true); dbrew_set_function(ra32, (uint64_t) vadd); dbrew_config_parcount(ra32, 4); dbrew_config_force_unknown(ra32, 0); dbrew_set_vectorsize(ra32, 32); vadd32 = (vadd_t) dbrew_rewrite(ra32, a, b, c, len); if (verb) decode_func(ra32, "vadd32"); } sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0; printf("Running %d iterations of vadd ...\n", iters); t1 = wtime(); for(int iter = 0; iter < iters; iter++) naive_vadd(a, b, c, len); for(int i = 0; i < len; i++) sum1 += a[i]; t2 = wtime(); for(int iter = 0; iter < iters; iter++) vadd(a, b, c, len); for(int i = 0; i < len; i++) sum2 += a[i]; t3 = wtime(); if (run) for(int iter = 0; iter < iters; iter++) vadd16(a, b, c, len); for(int i = 0; i < len; i++) sum3 
+= a[i]; t4 = wtime(); if (do32 && run) for(int iter = 0; iter < iters; iter++) vadd32(a, b, c, len); for(int i = 0; i < len; i++) sum4 += a[i]; t5 = wtime(); printf(" naive: %.3f s, un-rewritten: %.3f s, rewritten-16: %.3f s", t2-t1, t3-t2, t4-t3); if (do32) printf(", rewritten-32: %.3f s", t5-t4); printf("\n"); printf(" sum naive: %f, sum rewritten-16: %f, sum rewritten-16: %f\n", sum1, sum3, sum4); } // vjacobi_1d if (do_vjacobi) { vcopy_t vjacobi_1d16, vjacobi_1d32; Rewriter* rj16 = dbrew_new(); if (verb>1) dbrew_verbose(rj16, true, true, true); dbrew_set_function(rj16, (uint64_t) vjacobi_1d); dbrew_config_parcount(rj16, 3); dbrew_config_force_unknown(rj16, 0); dbrew_set_vectorsize(rj16, 16); vjacobi_1d16 = (vcopy_t) dbrew_rewrite(rj16, a, b, len); if (verb) decode_func(rj16, "vjacobi_1d16"); if (do32) { Rewriter* rj32 = dbrew_new(); if (verb>1) dbrew_verbose(rj32, true, true, true); dbrew_set_function(rj32, (uint64_t) vjacobi_1d); dbrew_config_parcount(rj32, 3); dbrew_config_force_unknown(rj32, 0); dbrew_set_vectorsize(rj32, 32); vjacobi_1d32 = (vcopy_t) dbrew_rewrite(rj32, a, b, len); if (verb) decode_func(rj32, "vjacobi_1d32"); } sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0; printf("Running %d iterations of vjacobi_1d ...\n", iters); t1 = wtime(); for(int iter = 0; iter < iters; iter++) naive_vjacobi_1d(a+1, b+1, len-2); for(int i = 0; i < len; i++) sum1 += a[i]; t2 = wtime(); for(int iter = 0; iter < iters; iter++) vjacobi_1d(a+1, b+1, len-2); for(int i = 0; i < len; i++) sum2 += a[i]; t3 = wtime(); if (run) for(int iter = 0; iter < iters; iter++) vjacobi_1d16(a+1, b+1, len-2); for(int i = 0; i < len; i++) sum3 += a[i]; t4 = wtime(); if (do32 && run) for(int iter = 0; iter < iters; iter++) vjacobi_1d32(a+1, b+1, len-2); for(int i = 0; i < len; i++) sum4 += a[i]; t5 = wtime(); printf(" naive: %.3f s, un-rewritten: %.3f s, rewritten-16: %.3f s", t2-t1, t3-t2, t4-t3); if (do32) printf(", rewritten-32: %.3f s", t5-t4); printf("\n"); printf(" sum naive: %f, sum rewritten-16: %f, sum rewritten-16: %f\n", sum1, sum3, sum4); } }
int main(int argc, char ** argv) { int Block_order; size_t Block_size; size_t Colblock_size; int Tile_order=32; int tiling; int Num_procs; /* Number of ranks */ int order; /* overall matrix order */ int send_to, recv_from; /* communicating ranks */ size_t bytes; /* total amount of data to be moved */ int my_ID; /* rank */ int root=0; /* root rank of a communicator */ int iterations; /* number of times to run the pipeline algorithm */ int i, j, it, jt, ID;/* dummies */ int iter; /* index of iteration */ int phase; /* phase in the staged communication */ size_t colstart; /* sequence number of first column owned by calling rank */ int error=0; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p;/* workspace for the transpose function */ double abserr, abserr_tot; /* computed error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Status status; /* completion status of message */ MPI_Win shm_win_A; /* Shared Memory window object */ MPI_Win shm_win_B; /* Shared Memory window object */ MPI_Win shm_win_Work_in; /* Shared Memory window object */ MPI_Win shm_win_Work_out; /* Shared Memory window object */ MPI_Info rma_winfo;/* info for window */ MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank within coherence domain */ int group_size; /* number of ranks per shared memory group */ int Num_groups; /* number of shared memory group */ int group_ID; /* sequence number of shared memory group */ int size_mul; /* size multiplier; 0 for non-root ranks in coherence domain*/ int istart; MPI_Request send_req, recv_req; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); root = 0; /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4 && argc !=5){ printf("Usage: %s <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); 
MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM Matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank group size = %d\n", group_size); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that a SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Num_groups = Num_procs/group_size; Block_order = order/Num_groups; group_ID = my_ID/group_size; colstart = Block_order * group_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the column block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ /* RMA win info */ MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. 
*/ MPI_Info_set(rma_winfo, "no_locks", "true"); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); int offset = 32; MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &A_p, &shm_win_A); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A); MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); A_p += offset; /* recompute memory size (overwritten by prior query */ size= (Colblock_size+offset)*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &B_p, &shm_win_B); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B); MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p); if (B_p == NULL){ printf(" Error allocating space for transposed matrix by group %d\n",group_ID); error = 1; } bail_out(error); B_p += offset; if (Num_groups>1) { size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, (void *) &Work_in_p, &shm_win_Work_in); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in); MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_in_p); if (Work_in_p == NULL){ printf(" Error allocating space for in block by group %d\n",group_ID); error = 1; } bail_out(error); /* recompute memory size (overwritten by prior query */ size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &Work_out_p, &shm_win_Work_out); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out); MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_out_p); if (Work_out_p == NULL){ printf(" Error allocating space for out block by group %d\n",group_ID); error = 1; } bail_out(error); } /* Fill the original column matrix */ istart = 0; int chunk_size = Block_order/group_size; if (tiling) { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) { for (i=0;i<order; i+=Tile_order) for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) ((double)order*(jt+colstart) + it); B(it,jt) = -1.0; } } } else { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) for (i=0;i<order; i++) { A(i,j) = (double)((double)order*(j+colstart) + i); B(i,j) = -1.0; } } /* NEED A STORE FENCE HERE */ MPI_Win_sync(shm_win_A); MPI_Win_sync(shm_win_B); MPI_Barrier(shm_comm); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) { for (j=0; j<Block_order; j++) B(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } } for (phase=1; phase<Num_groups; phase++){ recv_from = ((group_ID + phase )%Num_groups); send_to = ((group_ID - phase + Num_groups)%Num_groups); istart = send_to*Block_order; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { for 
(i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } /* NEED A LOAD/STORE FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); if (shm_ID==0) { #ifndef SYNCHRONOUS /* if we place the Irecv outside this block, it would not be protected by a local barrier, which creates a race */ MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req); MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &status); #endif } /* NEED A LOAD FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0; i<Block_order; i++) B(i,j) = Work_in(i,j); } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; /* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr_tot); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Win_unlock_all(shm_win_A); MPI_Win_unlock_all(shm_win_B); MPI_Win_free(&shm_win_A); MPI_Win_free(&shm_win_B); if (Num_groups>1) { MPI_Win_unlock_all(shm_win_Work_in); MPI_Win_unlock_all(shm_win_Work_out); MPI_Win_free(&shm_win_Work_in); MPI_Win_free(&shm_win_Work_out); } MPI_Info_free(&rma_winfo); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
/*---------------------------------------------------------------------------
 * Compute matrix product using BLAS routine DGEMM.
 *
 * Input
 *   int argc      - length of argv[] array
 *   char* argv[]  - pointer to command line parameter array
 *   int verbosity - program verification: verbosity > 0 gives more output
 *
 * Output
 *   double        - elapsed time for product computation
 */
double multiply_by_blas(int argc, char* argv[], int verbosity)
{
    int rows, cols, mids;
    double **a, **b, **c;
    double t1, t2;
    double sec;
    double gflop_count;

    /*
     * process command line arguments
     */
    rows = atoi(argv[0]);
    mids = atoi(argv[1]);
    cols = atoi(argv[2]);
    gflop_count = 2.0 * rows * mids * cols / 1.0e9;

    if (verbosity > 0) {
        printf("BLAS: rows = %d, mids = %d, columns = %d\n", rows, mids, cols);
    }

    /*
     * allocate and initialize matrices
     */
    a = (double**) allocateMatrix(rows, mids);
    b = (double**) allocateMatrix(mids, cols);
    c = (double**) allocateMatrix(rows, cols);
    initialize_matrices(a, b, c, rows, cols, mids, verbosity);

    /*
     * compute product: There is an implicit matrix transpose when
     * passing from Fortran to C and vice versa.  To compute
     * C := alpha * A * B + beta * C we use dgemm() to compute
     * C' := alpha * B' * A' + beta * C'.  The first two arguments to
     * dgemm() are 'N', indicating we don't want a transpose in addition
     * to the implicit one.  The matrices A and B are passed in reverse
     * order so dgemm() receives (after the implicit transpose) B' and A'.
     * Arguments 3 and 4 are the dimensions of C' and argument 5 is the
     * column dimension of B' (and the row dimension of A').
     */
    t1 = wtime();
    dgemm('N', 'N', cols, rows, mids, 1.0, &b[0][0], cols,
          &a[0][0], mids, 0.0, &c[0][0], cols);
    t2 = wtime();
    sec = t2 - t1;

    if (verbosity > 1)
        printf("checksum = %f\n", checksum(c, rows, cols));

    printf("BLAS: %6.3f secs %6.3f gflops ( %5d x %5d x %5d )\n",
           sec, gflop_count / sec, rows, mids, cols);

    /*
     * clean up
     */
    deallocateMatrix(a);
    deallocateMatrix(b);
    deallocateMatrix(c);

    return t2 - t1;
}
int main(int argc, char ** argv) { int vector_length; /* length of vectors to be aggregated */ int total_length; /* bytes needed to store reduction vectors */ double reduce_time, /* timing parameters */ avgtime = 0.0, maxtime = 0.0, mintime = 366.0*24.0*3600.0; /* set the minimum time to a large value; one leap year should be enough */ double epsilon=1.e-8; /* error tolerance */ int i, iter; /* dummies */ double element_value; /* reference element value for final vector */ int iterations; /* number of times the reduction is carried out */ static double /* use static so it goes on the heap, not stack */ RESTRICT vector[MEMWORDS];/* we would like to allocate "vector" dynamically, but need to be able to flush the thing in some versions of the reduction algorithm -> static */ /***************************************************************************** ** process and test input parameters ******************************************************************************/ if (argc != 3){ printf("Usage: %s <# iterations> <vector length>\n", *argv); return(EXIT_FAILURE); } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: Iterations must be positive : %d \n", iterations); exit(EXIT_FAILURE); } vector_length = atoi(*++argv); if (vector_length < 1){ printf("ERROR: vector length must be >= 1 : %d \n",vector_length); exit(EXIT_FAILURE); } /* make sure we stay within the memory allocated for vector */ total_length = 2*vector_length; if (total_length/2 != vector_length || total_length > MEMWORDS) { printf("Vector length of %d too large; ", vector_length); printf("increase MEMWORDS in Makefile or reduce vector length\n"); exit(EXIT_FAILURE); } printf("Serial Vector Reduction\n"); printf("Vector length = %d\n", vector_length); printf("Number of iterations = %d\n", iterations); for (iter=0; iter<iterations; iter++) { /* initialize the arrays, assuming first-touch memory placement */ for (i=0; i<vector_length; i++) { VEC0(i) = (double)(1); VEC1(i) = (double)(2); } reduce_time = wtime(); /* do actual reduction */ /* first do the "local" part, which is the same for all algorithms */ for (i=0; i<vector_length; i++) { VEC0(i) += VEC1(i); } reduce_time = wtime() - reduce_time; #ifdef VERBOSE printf("\nFinished with reduction, using %lf seconds \n", reduce_time); #endif if (iter>0 || iterations==1) { /* skip the first iteration */ avgtime = avgtime + reduce_time; mintime = MIN(mintime, reduce_time); maxtime = MAX(maxtime, reduce_time); } } /* end of iter loop */ /* verify correctness */ element_value = (2.0+1.0); for (i=0; i<vector_length; i++) { if (ABS(VEC0(i) - element_value) >= epsilon) { printf("First error at i=%d; value: %lf; reference value: %lf\n", i, VEC0(i), element_value); exit(EXIT_FAILURE); } } printf("Solution validates\n"); #ifdef VERBOSE printf("Element verification value: %lf\n", element_value); #endif avgtime = avgtime/(double)(MAX(iterations-1,1)); printf("Rate (MFlops/s): %lf, Avg time (s): %lf, Min time (s): %lf", 1.0E-06 * (2.0-1.0)*vector_length/mintime, avgtime, mintime); printf(", Max time (s): %lf\n", maxtime); exit(EXIT_SUCCESS); }
if(me == 1){ local_buf[0:size] = target_buf[0:size]:[target]; xmp_sync_memory(NULL); #ifdef DEBUG if(local_buf[0] != '2' && local_buf[size-1] != '2') fprintf(stderr, "Error !\n"); local_buf[0] = '1'; local_buf[size-1] = '1'; #endif xmp_sync_all(NULL); } else{ xmp_sync_all(NULL); local_buf[0:size] = target_buf[0:size]:[target]; #ifdef DEBUG if(local_buf[0] != '1' && local_buf[size-1] != '1') fprintf(stderr, "Error !\n"); local_buf[0] = '2'; local_buf[size-1] = '2'; #endif } xmp_sync_all(NULL); } xmp_sync_all(NULL); t = wtime() - t; if(me == 1) print_results(size, t); } return 0; }
static double wtime() { static struct timeval tv0 = {.tv_sec = 0}; struct timeval tv; int cc; cc = gettimeofday(&tv, 0); assert(cc == 0); if (tv0.tv_sec == 0) { tv0 = tv; assert(tv0.tv_sec != 0); } double dt = ((double)(tv.tv_sec - tv0.tv_sec) + ((double)(tv.tv_usec - tv0.tv_usec) * 1e-6)); return dt; } /* Puts 200 key-value pairs to output KVO. It is a map-function. It runs only on rank0. Inputs (KV0 and KVS0) are dummy. */ static int addkeysfn(const struct kmr_kv_box kv0, const KMR_KVS *kvs0, KMR_KVS *kvo, void *p, const long ind) { assert(kvs0 == 0 && kv0.klen == 0 && kv0.vlen == 0 && kvo != 0); char k[80]; char v[80]; int cc; for (int i = 0; i < 200; i++) { snprintf(k, 80, "key%d", i); snprintf(v, 80, "value%d", i); struct kmr_kv_box kv = { .klen = (int)(strlen(k) + 1), .vlen = (int)(strlen(v) + 1), .k.p = k, .v.p = v }; cc = kmr_add_kv(kvo, kv); assert(cc == MPI_SUCCESS); } return MPI_SUCCESS; } static int replacevaluefn(const struct kmr_kv_box kv0, const KMR_KVS *kvs0, KMR_KVS *kvo, void *p, const long i) { assert(kvs0 != 0 && kvo != 0); int cc, x; char gomi; cc = sscanf((&((char *)kv0.k.p)[3]), "%d%c", &x, &gomi); assert(cc == 1); char v[80]; snprintf(v, 10, "newvalue%d", x); struct kmr_kv_box kv = {.klen = kv0.klen, .vlen = (int)(strlen(v) + 1), .k.p = kv0.k.p, .v.p = v }; cc = kmr_add_kv(kvo, kv); assert(cc == MPI_SUCCESS); return MPI_SUCCESS; } static int emptyreducefn(const struct kmr_kv_box kv[], const long n, const KMR_KVS *kvs, KMR_KVS *kvo, void *p) { return MPI_SUCCESS; } /* Do KMR operations many times. */ static void simple0(int nprocs, int rank) { int cc; KMR *mr = kmr_create_context(MPI_COMM_WORLD, MPI_INFO_NULL, 0); double t0, t1; t0 = wtime(); for (int i = 0; i < 10000; i++) { /* Check timeout. */ t1 = wtime(); KMR_KVS *to0 = kmr_create_kvs(mr, KMR_KV_INTEGER, KMR_KV_INTEGER); if (rank == 0) { struct kmr_kv_box kv = { .klen = (int)sizeof(long), .vlen = (int)sizeof(long), .k.i = 0, .v.i = ((t1 - t0) > 20.0) }; cc = kmr_add_kv(to0, kv); assert(cc == MPI_SUCCESS); } cc = kmr_add_kv_done(to0); assert(cc == MPI_SUCCESS); KMR_KVS *to1 = kmr_create_kvs(mr, KMR_KV_INTEGER, KMR_KV_INTEGER); cc = kmr_replicate(to0, to1, kmr_noopt); assert(cc == MPI_SUCCESS); struct kmr_kv_box tok = {.klen = (int)sizeof(long), .k.p = 0, .vlen = 0, .v.p = 0}; struct kmr_kv_box tov; cc = kmr_find_key(to1, tok, &tov); assert(cc == MPI_SUCCESS); cc = kmr_free_kvs(to1); assert(cc == MPI_SUCCESS); if (tov.v.i) { if (rank == 0) { printf("loops %d\n", i); } break; } /* Put some pairs. */ KMR_KVS *kvs0 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE); cc = kmr_map_on_rank_zero(kvs0, 0, kmr_noopt, addkeysfn); assert(cc == MPI_SUCCESS); /* Replicate pairs to all ranks. */ KMR_KVS *kvs1 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE); cc = kmr_replicate(kvs0, kvs1, kmr_noopt); assert(cc == MPI_SUCCESS); /* Map pairs. */ KMR_KVS *kvs2 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE); cc = kmr_map(kvs1, kvs2, 0, kmr_noopt, replacevaluefn); assert(cc == MPI_SUCCESS); /* Collect pairs by theirs keys. */ KMR_KVS *kvs3 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE); cc = kmr_shuffle(kvs2, kvs3, kmr_noopt); assert(cc == MPI_SUCCESS); /* Reduce collected pairs. 
*/ KMR_KVS *kvs4 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE); cc = kmr_reduce(kvs3, kvs4, 0, kmr_noopt, emptyreducefn); assert(cc == MPI_SUCCESS); cc = kmr_free_kvs(kvs4); assert(cc == MPI_SUCCESS); } cc = kmr_free_context(mr); assert(cc == MPI_SUCCESS); } int main(int argc, char *argv[]) { char cmd[256]; int pid = getpid(); int N = 8; int nprocs, rank, thlv; /*MPI_Init(&argc, &argv);*/ MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thlv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); kmr_init(); MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) {printf("Check leakage by observing heap size.\n");} if (rank == 0) {printf("Watch VSZ changes (loops %d times)...\n", N);} if (rank == 0) {printf("(Each loop will take approx. 20 sec).\n");} fflush(0); usleep(50 * 1000); MPI_Barrier(MPI_COMM_WORLD); for (int i = 0; i < N; i++) { simple0(nprocs, rank); MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { snprintf(cmd, sizeof(cmd), "ps l %d", pid); system(cmd); } fflush(0); } MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) printf("OK\n"); fflush(0); kmr_fin(); MPI_Finalize(); return 0; }
int main(int argc, char ** argv) { int my_ID; /* Thread ID */ long vector_length; /* length of vectors to be aggregated */ long total_length; /* bytes needed to store reduction vectors */ double reduce_time, /* timing parameters */ avgtime; double epsilon=1.e-8; /* error tolerance */ int group_size, /* size of aggregating half of thread pool */ old_size, /* group size in previous binary tree iteration */ i, id, iter, stage; /* dummies */ double element_value; /* reference element value for final vector */ char *algorithm; /* reduction algorithm selector */ int intalgorithm; /* integer encoding of algorithm selector */ int iterations; /* number of times the reduction is carried out */ int flag[MAX_THREADS*LINEWORDS]; /* used for pairwise synchronizations */ int start[MAX_THREADS], end[MAX_THREADS];/* segments of vectors for bucket algorithm */ long segment_size; int my_donor, my_segment; int nthread_input, /* thread parameters */ nthread; double RESTRICT *vector;/* vector pair to be reduced */ int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ /***************************************************************************** ** process and test input parameters ******************************************************************************/ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP Vector Reduction\n"); if (argc != 4 && argc != 5){ printf("Usage: %s <# threads> <# iterations> <vector length> ", *argv); printf("[<alghorithm>]\n"); printf("Algorithm: linear, binary-barrier, binary-p2p, or long-optimal\n"); return(EXIT_FAILURE); } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: Iterations must be positive : %d \n", iterations); exit(EXIT_FAILURE); } vector_length = atol(*++argv); if (vector_length < 1){ printf("ERROR: vector length must be >= 1 : %ld \n",vector_length); exit(EXIT_FAILURE); } total_length = vector_length*2*nthread_input*sizeof(double); vector = (double *) prk_malloc(total_length); if (!vector) { printf("ERROR: Could not allocate space for vectors: %ld\n", total_length); exit(EXIT_FAILURE); } algorithm = "binary-p2p"; if (argc == 5) algorithm = *++argv; intalgorithm = NONE; if (!strcmp(algorithm,"linear" )) intalgorithm = LINEAR; if (!strcmp(algorithm,"binary-barrier")) intalgorithm = BINARY_BARRIER; if (!strcmp(algorithm,"binary-p2p" )) intalgorithm = BINARY_P2P; if (!strcmp(algorithm,"long-optimal" )) intalgorithm = LONG_OPTIMAL; if (intalgorithm == NONE) { printf("Wrong algorithm: %s; choose linear, binary-barrier, ", algorithm); printf("binary-p2p, or long-optimal\n"); exit(EXIT_FAILURE); } else { if (nthread_input == 1) intalgorithm = LOCAL; } #pragma omp parallel private(i, old_size, group_size, my_ID, iter, start, end, \ segment_size, stage, id, my_donor, my_segment) { my_ID = omp_get_thread_num(); #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %d\n",nthread_input); printf("Vector length = %ld\n", vector_length); printf("Reduction algorithm = %s\n", algorithm); printf("Number of iterations = 
%d\n", iterations); } } bail_out(num_error); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { #pragma omp barrier #pragma omp master { reduce_time = wtime(); } } /* in case of the long-optimal algorithm we need a barrier before the reinitialization to make sure that we don't overwrite parts of the vector before other threads are done with those parts */ if (intalgorithm == LONG_OPTIMAL) { #pragma omp barrier } /* initialize the arrays, assuming first-touch memory placement */ for (i=0; i<vector_length; i++) { VEC0(my_ID,i) = (double)(my_ID+1); VEC1(my_ID,i) = (double)(my_ID+1+nthread); } if (intalgorithm == BINARY_P2P) { /* we need a barrier before setting all flags to zero, to avoid zeroing some that are still in use in a previous iteration */ #pragma omp barrier flag(my_ID) = 0; /* we also need a barrier after setting the flags, to make each is visible to all threads, and to synchronize before the timer starts */ #pragma omp barrier } /* do actual reduction */ /* first do the "local" part, which is the same for all algorithms */ for (i=0; i<vector_length; i++) { VEC0(my_ID,i) += VEC1(my_ID,i); } /* now do the "non-local" part */ switch (intalgorithm) { case LOCAL: break; case LINEAR: { #pragma omp barrier #pragma omp master { for (id=1; id<nthread; id++) { for (i=0; i<vector_length; i++) { VEC0(0,i) += VEC0(id,i); } } } } break; case BINARY_BARRIER: group_size = nthread; while (group_size >1) { /* barrier to make sure threads have completed their updates before the results are being read */ #pragma omp barrier old_size = group_size; group_size = (group_size+1)/2; /* Threads in "first half" of group aggregate data from threads in second half; must make sure the counterpart is within old group. If group size is odd, the last thread in the group does not have a counterpart. 
*/ if (my_ID < group_size && my_ID+group_size<old_size) { for (i=0; i<vector_length; i++) { VEC0(my_ID,i) += VEC0(my_ID+group_size,i); } } } break; case BINARY_P2P: group_size = nthread; while (group_size >1) { old_size = group_size; group_size = (group_size+1)/2; /* synchronize between each pair of threads that collaborate to aggregate a new subresult, to make sure the donor of the pair has updated its vector in the previous round before it is being read */ if (my_ID < group_size && my_ID+group_size<old_size) { while (flag(my_ID+group_size) == 0) { #pragma omp flush } /* make sure I read the latest version of vector from memory */ #pragma omp flush for (i=0; i<vector_length; i++) { VEC0(my_ID,i) += VEC0(my_ID+group_size,i); } } else { if (my_ID < old_size) { /* I am a producer of data in this iteration; make sure my updated version can be seen by all threads */ flag(my_ID) = 1; #pragma omp flush } } } break; case LONG_OPTIMAL: /* compute starts and ends of subvectors to be passed among threads */ segment_size = (vector_length+nthread-1)/nthread; for (id=0; id<nthread; id++) { start[id] = segment_size*id; end[id] = MIN(vector_length,segment_size*(id+1)); } /* first do the Bucket Reduce Scatter in nthread-1 stages */ my_donor = (my_ID-1+nthread)%nthread; for (stage=1; stage<nthread; stage++) { #pragma omp barrier my_segment = (my_ID-stage+nthread)%nthread; for (i=start[my_segment]; i<end[my_segment]; i++) { VEC0(my_ID,i) += VEC0(my_donor,i); } } /* next, each thread pushes its contribution into the master thread vector; no need to synchronize, because of the push model */ my_segment = (my_ID+1)%nthread; if (my_ID != 0) for (i=start[my_segment]; i<end[my_segment]; i++) { VEC0(0,i) = VEC0(my_ID,i); } break; } /* end of algorithm switch statement */ } /* end of iter loop */ #pragma omp barrier #pragma omp master { reduce_time = wtime() - reduce_time; } } /* end of OpenMP parallel region */ /* verify correctness */ element_value = (double)nthread*(2.0*(double)nthread+1.0); for (i=0; i<vector_length; i++) { if (ABS(VEC0(0,i) - element_value) >= epsilon) { printf("First error at i=%d; value: %lf; reference value: %lf\n", i, VEC0(0,i), element_value); exit(EXIT_FAILURE); } } printf("Solution validates\n"); #ifdef VERBOSE printf("Element verification value: %lf\n", element_value); #endif avgtime = reduce_time/iterations; printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * (2.0*nthread-1.0)*vector_length/avgtime, avgtime); exit(EXIT_SUCCESS); }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* MPI rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ long nsquare; /* total number of grid points */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ int nthread_input, /* thread parameters */ nthread; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /******************************************************************************* ** Initialize the MPI environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+OPENMP stencil execution on 2D grid\n"); #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#threads><#iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); nsquare = (long) n * (long) n; if (nsquare < Num_procs){ printf("ERROR: grid size %ld must be at least # ranks: %d\n", nsquare, Num_procs); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; 
goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; MPI_Bcast(&n, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD); omp_set_num_threads(nthread_input); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Number of threads = %d\n", omp_get_max_threads()); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width - 1; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); if (total_length_in/(height+2*RADIUS) != (width+2*RADIUS)*sizeof(DTYPE)) { printf("ERROR: Space for %d x %d input array cannot be represented\n", width+2*RADIUS, height+2*RADIUS); error = 1; } bail_out(error); total_length_out = width*height*sizeof(DTYPE); in = (DTYPE *) prk_malloc(total_length_in); out = (DTYPE *) prk_malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ #pragma omp parallel for private (i) for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) 
prk_malloc(4*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width; bottom_buf_out = top_buf_out + 2*RADIUS*width; bottom_buf_in = top_buf_out + 3*RADIUS*width; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height; left_buf_out = right_buf_out + 2*RADIUS*height; left_buf_in = right_buf_out + 3*RADIUS*height; for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ if (my_IDy < Num_procsy-1) { MPI_Irecv(top_buf_in, RADIUS*width, MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (kk=0,j=jend-RADIUS+1; j<=jend; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (my_IDy > 0) { MPI_Irecv(bottom_buf_in,RADIUS*width, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (my_IDy < Num_procsy-1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (kk=0,j=jend+1; j<=jend+RADIUS; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[kk++]; } } if (my_IDy > 0) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* need to fetch ghost point data from neighbors in x-direction */ if (my_IDx < Num_procsx-1) { MPI_Irecv(right_buf_in, RADIUS*height, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (kk=0,j=jstart; j<=jend; j++) for (i=iend-RADIUS+1; i<=iend; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (my_IDx > 0) { MPI_Irecv(left_buf_in, RADIUS*height, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (kk=0,j=jstart; j<=jend; j++) for (i=istart; i<=istart+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } if (my_IDx < Num_procsx-1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (kk=0,j=jstart; j<=jend; j++) for (i=iend+1; i<=iend+RADIUS; i++) { IN(i,j) = right_buf_in[kk++]; } } if (my_IDx > 0) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* Apply the stencil operator */ #pragma omp parallel for private (i, j, ii, jj) for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; 
ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } #pragma omp parallel for private (i) /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) IN(i,j)+= 1.0; } local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; #pragma omp parallel for reduction(+:local_norm) private (i) for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
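/*
 * Editorial sketch: every kernel in this collection times its main loop with
 * a wall-clock helper named wtime(), whose definition is not reproduced here.
 * The routine below is one minimal, plausible implementation (an assumption,
 * not the authors' source), returning seconds as a double so that expressions
 * such as stencil_time/iterations and 1.0E-06*flops/avgtime behave as the
 * print statements above expect.
 */
#include <stddef.h>
#include <sys/time.h>

double wtime(void)
{
  struct timeval tv;
  gettimeofday(&tv, NULL);                 /* wall-clock time since the epoch */
  return tv.tv_sec + 1.0e-6 * tv.tv_usec;  /* microseconds -> seconds         */
}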
/*--------------------------------------------------------------------------- * * Compute matrix product using tiling. The loop order used for the tile * products is specified in string variable "mode". * * Input * int argc - length of argv[] array * char* argv[] - pointer to command line parameter array * int verbosity - program verification: verbosity > 0 gives more output * char* order - string indicating loop order, e.g., "ijk" or "jki" * * Output * double - elapsed time for product computation */ double multiply_by_tiles( int argc, char* argv[], int verbosity, char* order ) { int rows, cols, mids; int rows_per_tile, cols_per_tile, mids_per_tile; int row_start, row_end; int col_start, col_end; int mid_start, mid_end; double **a, **b, **c; double t1, t2; double sec; double gflop_count; /* * process command line arguments */ rows = atoi( argv[0] ); mids = atoi( argv[1] ); cols = atoi( argv[2] ); rows_per_tile = atoi( argv[3] ); mids_per_tile = atoi( argv[4] ); cols_per_tile = atoi( argv[5] ); gflop_count = 2.0 * rows * mids * cols / 1.0e9; if ( verbosity > 0 ) { printf( "Tiles(%3s): rows = %d, mids = %d, columns = %d\n", order, rows, mids, cols ); printf( "block rows = %d, mids = %d, columns = %d\n", rows_per_tile, mids_per_tile, cols_per_tile ); } /* * allocate and initialize matrices */ a = (double**) allocateMatrix( rows, mids ); b = (double**) allocateMatrix( mids, cols ); c = (double**) allocateMatrix( rows, cols ); initialize_matrices( a, b, c, rows, cols, mids, verbosity ); /* * compute product */ t1 = wtime(); for ( row_start = 0; row_start < rows; row_start += rows_per_tile ) { row_end = row_start + rows_per_tile - 1; if ( row_end >= rows ) row_end = rows - 1; for ( col_start = 0; col_start < cols; col_start += cols_per_tile ) { col_end = col_start + cols_per_tile - 1; if ( col_end >= cols ) col_end = cols - 1; for ( mid_start = 0; mid_start < mids; mid_start += mids_per_tile ) { mid_end = mid_start + mids_per_tile - 1; if ( mid_end >= mids ) mid_end = mids - 1; do_product( a, b, c, row_start, row_end, col_start, col_end, mid_start, mid_end ); } } } t2 = wtime(); sec = t2 - t1; if ( verbosity > 1 ) printf( "checksum = %f\n", checksum( c, rows, cols ) ); printf( "tiles(%3s): %6.3f secs %6.3f gflops ", order, sec, gflop_count / sec ); printf( "( %5d x %5d x %5d ) ( %4d x %4d x %4d )\n", rows, mids, cols, rows_per_tile, mids_per_tile, cols_per_tile ); /* * clean up */ deallocateMatrix( a ); deallocateMatrix( b ); deallocateMatrix( c ); return t2 - t1; }
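/*
 * Editorial sketch: multiply_by_tiles() above delegates the actual tile
 * product to do_product(), which is not shown. The version below is a
 * straightforward guess, assuming the (row, col, mid) bounds are inclusive
 * as the calling loops suggest and that the matrices are dense row-pointer
 * arrays. The loop order here is fixed at i-k-j; the real routine presumably
 * selects the order named by the "order"/"mode" string.
 */
static void do_product(double **a, double **b, double **c,
                       int row_start, int row_end,
                       int col_start, int col_end,
                       int mid_start, int mid_end)
{
    /* c[i][j] += sum over k of a[i][k]*b[k][j], restricted to one tile */
    for (int i = row_start; i <= row_end; i++)
        for (int k = mid_start; k <= mid_end; k++) {
            double aik = a[i][k];            /* reuse a[i][k] across the j loop */
            for (int j = col_start; j <= col_end; j++)
                c[i][j] += aik * b[k][j];
        }
}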
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int Num_groupsx, Num_groupsy; /* number of blocks in each coord direction */ int my_group; /* sequence number of shared memory block */ int my_group_IDx, my_group_IDy; /* coordinates of block within block grid */ int group_size; /* number of ranks in shared memory group */ int group_sizex, group_sizey; /* number of ranks in block in each coord direction */ int my_ID; /* MPI rank */ int my_global_IDx, my_global_IDy; /* coordinates of rank in overall rank grid */ int my_local_IDx, my_local_IDy; /* coordinates of rank within shared memory block */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ int local_nbr[4]; /* list of synchronizing local neighbors */ int num_local_nbrs; /* number of synchronizing local neighbors */ int dummy; DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; long n, width, height;/* linear global and block grid dimension */ int width_rank, height_rank; /* linear local dimension */ int iter, leftover; /* dummies */ int istart_rank, iend_rank; /* bounds of grid tile assigned to calling rank */ int jstart_rank, jend_rank; /* bounds of grid tile assigned to calling rank */ int istart, iend; /* bounds of grid block containing tile */ int jstart, jend; /* bounds of grid block containing tile */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; /* value to be matched by computed norm */ DTYPE f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */ MPI_Win shm_win_in; /* shared memory window object for IN array */ MPI_Win shm_win_out; /* shared memory window object for OUT array */ MPI_Comm shm_comm_prep; /* preparatory shared memory communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of rankes in shared domain */ int shm_ID; /* MPI rank in shared memory domain */ MPI_Aint size_in; /* size of the IN array in shared memory window */ MPI_Aint size_out; /* size of the OUT array in shared memory window */ int size_mul; /* one for shm_comm root, zero for the other ranks */ int disp_unit; /* ignored */ /******************************************************************************* ** Initialize the MPI environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, 
&Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM stencil execution on 2D grid\n"); #if !STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#ranks per coherence domain><# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 0){ printf("ERROR: iterations must be >= 0 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atol(*++argv); long nsquare = n * n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); MPI_Bcast(&n, 1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now. 
The decomposition needs to be such that shared memory groups can evenly tessellate the rank grid */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) { if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) { group_sizey=group_size/group_sizex; break; } } if (!(Num_procsy%group_sizey)) break; } } if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %ld\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Tiles per shared memory domain = %d\n", group_size); printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex, group_sizey); printf("Type of stencil = star\n"); #if LOCAL_BARRIER_SYNCH printf("Local synchronization = barrier\n"); #else printf("Local synchronization = point to point\n"); #endif #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that an SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); Num_groupsx = Num_procsx/group_sizex; Num_groupsy = Num_procsy/group_sizey; my_group = my_ID/group_size; my_group_IDx = my_group%Num_groupsx; my_group_IDy = my_group/Num_groupsx; my_local_IDx = my_ID%group_sizex; my_local_IDy = (my_ID%group_size)/group_sizex; my_global_IDx = my_group_IDx*group_sizex+my_local_IDx; my_global_IDy = my_group_IDy*group_sizey+my_local_IDy; /* set all neighboring ranks to -1 (no communication with those ranks) */ left_nbr = right_nbr = top_nbr = bottom_nbr = -1; /* keep track of local neighbors for local synchronization */ num_local_nbrs = 0; if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) { right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1; } if (my_local_IDx != group_sizex-1) { local_nbr[num_local_nbrs++] = shm_ID + 1; } if (my_local_IDx == 0 && my_group_IDx != 0) { left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1; } if (my_local_IDx != 0) { local_nbr[num_local_nbrs++] = shm_ID - 1; } if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) { top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx; } if (my_local_IDy != group_sizey-1) { local_nbr[num_local_nbrs++] = shm_ID + group_sizex; } if (my_local_IDy == 0 && my_group_IDy != 0) { bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx; } if (my_local_IDy != 0) { local_nbr[num_local_nbrs++] = shm_ID - group_sizex; } /* compute amount of space required for input and solution arrays for the block, and also compute index sets */ width = n/Num_groupsx; leftover = n%Num_groupsx; if (my_group_IDx<leftover) { istart = (width+1) * my_group_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_group_IDx-leftover); iend = istart + width - 1; } 
width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_groupsy; leftover = n%Num_groupsy; if (my_group_IDy<leftover) { jstart = (height+1) * my_group_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_group_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n", my_ID, width, height); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); total_length_out = width*height*sizeof(DTYPE); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); size_in= total_length_in*size_mul; MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &in, &shm_win_in); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in); MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in); if (in == NULL){ printf("Error allocating space for input array by group %d\n",my_group); error = 1; } bail_out(error); size_out= total_length_out*size_mul; MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &out, &shm_win_out); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out); MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out); if (out == NULL){ printf("Error allocating space for output array by group %d\n", my_group); error = 1; } bail_out(error); /* determine index set assigned to each rank */ width_rank = width/group_sizex; leftover = width%group_sizex; if (my_local_IDx<leftover) { istart_rank = (width_rank+1) * my_local_IDx; iend_rank = istart_rank + width_rank; } else { istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover); iend_rank = istart_rank + width_rank - 1; } istart_rank += istart; iend_rank += istart; width_rank = iend_rank - istart_rank + 1; height_rank = height/group_sizey; leftover = height%group_sizey; if (my_local_IDy<leftover) { jstart_rank = (height_rank+1) * my_local_IDy; jend_rank = jstart_rank + height_rank; } else { jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover); jend_rank = jstart_rank + height_rank - 1; } jstart_rank+=jstart; jend_rank+=jstart; height_rank = jend_rank - jstart_rank + 1; if (height_rank*width_rank==0) { error = 1; printf("Rank %d has no work to do\n", my_ID); } bail_out(error); /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width_rank; bottom_buf_out = top_buf_out + 2*RADIUS*width_rank; bottom_buf_in = top_buf_out + 3*RADIUS*width_rank; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height_rank; left_buf_out = right_buf_out + 2*RADIUS*height_rank; left_buf_in = right_buf_out + 3*RADIUS*height_rank; /* fill the stencil weights to reflect a discrete divergence operator */ for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++) 
WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (int ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); MPI_Win_sync(shm_win_out); MPI_Barrier(shm_comm); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ if (top_nbr != -1) { MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (bottom_nbr != -1) { MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (top_nbr != -1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = top_buf_in[kk++]; } } if (bottom_nbr != -1) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* need to fetch ghost point data from neighbors in x-direction */ if (right_nbr != -1) { MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (left_nbr != -1) { MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } if (right_nbr != -1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) { IN(i,j) = right_buf_in[kk++]; } } if (left_nbr != -1) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* Apply the stencil operator */ for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int 
i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (int ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (int ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_out); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif /* add constant to solution to force refresh of neighbor data, if any */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0; /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif } /* end of iterations */ local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); MPI_Win_unlock_all(shm_win_in); MPI_Win_unlock_all(shm_win_out); MPI_Win_free(&shm_win_in); MPI_Win_free(&shm_win_out); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
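/*
 * Editorial sketch: the MPI+SHM stencil above lets only the root of each
 * shared-memory communicator request a window of nonzero size, and every
 * other rank then obtains a direct pointer into that allocation with
 * MPI_Win_shared_query (MPI_PROC_NULL asks for the segment of the lowest
 * rank that contributed a nonzero size). The standalone toy below shows the
 * same allocate/query/sync pattern in isolation; the sizes and printed
 * values are illustrative only.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  MPI_Comm shm_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shm_comm);
  int shm_id;
  MPI_Comm_rank(shm_comm, &shm_id);

  /* only the shared-memory root asks for space; the others pass size 0 */
  MPI_Aint size = (shm_id == 0) ? 1024 * sizeof(double) : 0;
  double *base;
  MPI_Win win;
  MPI_Win_allocate_shared(size, sizeof(double), MPI_INFO_NULL,
                          shm_comm, &base, &win);

  /* every rank maps the segment owned by the lowest contributing rank */
  int disp_unit;
  MPI_Win_shared_query(win, MPI_PROC_NULL, &size, &disp_unit, &base);

  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
  if (shm_id == 0) base[0] = 42.0;   /* store by one rank               */
  MPI_Win_sync(win);                 /* flush to the public window copy  */
  MPI_Barrier(shm_comm);
  MPI_Win_sync(win);                 /* pick up the store on the readers */
  printf("rank %d in its shared-memory domain sees %f\n", shm_id, base[0]);

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Comm_free(&shm_comm);
  MPI_Finalize();
  return 0;
}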
int main(int argc, char ** argv) { int args_used = 1; // keeps track of # consumed arguments uint64_t L; // dimension of grid in cells uint64_t iterations; // total number of simulation steps uint64_t n; // total number of particles in the simulation char *init_mode; // particle initialization mode (char) uint64_t particle_mode; // particle initialization mode (int) double rho; // attenuation factor for geometric particle distribution int64_t k, m; // determine initial horizontal and vertical velocity of // particles-- (2*k)+1 cells per time step double alpha, beta; // slope and offset values for linear particle distribution bbox_t grid_patch, // whole grid init_patch; // subset of grid used for localized initialization int correctness = 1; // determines whether simulation was correct double *Qgrid; // field of fixed charges particle_t *particles, *p; // the particles array uint64_t iter, i; // dummies double fx, fy, ax, ay; // forces and accelerations #if UNUSED int particles_per_cell;// number of particles per cell to be injected int error=0; // used for graceful exit after error #endif double avg_time, pic_time;// timing parameters int nthread_input, // thread parameters nthread; int num_error=0; // flag that signals that requested and obtained // numbers of threads are the same random_draw_t dice; printf("Parallel Research Kernels Version %s\n", PRKVERSION); printf("OpenMP Particle-in-Cell execution on 2D grid\n"); /******************************************************************************* ** process and test input parameters ********************************************************************************/ if (argc<7) { printf("Usage: %s <#threads> <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]); printf("<m (vertical particle velocity)>\n"); printf(" <init mode> <init parameters>]\n"); printf(" init mode \"GEOMETRIC\" parameters: <attenuation factor>\n"); printf(" \"SINUSOIDAL\" parameters: none\n"); printf(" \"LINEAR\" parameters: <negative slope> <constant offset>\n"); printf(" \"PATCH\" parameters: <xleft> <xright> <ybottom> <ytop>\n"); exit(SUCCESS); } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atol(*++argv); args_used++; if (iterations<1) { printf("ERROR: Number of time steps must be positive: %" PRIu64 "\n", iterations); exit(FAILURE); } L = atol(*++argv); args_used++; if (L<1 || L%2) { printf("ERROR: Number of grid cells must be positive and even: %" PRIu64 "\n", L); exit(FAILURE); } grid_patch = (bbox_t){0, L+1, 0, L+1}; n = atol(*++argv); args_used++; if (n<1) { printf("ERROR: Number of particles must be positive: %" PRIu64 "\n", n); exit(FAILURE); } particle_mode = UNDEFINED; k = atoi(*++argv); args_used++; if (k<0) { printf("ERROR: Particle semi-charge must be non-negative: %" PRIu64 "\n", k); exit(FAILURE); } m = atoi(*++argv); args_used++; init_mode = *++argv; args_used++; /* Initialize particles with geometric distribution */ if (strcmp(init_mode, "GEOMETRIC") == 0) { if (argc<args_used+1) { printf("ERROR: Not enough arguments for GEOMETRIC\n"); exit(FAILURE); } particle_mode = GEOMETRIC; rho = atof(*++argv); args_used++; } /* Initialize with a sinusoidal particle distribution (single period) */ if (strcmp(init_mode, "SINUSOIDAL") == 0) { particle_mode = SINUSOIDAL; } /* Initialize 
particles with linear distribution */ /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/ if (strcmp(init_mode, "LINEAR") == 0) { if (argc<args_used+2) { printf("ERROR: Not enough arguments for LINEAR initialization\n"); exit(EXIT_FAILURE); } particle_mode = LINEAR; alpha = atof(*++argv); args_used++; beta = atof(*++argv); args_used++; if (beta <0 || beta<alpha) { printf("ERROR: linear profile gives negative particle density\n"); exit(EXIT_FAILURE); } } /* Initialize particles uniformly within a "patch" */ if (strcmp(init_mode, "PATCH") == 0) { if (argc<args_used+4) { printf("ERROR: Not enough arguments for PATCH initialization\n"); exit(FAILURE); } particle_mode = PATCH; init_patch.left = atoi(*++argv); args_used++; init_patch.right = atoi(*++argv); args_used++; init_patch.bottom = atoi(*++argv); args_used++; init_patch.top = atoi(*++argv); args_used++; if (bad_patch(&init_patch, &grid_patch)) { printf("ERROR: inconsistent initial patch\n"); exit(FAILURE); } } #pragma omp parallel { #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %d\n",nthread_input); printf("Grid size = %lld\n", L); printf("Number of particles requested = %lld\n", n); printf("Number of time steps = %lld\n", iterations); printf("Initialization mode = %s\n", init_mode); switch(particle_mode) { case GEOMETRIC: printf(" Attenuation factor = %lf\n", rho); break; case SINUSOIDAL: break; case LINEAR: printf(" Negative slope = %lf\n", alpha); printf(" Offset = %lf\n", beta); break; case PATCH: printf(" Bounding box = %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", init_patch.left, init_patch.right, init_patch.bottom, init_patch.top); break; default: printf("ERROR: Unsupported particle initializating mode\n"); exit(FAILURE); } printf("Particle charge semi-increment = %"PRIu64"\n", k); printf("Vertical velocity = %"PRIu64"\n", m); /* Initialize grid of charges and particles */ Qgrid = initializeGrid(L); LCG_init(&dice); switch(particle_mode) { case GEOMETRIC: particles = initializeGeometric(n, L, rho, k, m, &n, &dice); break; case SINUSOIDAL: particles = initializeSinusoidal(n, L, k, m, &n, &dice); break; case LINEAR: particles = initializeLinear(n, L, alpha, beta, k, m, &n, &dice); break; case PATCH: particles = initializePatch(n, L, init_patch, k, m, &n, &dice); break; default: printf("ERROR: Unsupported particle distribution\n"); exit(FAILURE); } printf("Number of particles placed = %lld\n", n); } } bail_out(num_error); } for (iter=0; iter<=iterations; iter++) { /* start the timer after one warm-up time step */ if (iter==1) { pic_time = wtime(); } /* Calculate forces on particles and update positions */ #pragma omp parallel for private(i, p, fx, fy, ax, ay) for (i=0; i<n; i++) { p = particles; fx = 0.0; fy = 0.0; computeTotalForce(p[i], L, Qgrid, &fx, &fy); ax = fx * MASS_INV; ay = fy * MASS_INV; /* Update particle positions, taking into account periodic boundaries */ p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L); p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L); /* Update velocities */ p[i].v_x += ax * DT; p[i].v_y += ay * DT; } } pic_time = wtime() - pic_time; /* Run the verification test */ for (i=0; i<n; i++) { correctness *= verifyParticle(particles[i], iterations, Qgrid, L); } if (correctness) { printf("Solution validates\n"); #ifdef VERBOSE printf("Simulation time is 
%lf seconds\n", pic_time); #endif avg_time = n*iterations/pic_time; printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time); } else { printf("Solution does not validate\n"); } return(EXIT_SUCCESS); }
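/*
 * Editorial sketch: the position update in the PIC loop above combines a
 * constant-acceleration step with a periodic wrap; adding the grid size L
 * before taking fmod keeps the result non-negative when a particle drifts a
 * short distance in the negative direction. The struct and the DT/MASS_INV
 * constants below are simplified stand-ins for the ones defined elsewhere in
 * the original source.
 */
#include <math.h>
#include <stdio.h>

typedef struct { double x, y, v_x, v_y; } particle_s;

#define DT       1.0   /* illustrative values; the originals are defined elsewhere */
#define MASS_INV 1.0

static void move_particle(particle_s *p, double fx, double fy, double L)
{
  double ax = fx * MASS_INV;   /* acceleration from the force on the particle */
  double ay = fy * MASS_INV;

  /* kinematic update, wrapped into [0,L) on both axes */
  p->x = fmod(p->x + p->v_x * DT + 0.5 * ax * DT * DT + L, L);
  p->y = fmod(p->y + p->v_y * DT + 0.5 * ay * DT * DT + L, L);

  p->v_x += ax * DT;
  p->v_y += ay * DT;
}

int main(void)
{
  particle_s p = { 0.5, 9.5, -1.0, 1.0 };
  move_particle(&p, 0.0, 0.0, 10.0);     /* no force: pure drift with wrap   */
  printf("x=%.2f y=%.2f\n", p.x, p.y);   /* x wraps to 9.50, y wraps to 0.50 */
  return 0;
}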
int main(int argc, char ** argv) { int N; int tile_size=32; /* default tile size for tiling of local transpose */ int num_iterations;/* number of times to do the transpose */ int tiling; /* boolean: true if tiling is used */ double total_bytes; /* combined size of matrices */ double start_time, /* timing parameters */ end_time, avgtime; /********************************************************************* ** read and test input parameters *********************************************************************/ if(argc != 3 && argc != 4){ if(MYTHREAD == 0) printf("Usage: %s <# iterations> <matrix order> [tile size]\n", *argv); upc_global_exit(EXIT_FAILURE); } num_iterations = atoi(*++argv); if(num_iterations < 1){ if(MYTHREAD == 0) printf("ERROR: iterations must be >= 1 : %d \n", num_iterations); upc_global_exit(EXIT_FAILURE); } N = atoi(*++argv); if(N < 0){ if(MYTHREAD == 0) printf("ERROR: Matrix Order must be greater than 0 : %d \n", N); upc_global_exit(EXIT_FAILURE); } if (argc == 4) tile_size = atoi(*++argv); /*a non-positive tile size means no tiling of the local transpose */ tiling = (tile_size > 0) && (tile_size < N); if(!tiling) tile_size = N; int sizex = N / THREADS; if(N % THREADS != 0) { if(MYTHREAD == 0) printf("N %% THREADS != 0\n"); upc_global_exit(EXIT_FAILURE); } int sizey = N; if(MYTHREAD == 0) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC matrix transpose: B = A^T\n"); printf("Number of threads = %d\n", THREADS); printf("Matrix order = %d\n", N); printf("Number of iterations = %d\n", num_iterations); if (tiling) printf("Tile size = %d\n", tile_size); else printf("Untiled\n"); } /********************************************************************* ** Allocate memory for input and output matrices *********************************************************************/ total_bytes = 2.0 * sizeof(double) * N * N; int myoffsetx = MYTHREAD * sizex; int myoffsety = 0; upc_barrier; debug("Allocating arrays (%d, %d), offset (%d, %d)", sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs buf_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; out_arrays[MYTHREAD] = out_array; buf_arrays[MYTHREAD] = buf_array; double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); double **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety); double **buf_array_private = shared_2d_array_to_private(buf_array, sizex, sizey, myoffsetx, myoffsety); upc_barrier; /********************************************************************* ** Initialize the matrices *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ in_array_private[y][x] = (double) (x+N*y); out_array[y][x] = -1.0; } } upc_barrier; for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] !=(double) (x+N*y)) die("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], (x+N*y)); if(out_array_private[y][x] != -1.0) die("out_array_private error"); } } /********************************************************************* ** Transpose *********************************************************************/ int transfer_size = 
sizex * sizex * sizeof(double); if(MYTHREAD == 0) debug("transfer size = %d", transfer_size); for(int iter=0; iter<=num_iterations; iter++){ /* start timer after a warmup iteration */ if(iter == 1){ upc_barrier; start_time = wtime(); } for(int i=0; i<THREADS; i++){ int local_blk_id = (MYTHREAD + i) % THREADS; int remote_blk_id = MYTHREAD; int remote_thread = local_blk_id; upc_memget(&buf_array_private[local_blk_id * sizex][myoffsetx], &in_arrays[remote_thread][remote_blk_id * sizex][remote_thread * sizex], transfer_size); #define OUT_ARRAY(x,y) out_array_private[local_blk_id * sizex + x][myoffsetx + y] #define BUF_ARRAY(x,y) buf_array_private[local_blk_id * sizex + x][myoffsetx + y] if(!tiling){ for(int x=0; x<sizex; x++){ for(int y=0; y<sizex; y++){ OUT_ARRAY(x,y) = BUF_ARRAY(y,x); } } } else{ for(int x=0; x<sizex; x+=tile_size){ for(int y=0; y<sizex; y+=tile_size){ for(int bx=x; bx<MIN(sizex, x+tile_size); bx++){ for(int by=y; by<MIN(sizex, y+tile_size); by++){ OUT_ARRAY(bx,by) = BUF_ARRAY(by,bx); } } } } } } upc_barrier; } upc_barrier; end_time = wtime(); /********************************************************************* ** Analyze and output results. *********************************************************************/ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] != (double)(x+ N*y)) die("Error in input: x=%d y=%d", x, y); if(out_array_private[y][x] != (double)(y + N*x)) die("x=%d y=%d in_array=%f != %f %d %d", x, y, out_array[y][x], (double)(y + N*x), (int)(out_array[y][x]) % N, (int)(out_array[y][x]) / N); } } if(MYTHREAD == 0){ printf("Solution validates\n"); double transfer_size = 2 * N * N * sizeof(double); avgtime = (end_time - start_time) / num_iterations; double rate = transfer_size / avgtime * 1.0E-06; printf("Rate (MB/s): %lf Avg time (s): %lf\n",rate, avgtime); } }
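/*
 * Editorial sketch: when tiling is enabled, the transpose above walks the
 * local block in tile_size x tile_size sub-blocks so that both the read and
 * the write stream stay cache-resident. The serial routine below shows the
 * same loop structure on an n x n row-major array; it is a simplified
 * illustration, not the UPC kernel itself.
 */
#include <stdio.h>

#define MIN(a,b) ((a) < (b) ? (a) : (b))

static void transpose_tiled(double *out, const double *in, int n, int tile)
{
  for (int x = 0; x < n; x += tile)
    for (int y = 0; y < n; y += tile)
      /* transpose one tile; MIN() handles ragged tiles at the edges */
      for (int bx = x; bx < MIN(n, x + tile); bx++)
        for (int by = y; by < MIN(n, y + tile); by++)
          out[bx * n + by] = in[by * n + bx];
}

int main(void)
{
  enum { N = 6 };
  double a[N * N], b[N * N];
  for (int i = 0; i < N * N; i++) a[i] = (double) i;
  transpose_tiled(b, a, N, 4);
  printf("b[1][0]=%g (expect %g)\n", b[1 * N + 0], a[0 * N + 1]);
  return 0;
}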
/*********************************************************************** * Read the input file. ***********************************************************************/ int read_input ( FILE *fp_in, FILE *fp_out, input_data *input_vars, para_data *para_vars, time_data *time_vars ) { /*********************************************************************** * Local variables. ***********************************************************************/ double t1, t2; int ierr = 0; char *error = NULL; char *line = NULL; size_t len = 0; ssize_t read; char *tmpData = NULL; int tmpStrLen, i; /*********************************************************************** * Read the input file. Echo to output file. Call for an input variable * check. Only root reads, echoes, checks input. ***********************************************************************/ t1 = wtime (); if ( IPROC == ROOT ) { if ( !fp_in ) { tmpStrLen = strlen (" ***ERROR: READ_INPUT:" " Problem reading input file.\n"); ALLOC_STR ( error, tmpStrLen + 1, &ierr ); snprintf ( error, tmpStrLen + 1, " ***ERROR: READ_INPUT:" " Problem reading input file.\n" ); print_error ( fp_out, error, IPROC, ROOT ); FREE ( error ); ierr = 1; } else { while ( (read = getline(&line, &len, fp_in)) != -1 ) { i = 0; while ( isspace(line[i]) ) { i++; } // Parallel processing inputs // npey: number of process elements in y-dir if ( strncmp(&line[i], "npey=", strlen("npey=")) == 0 ) { get_input_value ( &line[i], "npey=", &tmpData ); NPEY = atoi ( tmpData ); } // npez: input number of process elements in z-dir else if ( strncmp(&line[i], "npez=", strlen("npez=")) == 0 ) { get_input_value ( &line[i], "npez=", &tmpData ); NPEZ = atoi ( tmpData ); } // ichunk: else if ( strncmp(&line[i], "ichunk=", strlen("ichunk=")) == 0 ) { get_input_value ( &line[i], "ichunk=", &tmpData ); ICHUNK = atoi ( tmpData ); } // nthreads: input number of threads else if ( strncmp(&line[i], "nthreads=", strlen("nthreads=")) == 0 ) { get_input_value ( &line[i], "nthreads=", &tmpData ); NTHREADS = atoi ( tmpData ); } // nnested: else if ( strncmp(&line[i], "nnested=", strlen("nnested=")) == 0 ) { get_input_value ( &line[i], "nnested=", &tmpData ); NNESTED = atoi ( tmpData ); } // Geometry inputs // ndimen: else if ( strncmp(&line[i], "ndimen=", strlen("ndimen=")) == 0 ) { get_input_value ( &line[i], "ndimen=", &tmpData ); NDIMEN = atoi ( tmpData ); } // nx: else if ( strncmp(&line[i], "nx=", strlen("nx=")) == 0 ) { get_input_value ( &line[i], "nx=", &tmpData ); NX = atoi ( tmpData ); } // ny: else if ( strncmp(&line[i], "ny=", strlen("ny=")) == 0 ) { get_input_value ( &line[i], "ny=", &tmpData ); NY = atoi ( tmpData ); } // nz: else if ( strncmp(&line[i], "nz=", strlen("nz=")) == 0 ) { get_input_value ( &line[i], "nz=", &tmpData ); NZ = atoi ( tmpData ); } // lx: else if ( strncmp(&line[i], "lx=", strlen("lx=")) == 0 ) { get_input_value ( &line[i], "lx=", &tmpData ); LX = atof ( tmpData ); } // ly: else if ( strncmp(&line[i], "ly=", strlen("ly=")) == 0 ) { get_input_value ( &line[i], "ly=", &tmpData ); LY = atof ( tmpData ); } // lz: else if ( strncmp(&line[i], "lz=", strlen("lz=")) == 0 ) { get_input_value ( &line[i], "lz=", &tmpData ); LZ = atof ( tmpData ); } // Sn inputs // nmom: else if ( strncmp(&line[i], "nmom=", strlen("nmom=")) == 0 ) { get_input_value ( &line[i], "nmom=", &tmpData ); NMOM = atoi ( tmpData ); } // nang: else if ( strncmp(&line[i], "nang=", strlen("nang=")) == 0 ) { get_input_value ( &line[i], "nang=", &tmpData ); NANG = atoi ( tmpData ); } // Data inputs // ng: else 
if ( strncmp(&line[i], "ng=", strlen("ng=")) == 0 ) { get_input_value ( &line[i], "ng=", &tmpData ); NG = atoi ( tmpData ); } // mat_opt: else if ( strncmp(&line[i], "mat_opt=", strlen("mat_opt=")) == 0 ) { get_input_value ( &line[i], "mat_opt=", &tmpData ); MAT_OPT = atoi ( tmpData ); } // src_opt: else if ( strncmp(&line[i], "src_opt=", strlen("src_opt=")) == 0 ) { get_input_value ( &line[i], "src_opt=", &tmpData ); SRC_OPT = atoi ( tmpData ); } // scatp: else if ( strncmp(&line[i], "scatp=", strlen("scatp=")) == 0 ) { get_input_value ( &line[i], "scatp=", &tmpData ); SCATP = atoi ( tmpData ); } // Control inputs // epsi: else if ( strncmp(&line[i], "epsi=", strlen("epsi=")) == 0 ) { get_input_value ( &line[i], "epsi=", &tmpData ); EPSI = atof ( tmpData ); } // tf: else if ( strncmp(&line[i], "tf=", strlen("tf=")) == 0 ) { get_input_value ( &line[i], "tf=", &tmpData ); TF = atof ( tmpData ); } // iitm: else if ( strncmp(&line[i], "iitm=", strlen("iitm=")) == 0 ) { get_input_value ( &line[i], "iitm=", &tmpData ); IITM = atoi ( tmpData ); } // oitm: else if ( strncmp(&line[i], "oitm=", strlen("oitm=")) == 0 ) { get_input_value ( &line[i], "oitm=", &tmpData ); OITM = atoi ( tmpData ); } // timedep: else if ( strncmp(&line[i], "timedep=", strlen("timedep=")) == 0 ) { get_input_value ( &line[i], "timedep=", &tmpData ); TIMEDEP = atoi ( tmpData ); } // nsteps: else if ( strncmp(&line[i], "nsteps=", strlen("nsteps=")) == 0 ) { get_input_value ( &line[i], "nsteps=", &tmpData ); NSTEPS = atoi ( tmpData ); } // it_det: else if ( strncmp(&line[i], "it_det=", strlen("it_det=")) == 0 ) { get_input_value ( &line[i], "it_det=", &tmpData ); IT_DET = atoi ( tmpData ); } // fluxp: else if ( strncmp(&line[i], "fluxp=", strlen("fluxp=")) == 0 ) { get_input_value ( &line[i], "fluxp=", &tmpData ); FLUXP = atoi ( tmpData ); } // fixup: else if ( strncmp(&line[i], "fixup=", strlen("fixup=")) == 0 ) { get_input_value ( &line[i], "fixup=", &tmpData ); FIXUP = atoi ( tmpData ); } } // Free temp data from memory FREE ( tmpData ); } } bcast_i_scalar ( &ierr, COMM_SNAP, ROOT, NPROC ); if ( ierr != 0 ) { return ierr; } if ( IPROC == ROOT ) { input_echo ( input_vars, fp_out ); ierr = input_check ( fp_out, input_vars, para_vars ); } /*********************************************************************** * Broadcast the data to all processes. ***********************************************************************/ ierr = var_bcast ( input_vars, para_vars ); t2 = wtime(); time_vars->tinp = t2 - t1; return ierr; }
int main(int argc, char *argv[]) { int my_ID, myrow, mycol, /* my index and row and column index */ root=0, /* ID of root rank */ Num_procs, /* number of ranks */ nprow, npcol, /* row, column dimensions of rank grid */ order, /* matrix order */ mynrows, myfrow, /* my number of rows and index of first row*/ mylrow, /* and last row */ myncols, myfcol, /* my number of cols and index of first row*/ mylcol, /* and last row */ *mm, /* arrays that hold m_i's and n_j's */ *nn, nb, /* block factor for SUMMA */ inner_block_flag, /* flag to select local DGEMM blocking */ error=0, /* error flag */ *ranks, /* work array for row and column ranks */ lda, ldb, ldc, /* leading array dimensions of a, b, and c */ i, j, ii, jj, /* dummy variables */ iter, iterations; double *a, *b, *c, /* arrays that hold local a, b, c */ *work1, *work2, /* work arrays to pass to dpmmmult */ local_dgemm_time, /* timing parameters */ dgemm_time, avgtime; double forder, nflops, /* float matrix order + total flops */ checksum, /* array checksum for verification test */ checksum_local=0.0, ref_checksum; /* reference checkcum for verification */ MPI_Group world_group, temp_group; MPI_Comm comm_row, /* communicators for row and column ranks */ comm_col; /* of rank grid */ /* initialize */ MPI_Init(&argc,&argv); MPI_Comm_rank( MPI_COMM_WORLD, &my_ID ); MPI_Comm_size( MPI_COMM_WORLD, &Num_procs ); /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ if (my_ID == root) { if (argc != 5) { printf("Usage: %s <# iterations> <matrix order> <outer block size> ", *argv); printf("<local block flag (non-zero=yes, zero=no)>\n"); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if(iterations < 1) { printf("ERROR: iterations must be positive: %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order too small: %d\n", order); error = 1; goto ENDOFTESTS; } nb = atoi(*++argv); /* a non-positive tile size means no outer level tiling */ inner_block_flag = atoi(*++argv); ENDOFTESTS: ; } bail_out(error); MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&nb, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&inner_block_flag, 1, MPI_INT, root, MPI_COMM_WORLD); /* compute rank grid to most closely match a square; to do so, compute largest divisor of Num_procs, using hare-brained method. The small term epsilon is used to guard against roundoff errors in case Num_procs is a perfect square */ nprow = (int) (sqrt((double) Num_procs + epsilon)); while (Num_procs%nprow) nprow--; npcol = Num_procs/nprow; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI Dense matrix-matrix multiplication: C = A x B\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank grid = %d rows x %d columns\n", nprow, npcol); printf("Matrix order = %d\n", order); printf("Outer block size = %d\n", nb); printf("Number of iterations = %d\n", iterations); if (inner_block_flag) printf("Using local dgemm blocking\n"); else printf("No local dgemm blocking\n"); } /* set up row and column communicators */ ranks = (int *) malloc (3*Num_procs*sizeof(int)); if (!ranks) { printf("ERROR: Proc %d could not allocate rank work arrays\n", my_ID); error = 1; } bail_out(error); mm = ranks + Num_procs; nn = mm + Num_procs; /* 1. 
extract group of ranks that make up WORLD */ MPI_Comm_group( MPI_COMM_WORLD, &world_group ); /* 2. create list of all ranks in same row of rank grid */ ranks[0] = my_ID/npcol * npcol; for (i=1; i<npcol; i++) ranks[i] = ranks[i-1] + 1; /* create row group and communicator */ MPI_Group_incl( world_group, npcol, ranks, &temp_group ); MPI_Comm_create( MPI_COMM_WORLD, temp_group, &comm_row ); /* 3. create list of all ranks in same column of rank grid */ ranks[0] = my_ID%npcol; for (i=1; i<nprow; i++) ranks[i] = ranks[i-1] + npcol; /* create column group and communicator */ MPI_Group_incl( world_group, nprow, ranks, &temp_group ); MPI_Comm_create( MPI_COMM_WORLD, temp_group, &comm_col ); /* extract this node's row and column index */ MPI_Comm_rank( comm_row, &mycol ); MPI_Comm_rank( comm_col, &myrow ); /* mynrows = number of rows assigned to me; distribute excess rows evenly if nprow does not divide matrix order evenly */ if (myrow < order%nprow) mynrows = (order/nprow)+1; else mynrows = (order/nprow); /* make sure lda is a multiple of the block size nb */ if (mynrows%nb==0 || mynrows<nb) lda = mynrows; else lda = (mynrows/nb+1)*nb; /* myncols = number of colums assigned to me; distribute excess columns evenly if npcol does not divide order evenly */ if (mycol < order%npcol) myncols = (order/npcol)+1; else myncols = (order/npcol); /* get space for local blocks of A, B, C */ a = (double *) malloc( lda*myncols*sizeof(double) ); b = (double *) malloc( lda*myncols*sizeof(double) ); c = (double *) malloc( lda*myncols*sizeof(double) ); if ( a == NULL || b == NULL || c == NULL ) { error = 1; printf("ERROR: Proc %d could not allocate a, b, and/or c\n",my_ID); } bail_out(error); /* get space for two work arrays for dgemm */ work1 = (double *) malloc( nb*lda*sizeof(double) ); work2 = (double *) malloc( nb*myncols*sizeof(double) ); if ( !work1 || !work2 ) { printf("ERROR: Proc %d could not allocate work buffers\n", my_ID); error = 1; } bail_out(error); /* collect array that holds mynrows from all nodes in my row of the rank grid (array of all m_i) */ MPI_Allgather( &mynrows, 1, MPI_INT, mm, 1, MPI_INT, comm_col ); /* myfrow = first row on my node */ for (myfrow=1,i=0; i<myrow; i++) myfrow += mm[i]; mylrow = myfrow+mynrows-1; /* collect array that holds myncols from all nodes in my column of the rank grid (array of all n_j) */ MPI_Allgather( &myncols, 1, MPI_INT, nn, 1, MPI_INT, comm_row ); /* myfcol = first col on my node */ for (myfcol=1,i=0; i<mycol; i++) myfcol += nn[i]; mylcol = myfcol+myncols-1; /* initialize matrices A, B, and C */ ldc = ldb = lda; for (jj=0, j=myfcol; j<=mylcol; j++,jj++ ) for (ii=0, i=myfrow; i<=mylrow; i++, ii++ ) { A(ii,jj) = (double) (j-1); B(ii,jj) = (double) (j-1); C(ii,jj) = 0.0; } for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_dgemm_time = wtime(); } /* actual matrix-vector multiply */ dgemm(order, nb, inner_block_flag, a, lda, b, lda, c, lda, mm, nn, comm_row, comm_col, work1, work2 ); } /* end of iterations */ local_dgemm_time = wtime() - local_dgemm_time; MPI_Reduce(&local_dgemm_time, &dgemm_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* verification test */ for (jj=0, j=myfcol; j<=mylcol; j++, jj++) for (ii=0, i=myfrow; i<=mylrow; i++, ii++) checksum_local += C(ii,jj); MPI_Reduce(&checksum_local, &checksum, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); forder = (double) order; ref_checksum = (0.25*forder*forder*forder*(forder-1.0)*(forder-1.0)); ref_checksum *= 
(iterations+1); if (my_ID == root) { if (ABS((checksum - ref_checksum)/ref_checksum) > epsilon) { printf("ERROR: Checksum = %lf, Reference checksum = %lf\n", checksum, ref_checksum); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference checksum = %lf, checksum = %lf\n", ref_checksum, checksum); #endif } } bail_out(error); /* report elapsed time */ nflops = 2.0*forder*forder*forder; if ( my_ID == root ) { avgtime = dgemm_time/iterations; printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * nflops/avgtime, avgtime); } MPI_Finalize(); }
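/*
 * Editorial sketch: the reference checksum used above follows from how A and
 * B are initialized (every entry in global column j equals j-1). Each call
 * to dgemm() therefore adds (j-1)*order*(order-1)/2 to every element of
 * column j of C, so summing C over all elements yields
 * order*(order*(order-1)/2)^2 = 0.25*order^3*(order-1)^2 per call, and
 * (iterations+1) calls scale that by (iterations+1). The toy program below
 * checks the closed form against a brute-force product for a small order.
 */
#include <stdio.h>

int main(void)
{
  const int order = 7, iterations = 3;
  double checksum = 0.0;

  /* brute force: C(i,j) = sum_k A(i,k)*B(k,j) with A(i,k)=k-1, B(k,j)=j-1 */
  for (int j = 1; j <= order; j++)
    for (int i = 1; i <= order; i++) {
      double cij = 0.0;
      for (int k = 1; k <= order; k++) cij += (double)(k - 1) * (double)(j - 1);
      checksum += cij;
    }
  checksum *= (iterations + 1);

  double forder = (double) order;
  double ref = 0.25 * forder * forder * forder
             * (forder - 1.0) * (forder - 1.0) * (iterations + 1);
  printf("brute force = %.1f, closed form = %.1f\n", checksum, ref);
  return 0;
}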
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* SHMEM rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in[2]; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in[2];/* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in[2]; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in[2]; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ int stencil_size; /* number of points in the stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double avgtime, /* timing parameters */ *local_stencil_time, *stencil_time; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int *arguments; /* command line parameters */ int count_case=4; /* number of neighbors of a rank */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk_time; /* work space for collectives */ DTYPE *pWrk_norm; /* work space for collectives */ int *iterflag; /* synchronization flags */ int sw; /* double buffering switch */ DTYPE *local_norm, *norm; /* local and global error norms */ /******************************************************************************* ** Initialize the SHMEM environment ********************************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); pSync_bcast = (long *) prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk_time = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double)); pWrk_norm = (DTYPE *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE)); local_stencil_time = (double *) prk_shmem_malloc(sizeof(double)); stencil_time = (double *) prk_shmem_malloc(sizeof(double)); local_norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); iterflag = (int *) prk_shmem_malloc(2*sizeof(int)); if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag && local_stencil_time && stencil_time && local_norm && norm)) { printf("Could not allocate scalar variables on rank %d\n", my_ID); error = 1; } bail_out(error); for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; arguments=(int*)prk_shmem_malloc(2*sizeof(int)); 
/******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 3){ printf("Usage: %s <# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); arguments[1]=n; long nsquare = (long)n * (long)n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; iterflag[0] = iterflag[1] = 0; if(my_IDx==0) count_case--; if(my_IDx==Num_procsx-1) count_case--; if(my_IDy==0) count_case--; if(my_IDy==Num_procsy-1) count_case--; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM stencil execution on 2D grid\n"); printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #ifdef DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif #if SPLITFENCE printf("Split fence = ON\n"); #else printf("Split fence = OFF\n"); #endif printf("Number of iterations = %d\n", iterations); } shmem_barrier_all(); shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; n=arguments[1]; shmem_barrier_all(); prk_shmem_free(arguments); /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller 
then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS); total_length_in *= (height+2*RADIUS); total_length_in *= sizeof(DTYPE); total_length_out = width; total_length_out *= height; total_length_out *= sizeof(DTYPE); in = (DTYPE *) malloc(total_length_in); out = (DTYPE *) malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm[0] = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); bottom_buf_out = top_buf_out+RADIUS*width; top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width); if(!top_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID); error=1; } bail_out(error); top_buf_in[1] = top_buf_in[0] + RADIUS*width; bottom_buf_in[0] = top_buf_in[1] + RADIUS*width; bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width; right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); left_buf_out=right_buf_out+RADIUS*height; right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height); if(!right_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID); error=1; } bail_out(error); right_buf_in[1] = right_buf_in[0] + RADIUS*height; left_buf_in[0] = right_buf_in[1] + RADIUS*height; left_buf_in[1] = left_buf_in[0] + RADIUS*height; /* make sure all symmetric heaps are allocated before being used */ shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_stencil_time[0] = wtime(); } /* sw determines which incoming buffer to select */ sw = iter%2; /* need to fetch ghost point data from neighbors */ if (my_IDy < Num_procsy-1) { for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], top_nbr); #endif } if (my_IDy > 0) { for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], bottom_nbr); #endif } if(my_IDx < Num_procsx-1) { for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) { right_buf_out[kk++]=IN(i,j); } shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], right_nbr); 
#endif } if(my_IDx>0) { for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) { left_buf_out[kk++]=IN(i,j); } shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], left_nbr); #endif } #if SPLITFENCE == 0 shmem_fence(); if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr); if(my_IDy>0) shmem_int_inc(&iterflag[sw], bottom_nbr); if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr); if(my_IDx>0) shmem_int_inc(&iterflag[sw], left_nbr); #endif shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1)); if (my_IDy < Num_procsy-1) { for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[sw][kk++]; } } if (my_IDy > 0) { for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[sw][kk++]; } } if (my_IDx < Num_procsx-1) { for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) { IN(i,j) = right_buf_in[sw][kk++]; } } if (my_IDx > 0) { for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[sw][kk++]; } } /* Apply the stencil operator */ for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0; } local_stencil_time[0] = wtime() - local_stencil_time[0]; shmem_barrier_all(); shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0, Num_procs, pWrk_time, pSync_reduce); /* compute L1 norm in parallel */ local_norm[0] = (DTYPE) 0.0; for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) { for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) { local_norm[0] += (DTYPE)ABS(OUT(i,j)); } } shmem_barrier_all(); #ifdef DOUBLE shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #else shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #endif /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm[0] /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm[0]-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm[0], reference_norm); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm[0]); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time[0]/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } prk_shmem_free(top_buf_in); prk_shmem_free(right_buf_in); free(top_buf_out); free(right_buf_out); prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk_time); prk_shmem_free(pWrk_norm); prk_shmem_finalize(); exit(EXIT_SUCCESS); }
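The setup above packs two small ideas worth seeing in isolation: factoring the rank count into the most nearly square process grid, and handing out n grid columns so that the first n%Num_procsx ranks own one extra column. The standalone sketch below reproduces both; it is illustrative only, and the inclusive index ranges it prints use a simplified convention that does not match the kernel's istart/iend/width bookkeeping exactly. Compile with -lm for sqrt.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(int argc, char **argv) {
  int Num_procs = (argc > 1) ? atoi(argv[1]) : 6;   /* example rank count */
  int n         = (argc > 2) ? atoi(argv[2]) : 10;  /* example grid size  */
  int Num_procsx, Num_procsy = Num_procs;

  /* closest-to-square factorization, brute force, as in the kernel */
  for (Num_procsx = (int) sqrt(Num_procs + 1); Num_procsx > 0; Num_procsx--) {
    if (!(Num_procs % Num_procsx)) {
      Num_procsy = Num_procs / Num_procsx;
      break;
    }
  }
  printf("%d ranks -> %d x %d rank grid\n", Num_procs, Num_procsx, Num_procsy);

  /* block decomposition of n grid columns over Num_procsx rank columns:
     the first "leftover" ranks own width+1 columns, the rest own width */
  int width    = n / Num_procsx;
  int leftover = n % Num_procsx;
  for (int my_IDx = 0; my_IDx < Num_procsx; my_IDx++) {
    int istart, iend;                    /* inclusive range for this rank */
    if (my_IDx < leftover) {
      istart = (width + 1) * my_IDx;
      iend   = istart + width;
    } else {
      istart = (width + 1) * leftover + width * (my_IDx - leftover);
      iend   = istart + width - 1;
    }
    printf("rank column %d owns i = [%d, %d]\n", my_IDx, istart, iend);
  }
  return EXIT_SUCCESS;
}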
int main(int argc, char **argv){ int iter, r; /* dummies */ int lsize; /* logarithmic linear size of grid */ int lsize2; /* logarithmic size of grid */ int size; /* linear size of grid */ s64Int size2; /* matrix order (=total # points in grid) */ int radius, /* stencil parameters */ stencil_size; s64Int row, col, first, last; /* dummies */ s64Int i, j; /* dummies */ int iterations; /* number of times the multiplication is done */ s64Int elm; /* sequence number of matrix nonzero */ s64Int nent; /* number of nonzero entries */ double sparsity; /* fraction of non-zeroes in matrix */ double sparse_time,/* timing parameters */ avgtime = 0.0, maxtime = 0.0, mintime = 366.0*24.0*3600.0; /* set the minimum time to a large value; one leap year should be enough */ double * RESTRICT matrix; /* sparse matrix entries */ double * RESTRICT vector; /* vector multiplying the sparse matrix */ double * RESTRICT result; /* computed matrix-vector product */ double temp; /* temporary scalar storing reduction data */ double vector_sum; /* checksum of result */ double reference_sum; /* checksum of "rhs" */ double epsilon = 1.e-8; /* error tolerance */ s64Int * RESTRICT colIndex; /* column indices of sparse matrix entries */ int nthread_input, /* thread parameters */ nthread; int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ size_t vector_space, /* variables used to hold malloc sizes */ matrix_space, index_space; if (argc != 5) { printf("Usage: %s <# threads> <# iterations> <2log grid size> <stencil radius>\n",*argv); exit(EXIT_FAILURE); } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: Iterations must be positive : %d \n", iterations); exit(EXIT_FAILURE); } lsize = atoi(*++argv); if (lsize <0) { printf("ERROR: Log of grid size must be greater than or equal to zero: %d\n", (int) lsize); exit(EXIT_FAILURE); } lsize2 = 2*lsize; size = 1<<lsize; /* compute number of points in the grid */ size2 = size*size; radius = atoi(*++argv); if (radius <0) { printf("ERROR: Stencil radius must be non-negative: %d\n", radius); exit(EXIT_FAILURE); } /* emit error if (periodic) stencil overlaps with itself */ if (size <2*radius+1) { printf("ERROR: Grid extent %d smaller than stencil diameter 2*%d+1= %d\n", size, radius, radius*2+1); exit(EXIT_FAILURE); } /* compute total size of star stencil in 2D */ stencil_size = 4*radius+1; /* sparsity follows from number of non-zeroes per row */ sparsity = (double)(4*radius+1)/(double)size2; /* compute total number of non-zeroes */ nent = size2*stencil_size; matrix_space = nent*sizeof(double); if (matrix_space/sizeof(double) != nent) { printf("ERROR: Cannot represent space for matrix: %ld\n", matrix_space); exit(EXIT_FAILURE); } matrix = (double *) malloc(matrix_space); if (!matrix) { printf("ERROR: Could not allocate space for sparse matrix: "FSTR64U"\n", nent); exit(EXIT_FAILURE); } vector_space = 2*size2*sizeof(double); if (vector_space/sizeof(double) != 2*size2) { printf("ERROR: Cannot represent space for vectors: %ld\n", vector_space); exit(EXIT_FAILURE); } vector = (double *) malloc(vector_space); if (!vector) { printf("ERROR: Could not allocate space for vectors: %d\n", (int)(2*size2)); exit(EXIT_FAILURE); } result = vector + size2; index_space = 
nent*sizeof(s64Int); if (index_space/sizeof(s64Int) != nent) { printf("ERROR: Cannot represent space for column indices: %ld\n", index_space); exit(EXIT_FAILURE); } colIndex = (s64Int *) malloc(index_space); if (!colIndex) { printf("ERROR: Could not allocate space for column indices: "FSTR64U"\n", nent*sizeof(s64Int)); exit(EXIT_FAILURE); } #pragma omp parallel private (row, col, elm, first, last, iter, temp) { #pragma omp master { nthread = omp_get_num_threads(); printf("OpenMP Sparse matrix-vector multiplication\n"); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %16d\n",nthread_input); printf("Matrix order = "FSTR64U"\n", size2); printf("Stencil diameter = %16d\n", 2*radius+1); printf("Sparsity = %16.10lf\n", sparsity); #ifdef SCRAMBLE printf("Using scrambled indexing\n"); #else printf("Using canonical indexing\n"); #endif printf("Number of iterations = %16d\n", iterations); } } bail_out(num_error); /* initialize the input and result vectors */ #pragma omp for for (row=0; row<size2; row++) result[row] = vector[row] = 0.0; /* fill matrix with nonzeroes corresponding to difference stencil. We use the scrambling for reordering the points in the grid. */ #pragma omp for private (i,j,r) for (row=0; row<size2; row++) { j = row/size; i=row%size; elm = row*stencil_size; colIndex[elm] = REVERSE(LIN(i,j),lsize2); for (r=1; r<=radius; r++, elm+=4) { colIndex[elm+1] = REVERSE(LIN((i+r)%size,j),lsize2); colIndex[elm+2] = REVERSE(LIN((i-r+size)%size,j),lsize2); colIndex[elm+3] = REVERSE(LIN(i,(j+r)%size),lsize2); colIndex[elm+4] = REVERSE(LIN(i,(j-r+size)%size),lsize2); } // sort colIndex to make sure the compressed row accesses // vector elements in increasing order qsort(&(colIndex[row*stencil_size]), stencil_size, sizeof(s64Int), compare); for (elm=row*stencil_size; elm<(row+1)*stencil_size; elm++) matrix[elm] = 1.0/(double)(colIndex[elm]+1); } for (iter=0; iter<iterations; iter++) { #pragma omp barrier #pragma omp master { sparse_time = wtime(); } /* fill vector */ #pragma omp for for (row=0; row<size2; row++) vector[row] += (double) (row+1); /* do the actual matrix-vector multiplication */ #pragma omp for for (row=0; row<size2; row++) { temp = 0.0; first = stencil_size*row; last = first+stencil_size-1; #pragma simd reduction(+:temp) for (col=first; col<=last; col++) { temp += matrix[col]*vector[colIndex[col]]; } result[row] += temp; } #pragma omp master { sparse_time = wtime() - sparse_time; if (iter>0 || iterations==1) { /* skip the first iteration */ avgtime = avgtime + sparse_time; mintime = MIN(mintime, sparse_time); maxtime = MAX(maxtime, sparse_time); } } } } /* end of parallel region */ /* verification test */ reference_sum = 0.5 * (double) nent * (double) iterations * (double) (iterations +1); vector_sum = 0.0; for (row=0; row<size2; row++) vector_sum += result[row]; if (ABS(vector_sum-reference_sum) > epsilon) { printf("ERROR: Vector sum = %lf, Reference vector sum = %lf\n", vector_sum, reference_sum); exit(EXIT_FAILURE); } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference sum = %lf, vector sum = %lf\n", reference_sum, vector_sum); #endif } avgtime = avgtime/(double)(MAX(iterations-1,1)); printf("Rate (MFlops/s): %lf, Avg time (s): %lf, Min time (s): %lf", 1.0E-06 * (2.0*nent)/mintime, avgtime, mintime); printf(", Max time (s): %lf\n", maxtime); exit(EXIT_SUCCESS); }
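For reference, a serial sketch of the same fixed-bandwidth sparse format and row-wise product follows. Each row stores exactly stencil_size nonzeros back to back, the column numbers come from a bit-reversed linearization of the periodic grid, and each row's partial sum accumulates into a scalar before being stored (the quantity the OpenMP version must keep thread-private). The reverse_bits helper and the j*size+i linearization are simplified stand-ins for the kernel's REVERSE and LIN macros, not the originals, and the sizes are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* reverse the lowest nbits bits of x: the "scrambled" column ordering */
static uint64_t reverse_bits(uint64_t x, int nbits) {
  uint64_t r = 0;
  for (int b = 0; b < nbits; b++) { r = (r << 1) | (x & 1u); x >>= 1; }
  return r;
}

int main(void) {
  const int     lsize  = 3, size = 1 << lsize;   /* 8 x 8 periodic grid  */
  const int     radius = 1, stencil_size = 4*radius + 1;
  const int64_t size2  = (int64_t)size * size;   /* matrix order         */

  double  *matrix   = malloc((size_t)(size2 * stencil_size) * sizeof(double));
  int64_t *colIndex = malloc((size_t)(size2 * stencil_size) * sizeof(int64_t));
  double  *vector   = malloc((size_t)size2 * sizeof(double));
  double  *result   = calloc((size_t)size2, sizeof(double));
  if (!matrix || !colIndex || !vector || !result) return EXIT_FAILURE;

  /* star stencil (written out for radius = 1): the point itself plus its
     +/-1 neighbors in x and y, with periodic wraparound; the kernel also
     sorts each row's indices, skipped here since it does not change the
     product */
  for (int64_t row = 0; row < size2; row++) {
    int i = (int)(row % size), j = (int)(row / size);
    int64_t elm = row * stencil_size;
    colIndex[elm+0] = (int64_t)reverse_bits((uint64_t)(j*size + i), 2*lsize);
    colIndex[elm+1] = (int64_t)reverse_bits((uint64_t)(j*size + (i+1)%size), 2*lsize);
    colIndex[elm+2] = (int64_t)reverse_bits((uint64_t)(j*size + (i-1+size)%size), 2*lsize);
    colIndex[elm+3] = (int64_t)reverse_bits((uint64_t)(((j+1)%size)*size + i), 2*lsize);
    colIndex[elm+4] = (int64_t)reverse_bits((uint64_t)(((j-1+size)%size)*size + i), 2*lsize);
    for (int64_t e = elm; e < elm + stencil_size; e++)
      matrix[e] = 1.0 / (double)(colIndex[e] + 1);
  }

  for (int64_t row = 0; row < size2; row++) vector[row] = (double)(row + 1);

  /* row-wise product: accumulate into a scalar, then store; this scalar is
     exactly what the parallel kernel must keep private per thread */
  for (int64_t row = 0; row < size2; row++) {
    double temp = 0.0;
    for (int64_t col = row*stencil_size; col < (row+1)*stencil_size; col++)
      temp += matrix[col] * vector[colIndex[col]];
    result[row] = temp;
  }
  printf("result[0] = %f\n", result[0]);
  free(matrix); free(colIndex); free(vector); free(result);
  return EXIT_SUCCESS;
}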
int main(int argc, char ** argv) { long order; /* order of the matrix */ int Tile_order=32; /* default tile size for tiling of local transpose */ int iterations; /* number of times to do the transpose */ int tiling; /* boolean: true if tiling is used */ int i, j, it, jt, iter; /* dummies */ double bytes; /* combined size of matrices */ double * RESTRICT A; /* buffer to hold original matrix */ double * RESTRICT B; /* buffer to hold transposed matrix */ double abserr; /* absolute error */ double epsilon=1.e-8; /* error tolerance */ double transpose_time,/* timing parameters */ avgtime; int nthread_input, nthread; int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ /********************************************************************* ** read and test input parameters *********************************************************************/ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP Matrix transpose: B = A^T\n"); if (argc != 4 && argc != 5){ printf("Usage: %s <# threads> <# iterations> <matrix order> [tile size]\n", *argv); exit(EXIT_FAILURE); } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); exit(EXIT_FAILURE); } order = atol(*++argv); if (order <= 0){ printf("ERROR: Matrix Order must be greater than 0 : %ld \n", order); exit(EXIT_FAILURE); } if (argc == 5) Tile_order = atoi(*++argv); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); if (!tiling) Tile_order = order; /********************************************************************* ** Allocate space for the input and transpose matrix *********************************************************************/ A = (double *)malloc(order*order*sizeof(double)); if (A == NULL){ printf(" ERROR: cannot allocate space for input matrix: %ld\n", order*order*sizeof(double)); exit(EXIT_FAILURE); } B = (double *)malloc(order*order*sizeof(double)); if (B == NULL){ printf(" ERROR: cannot allocate space for output matrix: %ld\n", order*order*sizeof(double)); exit(EXIT_FAILURE); } bytes = 2.0 * sizeof(double) * order * order; #pragma omp parallel private (iter) { #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %i\n",nthread_input); printf("Matrix order = %ld\n", order); printf("Number of iterations = %d\n", iterations); if (tiling) { printf("Tile size = %d\n", Tile_order); #ifdef COLLAPSE printf("Using loop collapse\n"); #endif } else printf("Untiled\n"); } } bail_out(num_error); /* Fill the original matrix, set transpose to known garbage value. 
*/ if (tiling) { #ifdef COLLAPSE #pragma omp for private (i,it,jt) collapse(2) #else #pragma omp for private (i,it,jt) #endif for (j=0; j<order; j+=Tile_order) for (i=0; i<order; i+=Tile_order) for (jt=j; jt<MIN(order,j+Tile_order);jt++) for (it=i; it<MIN(order,i+Tile_order); it++){ A(it,jt) = (double) (order*jt + it); B(it,jt) = 0.0; } } else { #pragma omp for private (i) for (j=0;j<order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*j + i); B(i,j) = 0.0; } } for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { #pragma omp barrier #pragma omp master { transpose_time = wtime(); } } /* Transpose the matrix */ if (!tiling) { #pragma omp for private (j) for (i=0;i<order; i++) for (j=0;j<order;j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } else { #ifdef COLLAPSE #pragma omp for private (j,it,jt) collapse(2) #else #pragma omp for private (j,it,jt) #endif for (i=0; i<order; i+=Tile_order) for (j=0; j<order; j+=Tile_order) for (it=i; it<MIN(order,i+Tile_order); it++) for (jt=j; jt<MIN(order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } } /* end of iter loop */ #pragma omp barrier #pragma omp master { transpose_time = wtime() - transpose_time; } } /* end of OpenMP parallel region */ abserr = test_results (order, B, iterations); /********************************************************************* ** Analyze and output results. *********************************************************************/ if (abserr < epsilon) { printf("Solution validates\n"); avgtime = transpose_time/iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.0E-06 * bytes/avgtime, avgtime); #ifdef VERBOSE printf("Squared errors: %f \n", abserr); #endif exit(EXIT_SUCCESS); } else { printf("ERROR: Aggregate squared error %lf exceeds threshold %e\n", abserr, epsilon); exit(EXIT_FAILURE); } } /* end of main */
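The core of the kernel is the tiled loop nest; a minimal serial version is sketched below, assuming the same column-major A(i,j)/B(i,j) macro layout. Walking A in Tile_order x Tile_order blocks keeps the strided accesses within a cache-sized working set, which is the point of tiling here. Sizes and names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

#define MIN(a,b) ((a) < (b) ? (a) : (b))

int main(void) {
  const long order = 1000;       /* example matrix order */
  const int  Tile_order = 32;    /* example tile size    */
  double *A = malloc((size_t)(order*order) * sizeof(double));
  double *B = malloc((size_t)(order*order) * sizeof(double));
  if (!A || !B) return EXIT_FAILURE;

  /* column-major indexing macros, as in the kernel */
  #define A(i,j) A[(i) + order*(j)]
  #define B(i,j) B[(i) + order*(j)]

  for (long j = 0; j < order; j++)
    for (long i = 0; i < order; i++)
      A(i,j) = (double)(order*j + i);

  /* tiled transpose: the two outer loops walk tiles, the two inner loops
     walk the elements inside one tile */
  for (long i = 0; i < order; i += Tile_order)
    for (long j = 0; j < order; j += Tile_order)
      for (long it = i; it < MIN(order, i + Tile_order); it++)
        for (long jt = j; jt < MIN(order, j + Tile_order); jt++)
          B(jt,it) = A(it,jt);

  /* spot check: B(2,5) should equal A(5,2) = order*2 + 5 */
  printf("B(2,5) = %.0f (expected %ld)\n", B(2,5), order*2 + 5);
  free(A); free(B);
  return EXIT_SUCCESS;
}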
int main(int argc, char ** argv) { int my_ID; /* Thread ID */ int vector_length; /* length of vector loop containing the branch */ int nfunc; /* number of functions used in INS_HEAVY option */ int rank; /* matrix rank used in INS_HEAVY option */ double branch_time, /* timing parameters */ no_branch_time; double ops; /* double precision representation of integer ops */ int iterations; /* number of times the branch loop is carried out */ int i, iter, aux; /* dummies */ char *branch_type; /* string defining branching type */ int btype; /* integer encoding branching type */ int total=0, total_ref; /* computed and stored verification values */ int nthread_input; /* thread parameters */ int nthread; int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ /********************************************************************************** ** process and test input parameters **********************************************************************************/ if (argc != 5){ printf("Usage: %s <# threads> <# iterations> <vector length>", *argv); printf("<branching type>\n"); printf("branching type: vector_go, vector_stop, no_vector, ins_heavy\n"); exit(EXIT_FAILURE); } nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1 || iterations%2==1){ printf("ERROR: Iterations must be positive and even : %d \n", iterations); exit(EXIT_FAILURE); } vector_length = atoi(*++argv); if (vector_length < 1){ printf("ERROR: loop length must be >= 1 : %d \n",vector_length); exit(EXIT_FAILURE); } branch_type = *++argv; if (!strcmp(branch_type,"vector_stop")) btype = VECTOR_STOP; else if (!strcmp(branch_type,"vector_go" )) btype = VECTOR_GO; else if (!strcmp(branch_type,"no_vector" )) btype = NO_VECTOR; else if (!strcmp(branch_type,"ins_heavy" )) btype = INS_HEAVY; else { printf("Wrong branch type: %s; choose vector_stop, vector_go, ", branch_type); printf("no_vector, or ins_heavy\n"); exit(EXIT_FAILURE); } #pragma omp parallel private(i, my_ID, iter, aux, nfunc, rank) reduction(+:total) { int * RESTRICT vector; int * RESTRICT index; int factor = -1; #pragma omp master { nthread = omp_get_num_threads(); printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP Branching Bonanza\n"); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %d\n", nthread_input); printf("Vector length = %d\n", vector_length); printf("Number of iterations = %d\n", iterations); printf("Branching type = %s\n", branch_type); } } bail_out(num_error); my_ID = omp_get_thread_num(); vector = malloc(vector_length*2*sizeof(int)); if (!vector) { printf("ERROR: Thread %d failed to allocate space for vector\n", my_ID); num_error = 1; } bail_out(num_error); /* grab the second half of vector to store index array */ index = vector + vector_length; /* initialize the array with entries with varying signs; array "index" is only used to obfuscate the compiler (i.e. it won't vectorize a loop containing indirect referencing). It functions as the identity operator. 
*/ for (i=0; i<vector_length; i++) { vector[i] = 3 - (i&7); index[i] = i; } #pragma omp barrier #pragma omp master { branch_time = wtime(); } /* do actual branching */ switch (btype) { case VECTOR_STOP: /* condition vector[index[i]]>0 inhibits vectorization */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (vector[index[i]]>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (vector[index[i]]>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } } break; case VECTOR_GO: /* condition aux>0 allows vectorization */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (aux>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (aux>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } } break; case NO_VECTOR: /* condition aux>0 allows vectorization, but indirect indexing inhibits it */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (aux>0) vector[i] -= 2*vector[index[i]]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (aux>0) vector[i] -= 2*vector[index[i]]; else vector[i] -= 2*aux; } } break; case INS_HEAVY: fill_vec(vector, vector_length, iterations, WITH_BRANCHES, &nfunc, &rank); } #pragma omp master { branch_time = wtime() - branch_time; if (btype == INS_HEAVY) { printf("Number of matrix functions = %d\n", nfunc); printf("Matrix order = %d\n", rank); } } /* do the whole thing once more, but now without branches */ #pragma omp barrier #pragma omp master { no_branch_time = wtime(); } /* same work, but now branch-free */ switch (btype) { case VECTOR_STOP: case VECTOR_GO: for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3-(i&7)); vector[i] -= (vector[i] + aux); } for (i=0; i<vector_length; i++) { aux = (3-(i&7)); vector[i] -= (vector[i] + aux); } } break; case NO_VECTOR: for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3-(i&7)); vector[i] -= (vector[index[i]]+aux); } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3-(i&7)); vector[i] -= (vector[index[i]]+aux); } } break; case INS_HEAVY: fill_vec(vector, vector_length, iterations, WITHOUT_BRANCHES, &nfunc, &rank); } #pragma omp master { no_branch_time = wtime() - no_branch_time; ops = (double)vector_length * (double)iterations * (double)nthread; if (btype == INS_HEAVY) ops *= rank*(rank*19 + 6); else ops *= 4; } for (total = 0, i=0; i<vector_length; i++) total += vector[i]; } /* end of OPENMP parallel region */ /* compute verification values */ total_ref = ((vector_length%8)*(vector_length%8-8) + vector_length)/2*nthread; if (total == total_ref) { printf("Solution validates\n"); printf("Rate (Mops/s) with branches: %lf time (s): %lf\n", ops/(branch_time*1.e6), branch_time); printf("Rate (Mops/s) without branches: %lf time (s): %lf\n", ops/(no_branch_time*1.e6), no_branch_time); #ifdef VERBOSE printf("Array sum = %d, reference value = %d\n", total, total_ref); #endif } else { printf("ERROR: array sum = %d, reference value = %d\n", total, total_ref); } exit(EXIT_SUCCESS); }
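A quick way to see where total_ref comes from: for the vector_* branch types, the last branch-free pass leaves each thread's private vector at vector[i] = (i&7) - 3 (the negation of aux from the final loop), so the per-thread sum collapses to the closed form ((L%8)*(L%8-8) + L)/2 that the kernel multiplies by the thread count. The standalone sketch below only cross-checks that formula against a direct sum; it is not part of the benchmark, and the default length is an arbitrary example.

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  int L = (argc > 1) ? atoi(argv[1]) : 1000;   /* example vector length */
  long total = 0;
  /* direct sum over the final per-thread state vector[i] = (i&7) - 3 */
  for (int i = 0; i < L; i++) total += (i & 7) - 3;
  /* closed form used as total_ref (per thread) in the kernel */
  long total_ref = ((long)(L % 8) * (L % 8 - 8) + L) / 2;
  printf("direct sum = %ld, closed form = %ld -> %s\n",
         total, total_ref, (total == total_ref) ? "match" : "MISMATCH");
  return (total == total_ref) ? EXIT_SUCCESS : EXIT_FAILURE;
}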