int main (int argc, char *argv[]) {
    /*
     * Initialize TAU and start a timer for the main function.
     */
    TAU_INIT(&argc, &argv);
    TAU_PROFILE_SET_NODE(0);
    TAU_PROFILE_TIMER(tautimer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(tautimer);

    /*
     * Initialize MPI. We don't require threaded support, but with threads
     * we can send the TAU data over SOS asynchronously.
     */
    int rc = MPI_SUCCESS;
    int provided = 0;
    rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (rc != MPI_SUCCESS) {
        char errorstring[MPI_MAX_ERROR_STRING];
        int length = 0;
        MPI_Error_string(rc, errorstring, &length);
        printf("Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring);
        exit(1);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &commsize);
    //my_printf("MPI_Init_thread: provided = %d, MPI_THREAD_MULTIPLE=%d\n", provided, MPI_THREAD_MULTIPLE);
    my_printf("%s Running with commsize %d\n", argv[0], commsize);

    /*
     * Initialize SOS. This will have been done in TAU, but in case we don't
     * use TAU, we still want SOS active.
     */
    SOS_init_wrapper(&argc, &argv);

    /*
     * Run the worker code.
     */
    worker(argc, argv);

    /*
     * Finalize SOS and MPI.
     */
    SOS_FINALIZE();
    MPI_Finalize();
    my_printf("%s Done.\n", argv[0]);

    /*
     * Stop our main TAU timer. It probably will have been stopped
     * already, inside the MPI_Finalize call.
     */
    TAU_PROFILE_STOP(tautimer);
    return 0;
}
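/*
 * A minimal sketch (not part of the original program) of how the
 * MPI_Error_string pattern above can be factored into a reusable check
 * macro. MPI_CHECK is a hypothetical helper name; MPI_Error_string,
 * MPI_MAX_ERROR_STRING, and MPI_Abort are standard MPI.
 */
#define MPI_CHECK(call)                                               \
    do {                                                              \
        int _rc = (call);                                             \
        if (_rc != MPI_SUCCESS) {                                     \
            char _msg[MPI_MAX_ERROR_STRING];                          \
            int _len = 0;                                             \
            MPI_Error_string(_rc, _msg, &_len);                       \
            fprintf(stderr, "MPI error %d: %s\n", _rc, _msg);         \
            MPI_Abort(MPI_COMM_WORLD, _rc);                           \
        }                                                             \
    } while (0)

/* Usage (hypothetical):
 *   MPI_CHECK(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided));
 */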
int worker(int argc, char* argv[]) {
    TAU_PROFILE_TIMER(timer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(timer);
    static bool announced = false;
    my_printf("%d of %d In worker A\n", myrank, commsize);

    /* validate input */
    validate_input(argc, argv);
    my_printf("Worker A will execute %d iterations.\n", iterations);

    /* ADIOS: These declarations are required to match the generated
     * gread_/gwrite_ functions. (Those functions are generated by
     * calling 'gpp.py adios_config.xml'.)
     */
    uint64_t adios_groupsize;
    uint64_t adios_totalsize;
    uint64_t adios_handle;
    char adios_filename[256];
    MPI_Comm adios_comm;

    /* ADIOS: Can duplicate, split the world, whatever.
     * This allows you to have P writers to N files.
     * With no splits, everyone shares 1 file, but
     * can write lock-free by using different areas.
     */
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_comm = MPI_COMM_WORLD;

    int NX = 10;
    int NY = 1;
    double t[NX];
    double p[NX];

    /* ADIOS: Set up the adios communications and buffers, open the file. */
    if (send_to_b) {
        sprintf(adios_filename, "adios_a_to_b.bp");
        adios_init("adios_config.xml", adios_comm);
    }

    int index, i;
    for (index = 0 ; index < iterations ; index++) {
        /* Do some exchanges with neighbors */
        do_neighbor_exchange();
        /* "Compute" */
        compute(index);
        /* Write output */
        //my_printf("a");
        for (i = 0; i < NX; i++) {
            t[i] = index*100.0 + myrank*NX + i;
        }
        for (i = 0; i < NY; i++) {
            p[i] = index*1000.0 + myrank*NY + i;
        }
        if (send_to_b) {
            TAU_PROFILE_TIMER(adiostimer, "ADIOS send", __FILE__, TAU_USER);
            TAU_PROFILE_START(adiostimer);
            /* Create the file on the first iteration, append afterwards. */
            if (index == 0) {
                adios_open(&adios_handle, "a_to_b", adios_filename, "w", adios_comm);
            } else {
                adios_open(&adios_handle, "a_to_b", adios_filename, "a", adios_comm);
            }
            /* ADIOS: Actually write the data out.
             * Yes, this is the recommended method, and this way, changes in
             * configuration with the .XML file will, even in the worst-case
             * scenario, merely require running 'gpp.py adios_config.xml'
             * and typing 'make'.
             */
            #include "gwrite_a_to_b.ch"
            /* ADIOS: Close out the file completely and finalize.
             * If MPI is being used, this must happen before MPI_Finalize().
             */
            adios_close(adios_handle);
            TAU_PROFILE_STOP(adiostimer);
#if 1
            /* Announce and publish the SOS metadata once. */
            if (!announced) {
                SOS_val foo;
                foo.i_val = NX;
                SOS_pack(example_pub, "NX", SOS_VAL_TYPE_INT, foo);
                SOS_announce(example_pub);
                SOS_publish(example_pub);
                announced = true;
            }
#endif
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (send_to_b) {
        adios_finalize(myrank);
    }
    my_printf("Worker A exiting.\n");
    //MPI_Comm_free(&adios_comm);
    TAU_PROFILE_STOP(timer);
    /* exit */
    return 0;
}
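/*
 * For illustration only: a sketch of what the generated gwrite_a_to_b.ch
 * is assumed to expand to, given the declarations above (adios_groupsize,
 * adios_totalsize, adios_handle, t, p, NX, NY). The variable name
 * "temperature" matches what the reader side inquires; "pressure" and the
 * wrapper function name are assumptions. The real contents are produced
 * by 'gpp.py adios_config.xml' and depend on the XML group definition.
 */
static void write_a_to_b_sketch(uint64_t adios_handle, int NX, int NY,
                                double *t, double *p)
{
    uint64_t adios_groupsize, adios_totalsize;
    adios_groupsize = sizeof(int) * 2        /* NX, NY            */
                    + sizeof(double) * NX    /* temperature array */
                    + sizeof(double) * NY;   /* pressure array    */
    adios_group_size(adios_handle, adios_groupsize, &adios_totalsize);
    adios_write(adios_handle, "NX", &NX);
    adios_write(adios_handle, "NY", &NY);
    adios_write(adios_handle, "temperature", t);
    adios_write(adios_handle, "pressure", p);
}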
int main (int argc, char *argv[]) {
    validate_input(argc, argv);

    /*
     * Initialize TAU and start a timer for the main function.
     */
    TAU_INIT(&argc, &argv);
    TAU_PROFILE_SET_NODE(0);
    TAU_PROFILE_TIMER(tautimer, __func__, my_name, TAU_USER);
    TAU_PROFILE_START(tautimer);

    /*
     * Initialize MPI. We don't require threaded support, but with threads
     * we can send the TAU data over SOS asynchronously.
     */
    int rc = MPI_SUCCESS;
    int provided = 0;
    rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (rc != MPI_SUCCESS) {
        char errorstring[MPI_MAX_ERROR_STRING];
        int length = 0;
        MPI_Error_string(rc, errorstring, &length);
        fprintf(stderr, "Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring);
        fflush(stderr);
        exit(99);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    my_printf("%s %s %d Running with comm_size %d\n", argv[0], my_name, getpid(), comm_size);

    MPI_Comm adios_comm;
    MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_init("arrays.xml", adios_comm);

    /*
     * Loop and do the work.
     */
    int iter = 0;
    char tmpstr[256] = {0};
    int * return_codes = (int *)(calloc(num_sources, sizeof(int)));
    while (iter < iterations) {
        int index;
        /*
         * Read upstream input.
         */
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] > 0) {
                my_printf("%s source is gone\n", sources[index]);
                continue; // this input is gone
            }
            my_printf("%s reading from %s.\n", my_name, sources[index]);
            sprintf(tmpstr, "%s READING FROM %s", my_name, sources[index]);
            TAU_START(tmpstr);
            //mpi_reader(adios_comm, sources[index]);
            return_codes[index] = flexpath_reader(adios_comm, index);
            TAU_STOP(tmpstr);
        }
        /*
         * "Compute".
         */
        my_printf("%s computing.\n", my_name);
        compute(iter);
        /* With no sources, quit after the last iteration; otherwise quit
         * only once every source stream has terminated. */
        bool time_to_go = (num_sources == 0) ? (iter == (iterations-1)) : true;
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] == 0) {
                time_to_go = false;
                break; // out of this for loop
            }
        }
        /*
         * Send output downstream.
         */
        for (index = 0 ; index < num_sinks ; index++) {
            my_printf("%s writing to %s.\n", my_name, sinks[index]);
            sprintf(tmpstr, "%s WRITING TO %s", my_name, sinks[index]);
            TAU_START(tmpstr);
            //mpi_writer(adios_comm, sinks[index]);
            flexpath_writer(adios_comm, index, (iter > 0), time_to_go);
            TAU_STOP(tmpstr);
        }
        if (time_to_go) {
            break; // out of the while loop
        }
        my_printf("%s not time to go...\n", my_name);
        iter++;
    }

    /*
     * Finalize ADIOS.
     */
    const char * const dot_filename = ".finished";
    if (num_sources > 0) {
        adios_read_finalize_method(ADIOS_READ_METHOD_FLEXPATH);
#if 0
    } else {
        while (true) {
            // Assume this is the main process. It can't exit until
            // the last process is done.
            if (access(dot_filename, F_OK) != -1) {
                // file exists
                unlink(dot_filename);
                break;
            } else {
                // file doesn't exist
                sleep(1);
            }
        }
#endif
    }
    if (num_sinks > 0) {
        adios_finalize(my_rank);
#if 0
    } else {
        // Assume this is the last process.
        // Tell the main process we are done.
        FILE *file;
        if ((file = fopen(dot_filename, "w"))) {
            fprintf(file, "done.\n");
            fclose(file);
        }
#endif
    }

    /*
     * Finalize MPI.
     */
    free(return_codes);
    MPI_Comm_free(&adios_comm);
    MPI_Finalize();
    my_printf("%s Done.\n", my_name);
    TAU_PROFILE_STOP(tautimer);
    return 0;
}
int worker(int argc, char* argv[]) {
    TAU_PROFILE_TIMER(timer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(timer);
    my_printf("%d of %d In worker B\n", myrank, commsize);
    static bool announced = false;

    /* validate input */
    validate_input(argc, argv);
    my_printf("Worker B will execute until it sees %d iterations.\n", iterations);

    /* ADIOS: These declarations are required to match the generated
     * gread_/gwrite_ functions. (Those functions are generated by
     * calling 'gpp.py adios_config.xml'.)
     * EXCEPT THAT the generation of reader code is broken,
     * so we write the reader code manually.
     */
    uint64_t adios_groupsize;
    uint64_t adios_totalsize;
    uint64_t adios_handle;
    void * data = NULL;
    uint64_t start[2], count[2];
    int i, j, steps = 0;
    int NX = 10;
    int NY = 1;
    double t[NX];
    double p[NX];

    /* ADIOS: Can duplicate, split the world, whatever.
     * This allows you to have P writers to N files.
     * With no splits, everyone shares 1 file, but
     * can write lock-free by using different areas.
     */
    MPI_Comm adios_comm, adios_comm_b_to_c;
    adios_comm = MPI_COMM_WORLD;
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_comm_b_to_c = MPI_COMM_WORLD;
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm_b_to_c);

    enum ADIOS_READ_METHOD method = ADIOS_READ_METHOD_FLEXPATH;
    adios_read_init_method(method, adios_comm, "verbose=3");
    if (adios_errno != err_no_error) {
        fprintf(stderr, "rank %d: Error %d at init: %s\n",
                myrank, adios_errno, adios_errmsg());
        exit(4);
    }
    if (send_to_c) {
        adios_init("adios_config.xml", adios_comm);
    }

    /* ADIOS: Set up the adios communications and buffers, open the file. */
    ADIOS_FILE *fp;       // file handle
    ADIOS_VARINFO *vi;    // information about one variable
    ADIOS_SELECTION *sel;
    char adios_filename_a_to_b[256];
    char adios_filename_b_to_c[256];
    enum ADIOS_LOCKMODE lock_mode = ADIOS_LOCKMODE_NONE;
    double timeout_sec = 1.0;
    sprintf(adios_filename_a_to_b, "adios_a_to_b.bp");
    sprintf(adios_filename_b_to_c, "adios_b_to_c.bp");
    my_printf("rank %d: Worker B opening file: %s\n", myrank, adios_filename_a_to_b);

    fp = adios_read_open(adios_filename_a_to_b, method, adios_comm,
                         lock_mode, timeout_sec);
    if (adios_errno == err_file_not_found) {
        fprintf(stderr, "rank %d: Stream not found after waiting %.1f seconds: %s\n",
                myrank, timeout_sec, adios_errmsg());
        exit(1);
    } else if (adios_errno == err_end_of_stream) {
        // stream was gone before we tried to open it
        fprintf(stderr, "rank %d: Stream terminated before open. %s\n",
                myrank, adios_errmsg());
        exit(2);
%s\n", myrank, adios_errmsg()); exit(2); } else if (fp == NULL) { // some other error happened fprintf (stderr, "rank %d: Error %d at opening: %s\n", myrank, adios_errno, adios_errmsg()); exit(3); } else { my_printf("Found file %s\n", adios_filename_a_to_b); my_printf ("File info:\n"); my_printf (" current step: %d\n", fp->current_step); my_printf (" last step: %d\n", fp->last_step); my_printf (" # of variables: %d:\n", fp->nvars); vi = adios_inq_var(fp, "temperature"); adios_inq_var_blockinfo(fp, vi); printf ("ndim = %d\n", vi->ndim); printf ("nsteps = %d\n", vi->nsteps); printf ("dims[%llu][%llu]\n", vi->dims[0], vi->dims[1]); uint64_t slice_size = vi->dims[0]/commsize; if (myrank == commsize-1) { slice_size = slice_size + vi->dims[0]%commsize; } start[0] = myrank * slice_size; count[0] = slice_size; start[1] = 0; count[1] = vi->dims[1]; data = malloc (slice_size * vi->dims[1] * 8); /* Processing loop over the steps (we are already in the first one) */ while (adios_errno != err_end_of_stream && steps < iterations) { steps++; // steps start counting from 1 TAU_PROFILE_TIMER(adios_recv_timer, "ADIOS recv", __FILE__, TAU_USER); TAU_PROFILE_START(adios_recv_timer); sel = adios_selection_boundingbox (vi->ndim, start, count); adios_schedule_read (fp, sel, "temperature", 0, 1, data); adios_perform_reads (fp, 1); if (myrank == 0) printf ("--------- B Step: %d --------------------------------\n", fp->current_step); #if 0 printf("B rank=%d: [0:%lld,0:%lld] = [", myrank, vi->dims[0], vi->dims[1]); for (i = 0; i < slice_size; i++) { printf (" ["); for (j = 0; j < vi->dims[1]; j++) { printf ("%g ", *((double *)data + i * vi->dims[1] + j)); } printf ("]"); } printf (" ]\n\n"); #endif // advance to 1) next available step with 2) blocking wait adios_advance_step (fp, 0, timeout_sec); if (adios_errno == err_step_notready) { printf ("B rank %d: No new step arrived within the timeout. Quit. %s\n", myrank, adios_errmsg()); break; // quit while loop } TAU_PROFILE_STOP(adios_recv_timer); /* Do some exchanges with neighbors */ //do_neighbor_exchange(); /* "Compute" */ compute(steps); for (i = 0; i < NX; i++) { t[i] = steps*100.0 + myrank*NX + i; } for (i = 0; i < NY; i++) { p[i] = steps*1000.0 + myrank*NY + i; } if (send_to_c) { TAU_PROFILE_TIMER(adios_send_timer, "ADIOS send", __FILE__, TAU_USER); TAU_PROFILE_START(adios_send_timer); /* ADIOS: write to the next application in the workflow */ if (steps == 0) { adios_open(&adios_handle, "b_to_c", adios_filename_b_to_c, "w", adios_comm_b_to_c); } else { adios_open(&adios_handle, "b_to_c", adios_filename_b_to_c, "a", adios_comm_b_to_c); } /* ADIOS: Actually write the data out. * Yes, this is the recommended method, and this way, changes in * configuration with the .XML file will, even in the worst-case * scenario, merely require running 'gpp.py adios_config.xml' * and typing 'make'. */ #include "gwrite_b_to_c.ch" /* ADIOS: Close out the file completely and finalize. * If MPI is being used, this must happen before MPI_Finalize(). */ adios_close(adios_handle); TAU_PROFILE_STOP(adios_send_timer); #if 1 if (!announced) { SOS_val foo; foo.i_val = NX; SOS_pack(example_pub, "NX", SOS_VAL_TYPE_INT, foo); SOS_announce(example_pub); SOS_publish(example_pub); announced = true; } #endif } MPI_Barrier(adios_comm_b_to_c); } MPI_Barrier(MPI_COMM_WORLD); adios_read_close(fp); /* ADIOS: Close out the file completely and finalize. * If MPI is being used, this must happen before MPI_Finalize(). 
        adios_read_finalize_method(method);
    }
    if (send_to_c) {
        adios_finalize(myrank);
    }
    free(data);
    //MPI_Comm_free(&adios_comm);
    //MPI_Comm_free(&adios_comm_b_to_c);
    TAU_PROFILE_STOP(timer);
    /* exit */
    return 0;
}
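/*
 * The SOS announce/publish pattern used by both workers, isolated for
 * clarity as a helper. Only SOS calls that already appear above are used;
 * the wrapper name and the SOS_pub* type of the publication handle are
 * assumptions. The schema must be announced once before the first publish;
 * after that, packed values can be published repeatedly.
 */
static void publish_nx_sketch(SOS_pub *pub, int NX, bool *announced)
{
    SOS_val val;
    val.i_val = NX;                              /* integer payload   */
    SOS_pack(pub, "NX", SOS_VAL_TYPE_INT, val);  /* stage name/value  */
    if (!*announced) {
        SOS_announce(pub);                       /* send schema once  */
        *announced = true;
    }
    SOS_publish(pub);                            /* push staged data  */
}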
int main(int argc, char** argv) {
    ios::sync_with_stdio();
#ifdef USE_TAU
    TAU_PROFILE_INIT(argc, argv);
    TAU_PROFILE("main", "int (int argc, char** argv)", TAU_DEFAULT);
    TAU_PROFILE_TIMER(loop_timer, "Computation Loop", "", TAU_USER);
    TAU_PROFILE_TIMER(bc_timer, "Boundary Condition", "", TAU_USER);
    TAU_PROFILE_TIMER(update_timer, "Ghost Boundary Update", "", TAU_USER);
    TAU_PROFILE_TIMER(copy_timer, "Array Copy", "", TAU_USER);
#endif

    int Number_of_Processors = 0;
    Optimization_Manager::setForceVSG_Update(Off);
    Optimization_Manager::Initialize_Virtual_Machine("", Number_of_Processors, argc, argv);
    Partitioning_Type::SpecifyDefaultInternalGhostBoundaryWidths(1);
    Optimization_Manager::setForceVSG_Update(Off);
    Index::setBoundsCheck(off);

    int myid = Communication_Manager::My_Process_Number;
    int numProcs = Communication_Manager::Number_Of_Processors;

    const int Xsize = 1003*numProcs, Ysize = 1003;
    const Range ix(-1, Xsize-2), iy(-1, Ysize-2), all;
    const Range ix1(0, Xsize-3), iy1(0, Ysize-3);

    Partitioning_Type thePartition;
    thePartition.SpecifyDecompositionAxes(1);
    doubleArray A(ix,iy), old_A(ix,iy), x(ix,iy), y(ix,iy), temp(ix,iy);
    const double dx = 1./(Xsize-3), dy = 1./(Ysize-3), dt = 0.1/(Xsize+Ysize);
    double theTime = 0.0, maxError;
    A.partition(thePartition);
    old_A.partition(thePartition);
    temp.partition(thePartition);
    x.partition(thePartition);
    y.partition(thePartition);
    intSerialArray theProcessorSet = (A.getPartition()).getProcessorSet();
    doubleSerialArray xlocal = x.getLocalArray();
    doubleSerialArray ylocal = y.getLocalArray();
    cout << "size of xlocal: " << xlocal.getSize() << endl;

    Optimization_Manager::setOptimizedScalarIndexing(On);
    for (int i = xlocal.getBase(0); i <= xlocal.getBound(0); i++)
        xlocal(i,all) = dx*i;
    for (int j = ylocal.getBase(1); j <= ylocal.getBound(1); j++)
        ylocal(all,j) = dy*j;
    Optimization_Manager::setOptimizedScalarIndexing(Off);
    x.updateGhostBoundaries();
    y.updateGhostBoundaries();

    // Initialize with the exact solution A(x,y,t) = (1+t)(2+x+y).
    A = (1.0 + theTime)*(2.0 + x + y);

    doubleSerialArray oldALocal = old_A.getLocalArray();
    doubleSerialArray ALocal = A.getLocalArray();
    doubleSerialArray tempLocal = temp.getLocalArray();

    // Interior index ranges: exclude the boundary row/column on every rank.
    int iLower = ALocal.getBase(0)+1;
    int iUpper = ALocal.getBound(0)-1;
    int jLower = ALocal.getBase(1)+1;
    int jUpper = ALocal.getBound(1)-1;
    Range ILocInterior(iLower, iUpper);
    Range JLocInterior(jLower, jUpper);

    // Copy ranges: include the physical boundary on the first and last
    // rank, but exclude ghost cells on interior partition edges.
    iLower = (myid > 0)          ? ALocal.getBase(0)+1  : ALocal.getBase(0);
    iUpper = (myid < numProcs-1) ? ALocal.getBound(0)-1 : ALocal.getBound(0);
    jLower = ALocal.getBase(1);
    jUpper = ALocal.getBound(1);
    Range ILoc(iLower, iUpper);
    Range JLoc(jLower, jUpper);

    if (myid == 0) cout << "-----Starting computation" << endl;
    double t = MPI_Wtime();

#ifdef USE_TAU
    TAU_PROFILE_START(copy_timer);
#endif
    oldALocal = ALocal;
#ifdef USE_TAU
    TAU_PROFILE_STOP(copy_timer);
    TAU_PROFILE_START(update_timer);
#endif
    old_A.updateGhostBoundaries();
#ifdef USE_TAU
    TAU_PROFILE_STOP(update_timer);
    TAU_PROFILE_START(loop_timer);
#endif
    // First step: forward Euler to prime the leapfrog scheme.
    tempLocal(ILocInterior,JLocInterior) =
        ALocal(ILocInterior,JLocInterior) -
        dt*( ( ALocal(ILocInterior+1,JLocInterior) -
               ALocal(ILocInterior-1,JLocInterior) ) / (2.0*dx) +
             ( ALocal(ILocInterior,JLocInterior+1) -
               ALocal(ILocInterior,JLocInterior-1) ) / (2.0*dy) -
             (4.0 + 2.0*theTime +
              xlocal(ILocInterior,JLocInterior) +
              ylocal(ILocInterior,JLocInterior)) );
#ifdef USE_TAU
    TAU_PROFILE_STOP(loop_timer);
    TAU_PROFILE_START(copy_timer);
#endif
    //
    // copy temp into A
    //
    ALocal(ILocInterior,JLocInterior) = tempLocal(ILocInterior,JLocInterior);
#ifdef USE_TAU
    TAU_PROFILE_STOP(copy_timer);
    TAU_PROFILE_START(bc_timer);
#endif
    // Re-impose the exact solution on all four boundaries at t+dt.
    A(all,Ysize-2) = (1.0+(theTime+dt))*(2.0+x(all,Ysize-2)+y(all,Ysize-2));
    A(all,-1)      = (1.0+(theTime+dt))*(2.0+x(all,-1)+y(all,-1));
    A(-1,iy1)      = (1.0+(theTime+dt))*(2.0+x(-1,iy1)+y(-1,iy1));
    A(Xsize-2,iy1) = (1.0+(theTime+dt))*(2.0+x(Xsize-2,iy1)+y(Xsize-2,iy1));
#ifdef USE_TAU
    TAU_PROFILE_STOP(bc_timer);
    TAU_PROFILE_START(update_timer);
#endif
    A.updateGhostBoundaries();
#ifdef USE_TAU
    TAU_PROFILE_STOP(update_timer);
#endif
    theTime += dt;

    // Main time-stepping loop: leapfrog (second-order in time).
    for (int k = 0; k < 100; k++) {
#ifdef USE_TAU
        TAU_PROFILE_START(loop_timer);
#endif
        tempLocal(ILocInterior,JLocInterior) =
            oldALocal(ILocInterior,JLocInterior) -
            2.*dt*( ( ALocal(ILocInterior+1,JLocInterior) -
                      ALocal(ILocInterior-1,JLocInterior) ) / (2.0*dx) +
                    ( ALocal(ILocInterior,JLocInterior+1) -
                      ALocal(ILocInterior,JLocInterior-1) ) / (2.0*dy) -
                    (4.0 + 2.0*theTime +
                     xlocal(ILocInterior,JLocInterior) +
                     ylocal(ILocInterior,JLocInterior)) );
#ifdef USE_TAU
        TAU_PROFILE_STOP(loop_timer);
        TAU_PROFILE_START(copy_timer);
#endif
        oldALocal(ILoc,JLoc) = ALocal(ILoc,JLoc);
        ALocal(ILoc,JLoc) = tempLocal(ILoc,JLoc);
#ifdef USE_TAU
        TAU_PROFILE_STOP(copy_timer);
        TAU_PROFILE_START(bc_timer);
#endif
        A(all,Ysize-2) = (1.0+(theTime+dt))*(2.0+x(all,Ysize-2)+y(all,Ysize-2));
        A(all,-1)      = (1.0+(theTime+dt))*(2.0+x(all,-1)+y(all,-1));
        A(-1,iy1)      = (1.0+(theTime+dt))*(2.0+x(-1,iy1)+y(-1,iy1));
        A(Xsize-2,iy1) = (1.0+(theTime+dt))*(2.0+x(Xsize-2,iy1)+y(Xsize-2,iy1));
#ifdef USE_TAU
        TAU_PROFILE_STOP(bc_timer);
        TAU_PROFILE_START(update_timer);
#endif
        A.updateGhostBoundaries();
#ifdef USE_TAU
        TAU_PROFILE_STOP(update_timer);
#endif
        theTime += dt;
    }
    t = MPI_Wtime() - t;
    double maxTime;
    MPI_Reduce(&t, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (myid == 0) {
        cout << " Total Time= " << maxTime << " s" << endl;
    }

    // Compare against the exact solution to get the global max error.
    maxError = max(fabs(A(ix,iy) - (1.0+theTime)*(2.0+x(ix,iy)+y(ix,iy))));
    if (myid == 0) {
        cout << "max error at t=" << theTime << " is: " << maxError << endl;
        cout << "number of messages sent= "
             << Diagnostic_Manager::getNumberOfMessagesSent() << endl;
        cout << "number of messages received= "
             << Diagnostic_Manager::getNumberOfMessagesReceived() << endl;
        ofstream OutFile;
        OutFile.open("/p/gb1/bmiller/ConvectPpp/ConvectPppNew.ConstSizePerProc.out", ios::app);
        OutFile << numProcs << " " << maxTime << " " << maxError << endl;
        OutFile.close();
    }
#ifdef USE_TAU
    TAU_PROFILE_EXIT("Finishing Profiling.");
#endif
    Optimization_Manager::Exit_Virtual_Machine();
    return 0;
}
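/*
 * A minimal sketch (plain C arrays, single process, hypothetical function
 * name) of the leapfrog update the P++ loop above expresses in array
 * syntax. The PDE is dA/dt + dA/dx + dA/dy = f with forcing
 * f = 4 + 2t + x + y, chosen so the exact solution is
 * A(x,y,t) = (1+t)(2+x+y): then A_t = 2+x+y and A_x = A_y = 1+t,
 * so A_t + A_x + A_y = 4 + 2t + x + y = f, which is why the boundary
 * conditions and the max-error check both use (1+t)(2+x+y).
 */
void leapfrog_step(int nx, int ny, double t, double dt, double dx, double dy,
                   const double *x, const double *y,
                   const double *Aold, const double *A, double *Anew)
{
    for (int i = 1; i < nx - 1; i++) {
        for (int j = 1; j < ny - 1; j++) {
            double dAdx = (A[(i+1)*ny + j] - A[(i-1)*ny + j]) / (2.0 * dx);
            double dAdy = (A[i*ny + j + 1] - A[i*ny + j - 1]) / (2.0 * dy);
            double f    = 4.0 + 2.0*t + x[i] + y[j];  /* manufactured forcing */
            /* second-order leapfrog: step from t-dt over an interval of 2*dt */
            Anew[i*ny + j] = Aold[i*ny + j] - 2.0*dt*(dAdx + dAdy - f);
        }
    }
}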