void main( void )
{
    typedef enum { CONFIGURE, INITIALISE1, RUN } MAINLOOP_STATE;
    static MAINLOOP_STATE MainLoopState = CONFIGURE;

    while( 1 )  // Main Super Loop
    {
        switch ( MainLoopState )
        {
            case CONFIGURE :
                if ( Configure() == Command_Complete )
                {
                    MainLoopState = INITIALISE1;
                }
                break;

            case INITIALISE1 :
                if ( Initialise() == Command_Complete )
                {
                    MainLoopState = RUN;
                }
                break;

            case RUN :
                Manage_Comms();
                Manage_Movements();
                break;
        }
    }
}
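The handlers driven by the super loop (Configure, Initialise, Manage_Comms, Manage_Movements) and the Command_Complete status code are not shown in the listing. A minimal sketch of the assumed interface, with hypothetical stub bodies so the loop can be compiled and exercised on its own (in practice these would be declared before main), might look like this:

/* Hypothetical stubs for the handlers used by the super loop above; the
   return type and the Command_Complete status code are assumptions made
   purely for illustration. */
typedef enum { Command_Busy, Command_Complete } COMMAND_STATUS;

static COMMAND_STATUS Configure( void )        { /* set up clocks/peripherals */ return Command_Complete; }
static COMMAND_STATUS Initialise( void )       { /* load default state        */ return Command_Complete; }
static void           Manage_Comms( void )     { /* poll the comms interfaces */ }
static void           Manage_Movements( void ) { /* drive the actuators       */ }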
int main ( int argc, char *argv[] )
{
  // Solution arrays
  real *h_u;  /* to be allocated in ROOT only */
  real *t_u;
  real *t_un;

  // Auxiliary variables
  int rank;
  int size;
  int step;
  dmn domain;
  double wtime;
  int nbrs[6];
  int i, j, k;

  // Initialize MPI
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);  // needed before the checks below use rank

  // If the number of processes != SX*SY*SZ, terminate.
  if (size != SX*SY*SZ) {
    if (rank==ROOT) fprintf(stderr,"%s: Needs at least %d processors.\n", argv[0], SX*SY*SZ);
    MPI_Finalize();
    return 1;
  }
  // Verify subdomain sizes
  if (NX%SX!=0 || NY%SY!=0 || NZ%SZ!=0) {
    if (rank==ROOT) fprintf(stderr,"%s: Subdomain sizes not an integer value.\n", argv[0]);
    MPI_Finalize();
    return 1;
  }

  // Build a 3D Cartesian communicator
  MPI_Comm Comm3d;
  int ndim = 3;
  int dim[3]    = {SZ,SY,SX};           // domain decomposition subdomains
  int period[3] = {false,false,false};  // periodic boundary conditions
  int reorder   = true;                 // allow reordering if necessary
  int coord[3];
  MPI_Cart_create(MPI_COMM_WORLD,ndim,dim,period,reorder,&Comm3d);
  MPI_Comm_rank(Comm3d,&rank);            // rank w.r.t. Comm3d
  MPI_Cart_coords(Comm3d,rank,3,coord);   // rank coordinates

  // Map the neighbour ranks
  MPI_Cart_shift(Comm3d,0,1,&nbrs[TOP],&nbrs[BOTTOM]);
  MPI_Cart_shift(Comm3d,1,1,&nbrs[NORTH],&nbrs[SOUTH]);
  MPI_Cart_shift(Comm3d,2,1,&nbrs[WEST],&nbrs[EAST]);

  // Manage domain sizes
  domain = Manage_Domain(rank,size,coord,nbrs);

  // Allocate memory
  Manage_Memory(0,domain,&h_u,&t_u,&t_un);

  // Root mode: build the initial condition
  if (domain.rank==ROOT) Call_IC(2,h_u);

  // Build MPI data types
  MPI_Datatype myGlobal;
  MPI_Datatype myLocal;
  MPI_Datatype xySlice;
  MPI_Datatype yzSlice;
  MPI_Datatype xzSlice;
  //Manage_DataTypes(0,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal);

  // Build an MPI data type for a subarray in the root processor
  MPI_Datatype global;
  int nx = domain.nx;
  int ny = domain.ny;
  int nz = domain.nz;
  int bigsizes[3] = {NZ,NY,NX};
  int subsizes[3] = {nz,ny,nx};
  int starts[3]   = {0,0,0};
  MPI_Type_create_subarray(3, bigsizes, subsizes, starts, MPI_ORDER_C, MPI_CUSTOM_REAL, &global);
  MPI_Type_create_resized(global, 0, nx*sizeof(real), &myGlobal);  // resize the extent
  MPI_Type_commit(&myGlobal);

  // Build an MPI data type for a subarray in the workers
  int bigsizes2[3] = {R+nz+R,R+ny+R,R+nx+R};
  int subsizes2[3] = {nz,ny,nx};
  int starts2[3]   = {R,R,R};
  MPI_Type_create_subarray(3, bigsizes2, subsizes2, starts2, MPI_ORDER_C, MPI_CUSTOM_REAL, &myLocal);
  MPI_Type_commit(&myLocal);  // now we can use this custom MPI data type

  // Halo data types
  MPI_Datatype yVector;
  MPI_Type_vector( ny, nx, nx+2*R, MPI_CUSTOM_REAL, &xySlice);
  MPI_Type_commit(&xySlice);
  MPI_Type_vector( ny, 1, nx+2*R, MPI_CUSTOM_REAL, &yVector);
  MPI_Type_create_hvector(nz, 1, (nx+2*R)*(ny+2*R)*sizeof(real), yVector, &yzSlice);
  MPI_Type_commit(&yzSlice);
  MPI_Type_vector( nz, nx, (nx+2*R)*(ny+2*R), MPI_CUSTOM_REAL, &xzSlice);
  MPI_Type_commit(&xzSlice);

  // Build sendcounts and displacements in the root processor
  int sendcounts[size], displs[size];
  if (rank==ROOT) {
    for (i=0; i<size; i++) sendcounts[i]=1;
    int disp = 0;  // displacement counter
    for (k=0; k<SZ; k++) {
      for (j=0; j<SY; j++) {
        for (i=0; i<SX; i++) {
          displs[i+SX*j+SX*SY*k]=disp;
          disp+=1;            // x-displacements
        }
        disp += SX*(ny-1);    // y-displacements
      }
      disp += SX*NY*(nz-1);   // z-displacements
    }
  }

  // Scatter the global array data and exchange halo regions
  MPI_Scatterv(h_u, sendcounts, displs, myGlobal, t_u, 1, myLocal, ROOT, Comm3d);
  Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u);
  MPI_Barrier(Comm3d);

  // ROOT mode: record the starting time.
  if (rank==ROOT) wtime=MPI_Wtime();

  // Asynchronous MPI solver
  for (step = 0; step < NO_STEPS; step+=2) {
    // print iteration in ROOT mode
    if (rank==ROOT && step%10000==0) printf(" Step %d of %d\n",step,(int)NO_STEPS);

    // Exchange boundaries and compute the stencil
    Call_Laplace(domain,&t_u,&t_un); Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_un);  // 1st iter
    Call_Laplace(domain,&t_un,&t_u); Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u );  // 2nd iter
  }

  // ROOT mode: record the final time.
  if (rank==ROOT) {
    wtime = MPI_Wtime()-wtime;
    printf("\n Wall clock elapsed = %f seconds\n\n", wtime );
  }

  /* // CAREFUL: uncomment only for debugging. Print subroutine.
  for (int p=0; p<size; p++) {
    if (rank == p) {
      printf("Local process on rank %d is:\n", rank);
      for (k=0; k<nz+2*R; k++) {
        printf("-- layer %d --\n",k);
        for (j=0; j<ny+2*R; j++) {
          putchar('|');
          for (i=0; i<nx+2*R; i++) printf("%3.0f ",t_u[i+(nx+2*R)*j+(nx+2*R)*(ny+2*R)*k]);
          printf("|\n");
        }
        printf("\n");
      }
    }
    MPI_Barrier(Comm3d);
  }*/

  // Gather all pieces back into the big data array
  MPI_Gatherv(t_u, 1, myLocal, h_u, sendcounts, displs, myGlobal, ROOT, Comm3d);

  // Save results to file
  //if (rank==0) Print(h_u,NX,NY,NZ);
  if (rank==ROOT) Save_Results(h_u);

  // Free MPI types
  Manage_DataTypes(1,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal);

  // Free memory
  Manage_Memory(1,domain,&h_u,&t_u,&t_un);

  // Finalize MPI
  MPI_Finalize();

  // ROOT mode: terminate.
  if (rank==ROOT) {
    printf("HEAT_MPI:\n");
    printf(" Normal end of execution.\n\n");
  }
  return 0;
}
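Manage_Comms() itself is not listed; given the nbrs[] ranks and the slice datatypes built above, its core is presumably one pair of MPI_Sendrecv calls per axis. The following sketch covers the z-direction only and assumes R = 1 ghost layer (the y- and x-directions would use yzSlice and xzSlice in the same pattern); everything except the MPI calls mirrors names from the listing.

/* Sketch of the z-direction halo exchange assumed inside Manage_Comms().
   Per the MPI_Cart_shift call above, nbrs[BOTTOM] is the +z neighbour and
   nbrs[TOP] the -z neighbour; at the domain walls they are MPI_PROC_NULL,
   which turns the corresponding transfer into a no-op. */
void Exchange_Z_Halos(dmn domain, MPI_Comm Comm3d, MPI_Datatype xySlice,
                      int *nbrs, real *u)
{
  const int nx = domain.nx, ny = domain.ny, nz = domain.nz;
  const int sx  = nx+2*R;             // row stride of the padded block
  const int sxy = (nx+2*R)*(ny+2*R);  // layer stride of the padded block

  // send the last interior layer (+z), receive the lower ghost layer (-z)
  MPI_Sendrecv(&u[R + sx*R + sxy*nz], 1, xySlice, nbrs[BOTTOM], 0,
               &u[R + sx*R + sxy*0 ], 1, xySlice, nbrs[TOP],    0,
               Comm3d, MPI_STATUS_IGNORE);
  // send the first interior layer (-z), receive the upper ghost layer (+z)
  MPI_Sendrecv(&u[R + sx*R + sxy*R     ], 1, xySlice, nbrs[TOP],    1,
               &u[R + sx*R + sxy*(nz+R)], 1, xySlice, nbrs[BOTTOM], 1,
               Comm3d, MPI_STATUS_IGNORE);
}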
int main ( int argc, char *argv[] )
{
  // Auxiliary variables
  int rank;
  int size;
  int step;
  dmn domain;
  double wtime;

  // Solution arrays
  real *h_u;  /* will be allocated in ROOT only */
  real *t_u;  /* processor's sub-domain */
  real *d_u;
  real *d_un;

  // Initialize devices
  //Manage_Devices();

  // Initialize MPI
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // If the number of processes != SZ, terminate.
  if (size != SZ) {
    if (rank==ROOT) fprintf(stderr,"%s: Needs at least %d processors.\n", argv[0], SZ);
    MPI_Finalize();
    return 1;
  }
  // Verify subdomain sizes
  if (NZ%SZ!=0) {
    if (rank==ROOT) fprintf(stderr,"%s: Subdomain sizes are not an integer value.\n", argv[0]);
    MPI_Finalize();
    return 1;
  }
  // Associate each rank with a GPU
  if (argc < size) {
    if (rank==ROOT) printf("Usage : mpirun -np# %s <GPU list per rank>\n", argv[0]);
    MPI_Finalize();
    return 1;
  }

  // Manage domain sizes
  domain = Manage_Domain(rank,size,atoi(argv[rank+1]));
  MPI_Barrier(MPI_COMM_WORLD);

  // Allocate memory
  Manage_Memory(0,domain,&h_u,&t_u,&d_u,&d_un);

  // Root mode: build the initial condition and scatter it to the rest of the processors
  if (domain.rank==ROOT) Call_IC(2,h_u);
  MPI_Scatter(h_u, domain.size, MPI_CUSTOM_REAL, t_u+R*NX*NY, domain.size, MPI_CUSTOM_REAL, ROOT, MPI_COMM_WORLD);

  // Send the local domain to the devices
  Manage_Comms(0,domain,&t_u,&d_u);

  // Exchange halo data
  Manage_Comms(1,domain,&t_u,&d_u);
  MPI_Barrier(MPI_COMM_WORLD);

  // ROOT mode: record the starting time.
  if (rank==ROOT) wtime=MPI_Wtime();

  // Asynchronous MPI solver
  for (step = 0; step < NO_STEPS; step+=2) {
    // print iteration in ROOT mode
    if (rank==ROOT && step%10000==0) printf(" Step %d of %d\n",step,(int)NO_STEPS);

    // Exchange boundaries and compute the stencil
    Call_Laplace(domain,&d_u,&d_un); Manage_Comms(1,domain,&t_u,&d_un);  // 1st iter
    Call_Laplace(domain,&d_un,&d_u); Manage_Comms(1,domain,&t_u,&d_u );  // 2nd iter
  }
  MPI_Barrier(MPI_COMM_WORLD);

  // Gather the local domain back from the devices
  Manage_Comms(2,domain,&t_u,&d_u);

  // ! Uncomment only for debugging
  // if (rank==0) Print_SubDomain(domain,t_u); MPI_Barrier(MPI_COMM_WORLD);
  // if (rank==1) Print_SubDomain(domain,t_u); MPI_Barrier(MPI_COMM_WORLD);
  // if (rank==0) Print_Domain(domain,h_u); MPI_Barrier(MPI_COMM_WORLD);

  // ROOT mode: record the final time.
  if (rank==ROOT) {
    wtime = MPI_Wtime()-wtime;
    printf("\n Wall clock elapsed seconds = %f\n\n", wtime );
  }

  // Gather solutions to ROOT and write the solution in ROOT mode
  MPI_Gather(t_u+R*NX*NY, domain.size, MPI_CUSTOM_REAL, h_u, domain.size, MPI_CUSTOM_REAL, ROOT, MPI_COMM_WORLD);
  if (rank==ROOT) Save_Results(h_u);

  // Free memory
  Manage_Memory(2,domain,&h_u,&t_u,&d_u,&d_un);
  MPI_Barrier(MPI_COMM_WORLD);

  // Terminate MPI.
  MPI_Finalize();

  // ROOT mode: terminate.
  if (rank==ROOT) {
    printf("HEAT_MPI:\n");
    printf(" Normal end of execution.\n\n");
  }
  return 0;
}
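Each rank receives a GPU id on the command line (atoi(argv[rank+1])) and hands it to Manage_Domain; the device is presumably then selected through the CUDA runtime. A minimal sketch of that binding follows; the helper name and error policy are hypothetical, while cudaGetDeviceCount/cudaSetDevice are the real runtime calls.

/* Hypothetical helper: bind the calling MPI rank to the GPU id it was handed
   on the command line, e.g. Bind_Rank_To_Gpu(rank, atoi(argv[rank+1])). */
#include <cuda_runtime.h>

static void Bind_Rank_To_Gpu(int rank, int gpu)
{
  int count = 0;
  cudaGetDeviceCount(&count);
  if (gpu < 0 || gpu >= count) {
    fprintf(stderr, "rank %d: invalid GPU id %d (%d devices visible)\n", rank, gpu, count);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  cudaSetDevice(gpu);  // subsequent CUDA calls on this rank target this device
}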
int main()
{
  time_t t;
  int tid;
  int step;

  // CPU (host) variables
  float *h_p;   // Primitives vector - entire domain
  float *h_pl;  // Primitives vector - local (in-thread) domain

  // GPU (device) variables
  float *d_p;   // Primitives on GPU
  float *d_u;   // Conserved quantity on GPU
  float *d_Fp;  // Forward fluxes
  float *d_Fm;  // Backward fluxes
  size_t size;

  // First, perform 1st-phase memory management tasks
  Manage_Memory(0,0, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);

  // Set the number of threads
  omp_set_num_threads(2);

  #pragma omp parallel shared(h_p) private(tid, h_pl, d_p, d_u, d_Fp, d_Fm, step)
  {
    // Now living in multiple-threads land. Get the thread ID.
    tid = omp_get_thread_num();

    // Allocate memory on the GPU for each thread
    Manage_Memory(1, tid, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);

    // Compute the initial conditions on the GPU
    Call_GPU_Init(&d_p, &d_u, tid);
    #pragma omp barrier

    // Request the computer's current time
    t = clock();

    // Solver loop
    for (step = 0; step < NO_STEPS; step++) {
      if (step % 1000 == 0) printf("Step %d of %d\n", step, NO_STEPS);

      // Synchronization work:
      // 1) Copy h_pl (on host) from d_p on the device
      Manage_Comms(3, tid, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);
      #pragma omp barrier
      // 2) Update h_p from h_pl prior to bounds adjustment
      Manage_Bounds(-1, tid, h_p, h_pl);
      // 3) Update h_pl from h_p after bounds adjustment
      Manage_Bounds(0, tid, h_p, h_pl);
      // 4) Send h_pl from host to GPU (d_p), ready for the flux calculation
      Manage_Comms(2, tid, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);

      // Calculate the fluxes now
      Call_GPU_Calc_Flux(&d_p, &d_Fp, &d_Fm, tid);

      // Update the state now
      Call_GPU_Calc_State(&d_p, &d_u, &d_Fp, &d_Fm, tid);
      #pragma omp barrier
    }

    // Measure the computation time
    t = clock()-t;

    // Grab all results from the GPU devices and store them in h_pl
    Manage_Comms(1, tid, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);
    #pragma omp barrier

    // Save h_pl to h_p
    Manage_Bounds(1, tid, h_p, h_pl);

    // Free GPU memory on each thread
    Manage_Memory(2, tid, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);
    #pragma omp barrier
  }

  // Save results
  Save_Results(h_p);

  // Report time
  printf("CPU time (%f seconds).\n",((float)t)/CLOCKS_PER_SEC);

  // Free the last memory on the host
  Manage_Memory(3,0, &h_p, &h_pl, &d_p, &d_u, &d_Fp, &d_Fm);
  return 0;
}
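Manage_Memory(1, tid, ...) gives each OpenMP thread its own device buffers; the usual pattern, assuming one GPU per thread, is to select the thread's device before any allocation. A hypothetical sketch of that phase-1 setup is below; the helper name and the one-GPU-per-thread mapping are assumptions, while cudaSetDevice/cudaMalloc are real CUDA runtime calls.

/* Hypothetical sketch of per-thread device selection and allocation. */
#include <cuda_runtime.h>

static void Thread_Alloc_Device(int tid, size_t n,
                                float **d_p, float **d_u, float **d_Fp, float **d_Fm)
{
  cudaSetDevice(tid);                          // thread 0 -> GPU 0, thread 1 -> GPU 1
  cudaMalloc((void**)d_p,  n*sizeof(float));   // primitives
  cudaMalloc((void**)d_u,  n*sizeof(float));   // conserved quantities
  cudaMalloc((void**)d_Fp, n*sizeof(float));   // forward fluxes
  cudaMalloc((void**)d_Fm, n*sizeof(float));   // backward fluxes
}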
int main ( int argc, char *argv[] )
{
  // Auxiliary variables
  int rank;
  int npcs;
  int step;
  dmn domain;
  double wtime;

  // Solution arrays
  double *g_u;  /* will be allocated in ROOT only */
  double *t_u;
  double *t_un;

  // Initialize MPI
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &npcs);

  // Manage domain sizes
  domain = Manage_Domain(rank,npcs);

  // Allocate memory
  Manage_Memory(0,domain,&g_u,&t_u,&t_un);

  // Root mode: build the initial condition and scatter it to the rest of the processors
  if (domain.rank==ROOT) Call_IC(2,g_u);
  MPI_Scatter(g_u, domain.size, MPI_DOUBLE, t_u+NX*NY, domain.size, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);

  // Exchange halo regions
  Manage_Comms(domain,&t_u);
  MPI_Barrier(MPI_COMM_WORLD);

  // ROOT mode: record the starting time.
  if (rank==ROOT) wtime=MPI_Wtime();

  // Asynchronous MPI solver
  for (step = 0; step < NO_STEPS; step+=2) {
    // print iteration in ROOT mode
    if (rank==ROOT && step%10000==0) printf(" Step %d of %d\n",step,(int)NO_STEPS);

    // Exchange boundaries and compute the stencil
    Call_Laplace(domain,&t_u,&t_un); Manage_Comms(domain,&t_un);  // 1st iter
    Call_Laplace(domain,&t_un,&t_u); Manage_Comms(domain,&t_u );  // 2nd iter
  }
  MPI_Barrier(MPI_COMM_WORLD);

  // ROOT mode: record the final time.
  if (rank==ROOT) {
    wtime = MPI_Wtime()-wtime;
    printf("\n Wall clock elapsed seconds = %f\n\n", wtime );
  }

  // Gather solutions to ROOT and write the solution in ROOT mode
  MPI_Gather(t_u+NX*NY, domain.size, MPI_DOUBLE, g_u, domain.size, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);
  if (rank==ROOT) Save_Results(g_u);

  // Free memory
  Manage_Memory(1,domain,&g_u,&t_u,&t_un);
  MPI_Barrier(MPI_COMM_WORLD);

  // Terminate MPI.
  MPI_Finalize();

  // ROOT mode: terminate.
  if (rank==ROOT) {
    printf("HEAT_MPI:\n");
    printf(" Normal end of execution.\n\n");
  }
  return 0;
}
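Manage_Comms(domain,&t_u) in this slab-decomposed version only has to swap one NX*NY plane with each neighbouring rank, since the scatter offset t_u+NX*NY implies a single ghost plane on each side. A sketch of that exchange follows; the function name and the nz argument (the number of interior planes per rank, presumably carried by the real dmn struct) are assumptions.

/* Sketch of the plane exchange assumed inside Manage_Comms(domain,&t_u):
   each rank swaps one NX*NY plane with rank-1 and rank+1; ghost planes sit
   at index 0 and nz+1. Edge ranks use MPI_PROC_NULL, making those transfers no-ops. */
void Exchange_Slab_Halos(int rank, int npcs, int nz, double *u)
{
  const int plane = NX*NY;
  int below = (rank > 0)      ? rank-1 : MPI_PROC_NULL;
  int above = (rank < npcs-1) ? rank+1 : MPI_PROC_NULL;

  // send the last interior plane up, receive the lower ghost plane from below
  MPI_Sendrecv(u + nz*plane,     plane, MPI_DOUBLE, above, 0,
               u,                plane, MPI_DOUBLE, below, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  // send the first interior plane down, receive the upper ghost plane from above
  MPI_Sendrecv(u + plane,        plane, MPI_DOUBLE, below, 1,
               u + (nz+1)*plane, plane, MPI_DOUBLE, above, 1,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}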