int main (int argc, char ** argv ) { int i; MPI_Init (&argc, &argv); MPI_Comm_dup (MPI_COMM_WORLD, &comm); MPI_Comm_rank (comm, &rank); MPI_Comm_size (comm, &nproc); comp_comm_init(comm); if (processArgs(argc, argv)) { return 1; } if (!rank) { printf ("Setup parameters:\n"); printf (" # of steps (outputs): %d\n", nsteps); printf (" # of iterations per step: %d\n", niterations); printf (" # of computation units in each iteration: %d\n", ncomp); printf (" # of communication units in each iteration: %d\n", ncomm); printf (" output size per process: %d x %d doubles = %lld bytes\n", nx, ny, sizeof(double) * nx * (uint64_t) ny); printf (" output size per step: %lld bytes\n", nproc * sizeof(double) * nx * (uint64_t) ny); } //2D array with 1D decomposition offs_x = rank * nx; offs_y = 0; gnx = nproc * nx; gny = ny; data = (double*) malloc (sizeof(double) * nx * (size_t) ny); timing_alloc(nsteps); int bufsizeMB = nx*ny*sizeof(double)/1048576 + 1; output_init(comm, bufsizeMB); output_define(nx, ny, gnx, gny, offs_x, offs_y); int it, step, icomp, icomm; /* Warm up a bit */ if (!rank) printf ("Warm up for 1 steps, %d iterations per step...\n", niterations); for (step=0; step < 1; step++) { for (it=0; it < niterations; it++) { for (icomp=0; icomp < ncomp; icomp++) { do_calc_unit (data, nx, ny); } for (icomm=0; icomm < ncomm; icomm++) { do_comm_unit (comm); } } } /* Do the steps with output now */ data_init(); if (!rank) printf ("Start running with I/O and measurements...\n"); double Tcalc_it, Tcomm_it; double Truntime; //to print total time for the loop below (for overhead calculation) char filename[256]; MPI_Barrier (comm); Truntime = MPI_Wtime(); for (step=0; step < nsteps; step++) { if (!rank) printf ("Start step %d\n", step); Tcalc[step] = 0; Tcomm[step] = 0; for (it=0; it < niterations; it++) { // spend some time with computation Tcalc_it = MPI_Wtime(); for (icomp=0; icomp < ncomp; icomp++) { do_calc_unit (data, nx, ny); } Tcalc_it = MPI_Wtime() - Tcalc_it; Tcalc[step] += 
Tcalc_it; // spend some time with communication Tcomm_it = MPI_Wtime(); for (icomm=0; icomm < ncomm; icomm++) { do_comm_unit (comm); } Tcomm_it = MPI_Wtime() - Tcomm_it; Tcomm[step] += Tcomm_it; } // output per step snprintf (filename, sizeof(filename), "data%6.6d", step); if (!rank) printf ("Output to %s\n", filename); MPI_Barrier (comm); output_dump(filename, step, data); } MPI_Barrier (comm); Truntime = MPI_Wtime() - Truntime; if (!rank) printf ("Finalize...\n"); MPI_Barrier (comm); output_finalize (rank); timing_report(nsteps, comm); double Truntime_max; MPI_Reduce (&Truntime, &Truntime_max, 1, MPI_DOUBLE, MPI_MAX, 0, comm); if (!rank) printf ("Total runtime of main loop: %9.3f\n", Truntime); free (data); timing_free(); MPI_Barrier (comm); MPI_Finalize (); return 0; }
/*
 * Write the job's setup output (job->status->setup) to STREAM by handing
 * it to output_dump.
 *
 * The setup is the code read from the various prologue (.pro and .ps)
 * files.  Original author's note: the hard part is not dumping too many
 * font definitions, to avoid running out of memory on very old PS
 * level 1 printers; whether that precaution is really needed (rather
 * than merely useful) was still an open question.
 */
void
dump_setup (FILE * stream, a2ps_job * job)
{
  output_dump (job->status->setup, stream);
}