JNIEXPORT jint JNICALL Java_mpi_MPI_queryThread_1jni(JNIEnv *env, jclass clazz)
{
    int provided;
    int rc = MPI_Query_thread(&provided);
    ompi_java_exceptionCheck(env, rc);
    return provided;
}
/** Initialize DALEC.  MPI must be initialized before this can be called.  It
 * is invalid to make DALEC calls before initialization.  Collective on the
 * world group.
 *
 * @return Zero on success */
int PDALEC_Initialize(MPI_Comm user_comm)
{
    int dalec_alive = atomic_fetch_add_explicit(&(DALECI_GLOBAL_STATE.alive), 1,
                                                memory_order_seq_cst);
    if (dalec_alive == 0) {
        /* Initialize, since this is the first call to this function. */
        int mpi_is_init, mpi_is_fin;
        MPI_Initialized(&mpi_is_init);
        MPI_Finalized(&mpi_is_fin);
        if (!mpi_is_init || mpi_is_fin) {
            DALECI_Warning("MPI must be active when calling DALEC_Initialize");
            return DALEC_ERROR_MPI_USAGE;
        }

        /* Always dupe the user communicator for internal usage. */
        /* Do not abort on MPI failure, let user handle if MPI does not abort. */
        int rc = MPI_Comm_dup(user_comm, &DALECI_GLOBAL_STATE.mpi_comm);
        int err = DALECI_Check_MPI("DALEC_Initialize", "MPI_Comm_dup", rc);
        if (err != DALEC_SUCCESS) return err;

        /* Determine what level of threading MPI supports. */
        int mpi_thread_level;
        MPI_Query_thread(&mpi_thread_level);
        DALECI_GLOBAL_STATE.mpi_thread_level = mpi_thread_level;

        return DALEC_SUCCESS;
    } else {
        /* Library has already been initialized. */
        return DALEC_SUCCESS;
    }
}
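The initializer above is reference-counted, so a matching finalizer is expected to decrement the count and release the duplicated communicator on the last call. What follows is only a minimal sketch of such a finalizer, assuming the same headers, DALECI_GLOBAL_STATE fields, and DALECI_Check_MPI helper as above; the real DALEC shutdown path may differ.

/* Hypothetical sketch of the matching finalizer; not the actual DALEC implementation. */
int PDALEC_Finalize(void)
{
    int dalec_alive = atomic_fetch_sub_explicit(&(DALECI_GLOBAL_STATE.alive), 1,
                                                memory_order_seq_cst);
    if (dalec_alive == 1) {
        /* Last matching call: release the internal communicator if MPI is still active. */
        int mpi_is_fin;
        MPI_Finalized(&mpi_is_fin);
        if (!mpi_is_fin) {
            int rc = MPI_Comm_free(&DALECI_GLOBAL_STATE.mpi_comm);
            return DALECI_Check_MPI("DALEC_Finalize", "MPI_Comm_free", rc);
        }
    }
    return DALEC_SUCCESS;
}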
bool init(int *ac, const char **av, bool useCommWorld)
{
    int initialized = false;
    MPI_CALL(Initialized(&initialized));

    int provided = 0;
    if (!initialized) {
        /* MPI not initialized by the app - it's up to us */
        MPI_CALL(Init_thread(ac, const_cast<char ***>(&av),
                             MPI_THREAD_MULTIPLE, &provided));
    } else {
        /* MPI was already initialized by the app that called us! */
        MPI_Query_thread(&provided);
    }
    if (provided != MPI_THREAD_MULTIPLE && provided != MPI_THREAD_SERIALIZED) {
        throw std::runtime_error("MPI initialization error: The MPI runtime must"
                                 " support either MPI_THREAD_MULTIPLE or"
                                 " MPI_THREAD_SERIALIZED.");
    }
    mpiIsThreaded = provided == MPI_THREAD_MULTIPLE;

    if (useCommWorld) {
        world.setTo(MPI_COMM_WORLD);
    }
    return !initialized;
}
threading_modes threading_mode() const
{
    int provided;
    MPI_Query_thread(&provided);
    switch (provided) {
        case MPI_THREAD_SINGLE:     return threading_modes::single;
        case MPI_THREAD_FUNNELED:   return threading_modes::funneled;
        case MPI_THREAD_SERIALIZED: return threading_modes::serialized;
        case MPI_THREAD_MULTIPLE:   return threading_modes::multiple;
    }
    return threading_modes::single; // make compiler happy
}
int main( int argc, char *argv[] )
{
    int provided, claimed;

    /*** Select one of the following
    MPI_Init_thread( 0, 0, MPI_THREAD_SINGLE,     &provided );
    MPI_Init_thread( 0, 0, MPI_THREAD_FUNNELED,   &provided );
    MPI_Init_thread( 0, 0, MPI_THREAD_SERIALIZED, &provided );
    MPI_Init_thread( 0, 0, MPI_THREAD_MULTIPLE,   &provided );
    ***/
    MPI_Init_thread( 0, 0, MPI_THREAD_MULTIPLE, &provided );

    MPI_Query_thread( &claimed );
    printf( "Query thread level= %d  Init_thread level= %d\n", claimed, provided );

    MPI_Finalize();
}
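A natural extension of this test program, not part of the original, is to fail fast when the queried level is below what the application needs. The standard guarantees MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE, so an integer comparison is valid; a minimal sketch, assuming MPI_THREAD_SERIALIZED is the requirement:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int provided, claimed;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    /* MPI_Query_thread reports the same level MPI_Init_thread granted. */
    MPI_Query_thread(&claimed);
    if (claimed < MPI_THREAD_SERIALIZED) {
        fprintf(stderr, "need at least MPI_THREAD_SERIALIZED, got level %d\n", claimed);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Finalize();
    return 0;
}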
void init_mpi (struct pe_vars * v)
{
    int mpi_provided;

    MPI_Init_thread( NULL, NULL, MPI_THREAD_SERIALIZED, &mpi_provided );
    MPI_Query_thread(&mpi_provided);
    if (strcmp((const char *)MPI_THREAD_STRING(mpi_provided), "WTF") == 0)
        MPI_Abort (MPI_COMM_WORLD, 5);

    MPI_Comm_rank( MPI_COMM_WORLD, &(v->me) );
    MPI_Comm_size( MPI_COMM_WORLD, &(v->npes) );

    v->pairs = v->npes / 2;
    v->nxtpe = ( v->me < v->pairs ) ? ( v->me + v->pairs ) : ( v->me - v->pairs );

    return;
}
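This snippet (and the SCF and collectives drivers further below) relies on an MPI_THREAD_STRING macro that is not shown here. A minimal sketch of such a helper, assuming it only needs to map the four standard levels to readable names, with "WTF" as the fallback the code above tests for:

/* Hypothetical helper assumed by the snippets that print or compare thread-support names. */
static const char * MPI_THREAD_STRING(int level)
{
    switch (level) {
        case MPI_THREAD_SINGLE:     return "SINGLE";
        case MPI_THREAD_FUNNELED:   return "FUNNELED";
        case MPI_THREAD_SERIALIZED: return "SERIALIZED";
        case MPI_THREAD_MULTIPLE:   return "MULTIPLE";
        default:                    return "WTF";
    }
}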
int pmrrr (char *jobz, char *range, int *np, double *D, double *E, double *vl, double *vu, int *il, int *iu, int *tryracp, MPI_Comm comm, int *nzp, int *offsetp, double *W, double *Z, int *ldz, int *Zsupp) { /* Input parameter */ int n = *np; bool onlyW = toupper(jobz[0]) == 'N'; bool wantZ = toupper(jobz[0]) == 'V'; bool cntval = toupper(jobz[0]) == 'C'; bool alleig = toupper(range[0]) == 'A'; bool valeig = toupper(range[0]) == 'V'; bool indeig = toupper(range[0]) == 'I'; /* Check input parameters */ if(!(onlyW || wantZ || cntval)) return 1; if(!(alleig || valeig || indeig)) return 1; if(n <= 0) return 1; if (valeig) { if(*vu<=*vl) return 1; } else if (indeig) { if (*il<1 || *il>n || *iu<*il || *iu>n) return 1; } /* MPI & multithreading info */ int is_init, is_final; MPI_Initialized(&is_init); MPI_Finalized(&is_final); if (is_init!=1 || is_final==1) { fprintf(stderr, "ERROR: MPI is not active! (init=%d, final=%d) \n", is_init, is_final); return 1; } MPI_Comm comm_dup; MPI_Comm_dup(comm, &comm_dup); int nproc, pid, thread_support; MPI_Comm_size(comm_dup, &nproc); MPI_Comm_rank(comm_dup, &pid); MPI_Query_thread(&thread_support); int nthreads; if ( !(thread_support == MPI_THREAD_MULTIPLE || thread_support == MPI_THREAD_FUNNELED) ) { /* Disable multithreading; note: to support multithreading with * MPI_THREAD_SERIALIZED the code must be changed slightly; this * is not supported at the moment */ nthreads = 1; } else { char *ompvar = getenv("PMR_NUM_THREADS"); if (ompvar == NULL) { nthreads = DEFAULT_NUM_THREADS; } else { nthreads = atoi(ompvar); } } #if defined(MVAPICH2_VERSION) if (nthreads>1) { int mv2_affinity=1; char *mv2_string = getenv("MV2_ENABLE_AFFINITY"); if (mv2_string != NULL) mv2_affinity = atoi(mv2_string); if (mv2_affinity!=0) { nthreads = 1; if (pid==0) { fprintf(stderr, "WARNING: PMRRR incurs a significant performance penalty when multithreaded with MVAPICH2 with affinity enabled. 
The number of threads has been reduced to one; please rerun with MV2_ENABLE_AFFINITY=0 or PMR_NUM_THREADS=1 in the future.\n"); fflush(stderr); } } } #endif /* If only maximal number of local eigenvectors are queried * return if possible here */ *nzp = 0; *offsetp = 0; if (cntval) { if ( alleig || n < DSTEMR_IF_SMALLER ) { *nzp = iceil(n,nproc); MPI_Comm_free(&comm_dup); return 0; } else if (indeig) { *nzp = iceil(*iu-*il+1,nproc); MPI_Comm_free(&comm_dup); return 0; } } /* Check if computation should be done by multiple processes */ int info; if (n < DSTEMR_IF_SMALLER) { info = handle_small_cases(jobz, range, np, D, E, vl, vu, il, iu, tryracp, comm, nzp, offsetp, W, Z, ldz, Zsupp); MPI_Comm_free(&comm_dup); return info; } /* Allocate memory */ double *Werr = (double*)malloc(n*sizeof(double)); assert(Werr!=NULL); double *Wgap = (double*)malloc(n*sizeof(double)); assert(Wgap!=NULL); double *gersch = (double*)malloc(2*n*sizeof(double)); assert(gersch!=NULL); int *iblock = (int*)calloc(n,sizeof(int)); assert(iblock!=NULL); int *iproc = (int*)malloc(n*sizeof(int)); assert(iproc!=NULL); int *Windex = (int*)malloc(n*sizeof(int)); assert(Windex!=NULL); int *isplit = (int*)malloc(n*sizeof(int)); assert(isplit!=NULL); int *Zindex = (int*)malloc(n*sizeof(int)); assert(Zindex!=NULL); proc_t *procinfo = (proc_t*)malloc(sizeof(proc_t)); assert(procinfo!=NULL); in_t *Dstruct = (in_t*)malloc(sizeof(in_t)); assert(Dstruct!=NULL); val_t *Wstruct = (val_t*)malloc(sizeof(val_t)); assert(Wstruct!=NULL); vec_t *Zstruct = (vec_t*)malloc(sizeof(vec_t)); assert(Zstruct!=NULL); tol_t *tolstruct = (tol_t*)malloc(sizeof(tol_t)); assert(tolstruct!=NULL); /* Bundle variables into a structures */ procinfo->pid = pid; procinfo->nproc = nproc; procinfo->comm = comm_dup; procinfo->nthreads = nthreads; procinfo->thread_support = thread_support; Dstruct->n = n; Dstruct->D = D; Dstruct->E = E; Dstruct->isplit = isplit; Wstruct->n = n; Wstruct->vl = vl; Wstruct->vu = vu; Wstruct->il = il; Wstruct->iu = iu; Wstruct->W = W; Wstruct->Werr = Werr; Wstruct->Wgap = Wgap; Wstruct->Windex = Windex; Wstruct->iblock = iblock; Wstruct->iproc = iproc; Wstruct->gersch = gersch; Zstruct->ldz = *ldz; Zstruct->nz = 0; Zstruct->Z = Z; Zstruct->Zsupp = Zsupp; Zstruct->Zindex = Zindex; /* Scale matrix to allowable range, returns 1.0 if not scaled */ double scale = scale_matrix(Dstruct, Wstruct, valeig); /* Test if matrix warrants more expensive computations which * guarantees high relative accuracy */ if (*tryracp) odrrr(&n, D, E, &info); /* 0 - rel acc */ else info = -1; int i; double *Dcopy, *E2copy; if (info == 0) { /* This case is extremely rare in practice */ tolstruct->split = DBL_EPSILON; /* Copy original data needed for refinement later */ Dcopy = (double*)malloc(n*sizeof(double)); assert(Dcopy!=NULL); memcpy(Dcopy, D, n*sizeof(double)); E2copy = (double*)malloc(n*sizeof(double)); assert(E2copy!=NULL); for (i=0; i<n-1; i++) E2copy[i] = E[i]*E[i]; } else { /* Neg. 
threshold forces old splitting criterion */ tolstruct->split = -DBL_EPSILON; *tryracp = 0; } if (!wantZ) { /* Compute eigenvalues to full precision */ tolstruct->rtol1 = 4.0 * DBL_EPSILON; tolstruct->rtol2 = 4.0 * DBL_EPSILON; } else { /* Do not compute to full accuracy first, but refine later */ tolstruct->rtol1 = sqrt(DBL_EPSILON); tolstruct->rtol1 = fmin(1e-2*MIN_RELGAP, tolstruct->rtol1); tolstruct->rtol2 = sqrt(DBL_EPSILON)*5.0E-3; tolstruct->rtol2 = fmin(5e-6*MIN_RELGAP, tolstruct->rtol2); tolstruct->rtol2 = fmax(4.0 * DBL_EPSILON, tolstruct->rtol2); } /* Compute all eigenvalues: sorted by block */ info = plarre(procinfo,jobz,range,Dstruct,Wstruct,tolstruct,nzp,offsetp); assert(info == 0); /* If just number of local eigenvectors are queried */ if (cntval & valeig) { clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); return 0; } /* If only eigenvalues are to be computed */ if (!wantZ) { /* Refine to high relative with respect to input T */ if (*tryracp) { info = refine_to_highrac (procinfo, jobz, Dcopy, E2copy, Dstruct, nzp, Wstruct, tolstruct); assert(info == 0); } /* Sort eigenvalues */ qsort(W, n, sizeof(double), cmp); /* Only keep subset ifirst:ilast */ int ifirst, ilast, isize; int iil = *il; int iiu = *iu; int ifirst_tmp=iil; for (i=0; i<nproc; i++) { int chunk = (iiu-iil+1)/nproc + (i < (iiu-iil+1)%nproc); int ilast_tmp; if (i == nproc-1) { ilast_tmp = iiu; } else { ilast_tmp = ifirst_tmp + chunk - 1; ilast_tmp = imin(ilast_tmp, iiu); } if (i == pid) { ifirst = ifirst_tmp; ilast = ilast_tmp; isize = ilast - ifirst + 1; *offsetp = ifirst - iil; *nzp = isize; } ifirst_tmp = ilast_tmp + 1; ifirst_tmp = imin(ifirst_tmp, iiu + 1); } if (isize > 0) { memmove(W, &W[ifirst-1], *nzp * sizeof(double)); } /* If matrix was scaled, rescale eigenvalues */ invscale_eigenvalues(Wstruct, scale, *nzp); clean_up (comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); return 0; } /* end of only eigenvalues to compute */ /* Compute eigenvectors */ info = plarrv(procinfo, Dstruct, Wstruct, Zstruct, tolstruct, nzp, offsetp); assert(info == 0); /* Refine to high relative with respect to input matrix */ if (*tryracp) { info = refine_to_highrac(procinfo, jobz, Dcopy, E2copy, Dstruct, nzp, Wstruct, tolstruct); assert(info == 0); } /* If matrix was scaled, rescale eigenvalues */ invscale_eigenvalues(Wstruct, scale, n); /* Make the first nz elements of W contains the eigenvalues * associated to the process */ int j, im=0; for (j=0; j<n; j++) { if (iproc[j] == pid) { W[im] = W[j]; Windex[im] = Windex[j]; Zindex[im] = Zindex[j]; im++; } } clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex, isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct); if (*tryracp) { free(Dcopy); free(E2copy); } return 0; } /* end pmrrr */
int main(int argc, char *argv[]) { nptsside = atoi(argv[1]); print_node = atoi(argv[2]); nodenum = (int) atoi(argv[3]); ppnnum = (int) atoi(argv[4]); tasktype = argv[5][0]; side2 = nptsside / 2.0; side4 = nptsside / 4.0; int provided, claimed; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided ); MPI_Query_thread( &claimed ); // MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nnodes); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); if (my_rank == print_node) { printf( "Query thread level= %d Init_thread level= %d\n", claimed, provided ); printf( "Defined LEVEL= %d (ompi_info | grep -i thread) \n", MPI_THREAD_MULTIPLE); } mpi_chunksize = nptsside/nnodes; struct timespec bgn,nd; clock_gettime(CLOCK_REALTIME, &bgn); #ifdef RC scram = rpermute(nptsside, 0); MPI_Scatter(scram, mpi_chunksize, MPI_INT, scram, mpi_chunksize, MPI_INT, 0, MPI_COMM_WORLD); #else findmyrange(nptsside, nnodes, my_rank, myrange); scram = rpermute(mpi_chunksize, myrange[0]); printf("My range is %d %d \n", myrange[0], myrange[1]); #endif dowork(); //implied barrier clock_gettime(CLOCK_REALTIME, &nd); if (my_rank == print_node) { printf("Random chunk RC is defined\n"); printf("time:%g :: maxiteration:%d :: Pixel:%d :: threadnum:%d :: mpi_chunksize:%d :: tot_count:%d :: count:%d :: nodenum:%d :: ppnnum:%d :: tasktype:%c\n", timediff(bgn,nd),MAXITERS, nptsside,nnodes,mpi_chunksize,tot_count,count,nodenum,ppnnum,tasktype); FILE *fp; fp = fopen("OMP_MPI_mand.txt","a"); fprintf(fp, "time:%g :: maxiteration:%d :: Pixel:%d :: threadnum:%d :: mpi_chunksize:%d :: tot_count:%d :: count:%d :: nodenum:%d :: ppnnum:%d :: tasktype:%c\n", timediff(bgn,nd),MAXITERS,nptsside,nnodes,mpi_chunksize,tot_count,count,nodenum,ppnnum,tasktype); fclose(fp); } MPI_Finalize(); }
FORT_DLL_SPEC void FORT_CALL mpi_query_thread_ ( MPI_Fint *v1, MPI_Fint *ierr )
{
    *ierr = MPI_Query_thread( (int *)v1 );
}
/// main for SCF int main (int argc, char **argv) { // init MPI int myrank; int nprocs; int provided; #if defined (USE_ELEMENTAL) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &myrank ); ElMPICommSize( MPI_COMM_WORLD, &nprocs ); MPI_Query_thread(&provided); #else MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); #endif if (myrank == 0) { printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided)); } #if 0 char hostname[1024]; gethostname (hostname, 1024); printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname); #endif // create basis set BasisSet_t basis; CInt_createBasisSet(&basis); // input parameters and load basis set int nprow_fock; int npcol_fock; int nblks_fock; int nprow_purif; int nshells; int natoms; int nfunctions; int niters; if (myrank == 0) { if (argc != 8) { usage(argv[0]); MPI_Finalize(); exit(0); } // init parameters nprow_fock = atoi(argv[3]); npcol_fock = atoi(argv[4]); nprow_purif = atoi(argv[5]); nblks_fock = atoi(argv[6]); niters = atoi(argv[7]); assert(nprow_fock * npcol_fock == nprocs); assert(nprow_purif * nprow_purif * nprow_purif <= nprocs); assert(niters > 0); CInt_loadBasisSet(basis, argv[1], argv[2]); nshells = CInt_getNumShells(basis); natoms = CInt_getNumAtoms(basis); nfunctions = CInt_getNumFuncs(basis); assert(nprow_fock <= nshells && npcol_fock <= nshells); assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions); printf("Job information:\n"); char *fname; fname = basename(argv[2]); printf(" molecule: %s\n", fname); fname = basename(argv[1]); printf(" basisset: %s\n", fname); printf(" charge = %d\n", CInt_getTotalCharge(basis)); printf(" #atoms = %d\n", natoms); printf(" #shells = %d\n", nshells); printf(" #functions = %d\n", nfunctions); printf(" fock build uses %d (%dx%d) nodes\n", nprow_fock * npcol_fock, nprow_fock, npcol_fock); printf(" purification uses %d (%dx%dx%d) nodes\n", nprow_purif * nprow_purif * nprow_purif, nprow_purif, nprow_purif, nprow_purif); printf(" #tasks = %d (%dx%d)\n", nblks_fock * nblks_fock * nprow_fock * nprow_fock, nblks_fock * nprow_fock, nblks_fock * nprow_fock); int nthreads = omp_get_max_threads(); printf(" #nthreads_cpu = %d\n", nthreads); } int btmp[8]; btmp[0] = nprow_fock; btmp[1] = npcol_fock; btmp[2] = nprow_purif; btmp[3] = nblks_fock; btmp[4] = niters; btmp[5] = natoms; btmp[6] = nshells; btmp[7] = nfunctions; MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD); nprow_fock = btmp[0]; npcol_fock = btmp[1]; nprow_purif = btmp[2]; nblks_fock = btmp[3]; niters = btmp[4]; natoms = btmp[5]; nshells = btmp[6]; nfunctions = btmp[7]; // broadcast basis set void *bsbuf; int bsbufsize; if (myrank == 0) { CInt_packBasisSet(basis, &bsbuf, &bsbufsize); MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD); } else { MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD); bsbuf = (void *)malloc(bsbufsize); assert(bsbuf != NULL); MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD); CInt_unpackBasisSet(basis, bsbuf); free(bsbuf); } // init PFock if (myrank == 0) { printf("Initializing pfock ...\n"); } PFock_t pfock; PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11, MAX_NUM_D, IS_SYMM, &pfock); if (myrank == 0) { double mem_cpu; PFock_getMemorySize(pfock, &mem_cpu); printf(" CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0); printf(" Done\n"); } // init purif purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif); 
init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock); // compute SCF if (myrank == 0) { printf("Computing SCF ...\n"); } int rowstart = purif->srow_purif; int rowend = purif->nrows_purif + rowstart - 1; int colstart = purif->scol_purif; int colend = purif->ncols_purif + colstart - 1; double energy0 = -1.0; double totaltime = 0.0; double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions; double diis_flops; // set initial guess if (myrank == 0) { printf(" initialing D ...\n"); } PFock_setNumDenMat(NUM_D, pfock); initial_guess(pfock, basis, purif->runpurif, rowstart, rowend, colstart, colend, purif->D_block, purif->ldx); MPI_Barrier(MPI_COMM_WORLD); // compute nuc energy double ene_nuc = CInt_getNucEnergy(basis); if (myrank == 0) { printf(" nuc energy = %.10f\n", ene_nuc); } MPI_Barrier(MPI_COMM_WORLD); // main loop double t1, t2, t3, t4; for (int iter = 0; iter < niters; iter++) { if (myrank == 0) { printf(" iter %d\n", iter); } t3 = MPI_Wtime(); // fock matrix construction t1 = MPI_Wtime(); fock_build(pfock, basis, purif->runpurif, rowstart, rowend, colstart, colend, purif->ldx, purif->D_block, purif->F_block); if (myrank == 0) { printf("After fock build \n"); } // compute energy double energy = compute_energy(purif, purif->F_block, purif->D_block); t2 = MPI_Wtime(); if (myrank == 0) { printf(" fock build takes %.3f secs\n", t2 - t1); if (iter > 0) { printf(" energy %.10f (%.10f), %le\n", energy + ene_nuc, energy, fabs (energy - energy0)); } else { printf(" energy %.10f (%.10f)\n", energy + ene_nuc, energy); } } if (iter > 0 && fabs (energy - energy0) < 1e-11) { niters = iter + 1; break; } energy0 = energy; // compute DIIS t1 = MPI_Wtime(); compute_diis(pfock, purif, purif->D_block, purif->F_block, iter); t2 = MPI_Wtime(); if (myrank == 0) { if (iter > 1) { diis_flops = purif_flops * 6.0; } else { diis_flops = purif_flops * 2.0; } printf(" diis takes %.3f secs, %.3lf Gflops\n", t2 - t1, diis_flops / (t2 - t1) / 1e9); } #ifdef __SCF_OUT__ if (myrank == 0) { double outbuf[nfunctions]; char fname[1024]; sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter); FILE *fp = fopen(fname, "w+"); assert(fp != NULL); for (int i = 0; i < nfunctions; i++) { PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID, i, i, USE_D_ID, nfunctions - 1, outbuf, nfunctions); for (int j = 0; j < nfunctions; j++) { fprintf(fp, "%.10e\n", outbuf[j]); } } fclose(fp); } #endif // purification MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); int it = compute_purification(purif, purif->F_block, purif->D_block); t2 = MPI_Wtime(); MPI_Barrier(MPI_COMM_WORLD); if (myrank == 0) { printf(" purification takes %.3f secs," " %d iterations, %.3f Gflops\n", t2 - t1, it, (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9); } /* #if defined(USE_ELEMENTAL) ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] ); #else GA_Print (pfock->ga_D[USE_D_ID]); #endif */ t4 = MPI_Wtime (); totaltime += t4 - t3; #ifdef __SCF_TIMING__ PFock_getStatistics(pfock); double purif_timedgemm; double purif_timepdgemm; double purif_timepass; double purif_timetr; MPI_Reduce(&purif->timedgemm, &purif_timedgemm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timepass, &purif_timepass, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timetr, &purif_timetr, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if (myrank == 0) { printf(" Purification Statistics:\n"); printf(" average totaltime = %.3f\n" " average timetr = %.3f\n" " average timedgemm = 
%.3f, %.3f Gflops\n" " average timepdgemm = %.3f, %.3f Gflops\n", purif_timepass / purif->np_purif, purif_timetr / purif->np_purif, purif_timedgemm / purif->np_purif, (it * 2.0 + 4.0) * purif_flops / (purif_timedgemm / purif->np_purif) / 1e9, purif_timepdgemm / purif->np_purif, (it * 2.0 + 4.0) * purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9); } #endif } /* for (iter = 0; iter < NITERATIONS; iter++) */ if (myrank == 0) { printf(" totally takes %.3f secs: %.3f secs/iters\n", totaltime, totaltime / niters); printf(" Done\n"); } destroy_purif(purif); PFock_destroy(pfock); CInt_destroyBasisSet(basis); MPI_Finalize(); return 0; }
void myhbwmalloc_init(void) { /* set to NULL before trying to initialize. if we return before * successful creation of the mspace, then it will still be NULL, * and we can use that in subsequent library calls to determine * that the library failed to initialize. */ myhbwmalloc_mspace = NULL; /* verbose printout? */ myhbwmalloc_verbose = 0; { char * env_char = getenv("HBWMALLOC_VERBOSE"); if (env_char != NULL) { myhbwmalloc_verbose = 1; printf("hbwmalloc: HBWMALLOC_VERBOSE set\n"); } } /* fail hard or soft? */ myhbwmalloc_hardfail = 1; { char * env_char = getenv("HBWMALLOC_SOFTFAIL"); if (env_char != NULL) { myhbwmalloc_hardfail = 0; printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n"); } } /* set the atexit handler that will destroy the mspace and free the numa allocation */ atexit(myhbwmalloc_final); /* detect and configure use of NUMA memory nodes */ { int max_possible_node = numa_max_possible_node(); int num_possible_nodes = numa_num_possible_nodes(); int max_numa_nodes = numa_max_node(); int num_configured_nodes = numa_num_configured_nodes(); int num_configured_cpus = numa_num_configured_cpus(); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_max_possible_node() = %d\n", max_possible_node); printf("hbwmalloc: numa_num_possible_nodes() = %d\n", num_possible_nodes); printf("hbwmalloc: numa_max_node() = %d\n", max_numa_nodes); printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes); printf("hbwmalloc: numa_num_configured_cpus() = %d\n", num_configured_cpus); } /* FIXME this is a hack. assumes HBW is only numa node 1. */ if (num_configured_nodes <= 2) { myhbwmalloc_numa_node = num_configured_nodes-1; } else { fprintf(stderr,"hbwmalloc: we support only 2 numa nodes, not %d\n", num_configured_nodes); } if (myhbwmalloc_verbose) { for (int i=0; i<num_configured_nodes; i++) { unsigned max_numa_cpus = numa_num_configured_cpus(); struct bitmask * mask = numa_bitmask_alloc( max_numa_cpus ); int rc = numa_node_to_cpus(i, mask); if (rc != 0) { fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n"); } else { printf("hbwmalloc: numa node %d cpu mask:", i); for (unsigned j=0; j<max_numa_cpus; j++) { int bit = numa_bitmask_isbitset(mask,j); printf(" %d", bit); } printf("\n"); } numa_bitmask_free(mask); } fflush(stdout); } } #if 0 /* unused */ /* see if the user specifies a slab size */ size_t slab_size_requested = 0; { char * env_char = getenv("HBWMALLOC_BYTES"); if (env_char!=NULL) { long units = 1L; if ( NULL != strstr(env_char,"G") ) units = 1000000000L; else if ( NULL != strstr(env_char,"M") ) units = 1000000L; else if ( NULL != strstr(env_char,"K") ) units = 1000L; else units = 1L; int num_count = strspn(env_char, "0123456789"); memset( &env_char[num_count], ' ', strlen(env_char)-num_count); slab_size_requested = units * atol(env_char); } if (myhbwmalloc_verbose) { printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested); } } #endif /* see what libnuma says is available */ size_t myhbwmalloc_slab_size; { int node = myhbwmalloc_numa_node; long long freemem; long long maxmem = numa_node_size64(node, &freemem); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n", maxmem, freemem, node); } myhbwmalloc_slab_size = freemem; } /* assume threads, disable if MPI knows otherwise, then allow user to override. 
*/ int multithreaded = 1; #ifdef HAVE_MPI int nprocs; { int is_init, is_final; MPI_Initialized(&is_init); MPI_Finalized(&is_final); if (is_init && !is_final) { MPI_Comm_size(MPI_COMM_WORLD, &nprocs); } /* give equal portion to every MPI process */ myhbwmalloc_slab_size /= nprocs; /* if the user initializes MPI with MPI_Init or * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there * are no threads at all, which means we can skip the * malloc mspace lock. * * if the user lies to MPI, they deserve any bad thing * that comes of it. */ int provided; MPI_Query_thread(&provided); if (provided==MPI_THREAD_SINGLE) { multithreaded = 0; } else { multithreaded = 1; } if (myhbwmalloc_verbose) { printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded); printf("hbwmalloc: myhbwmalloc_slab_size = %d\n", myhbwmalloc_slab_size); } } #endif /* user can assert that hbwmalloc and friends need not be thread-safe */ { char * env_char = getenv("HBWMALLOC_LOCKLESS"); if (env_char != NULL) { multithreaded = 0; if (myhbwmalloc_verbose) { printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n"); } } } myhbwmalloc_slab = numa_alloc_onnode( myhbwmalloc_slab_size, myhbwmalloc_numa_node); if (myhbwmalloc_slab==NULL) { fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n", myhbwmalloc_slab_size); return; } else { if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size); } /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping, * so the capacity must be at least this large */ if (myhbwmalloc_slab_size < 128*sizeof(size_t)) { fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n"); return; } /* see above regarding if the user lies to MPI. */ int locked = multithreaded; myhbwmalloc_mspace = create_mspace_with_base( myhbwmalloc_slab, myhbwmalloc_slab_size, locked); if (myhbwmalloc_mspace == NULL) { fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n"); return; } else if (myhbwmalloc_verbose) { printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n", myhbwmalloc_slab_size); } } }
int A1D_Initialize()
{
    int mpi_initialized, mpi_provided;
    int mpi_status;
    int i;
    size_t bytes_in, bytes_out;
    DCMF_Result dcmf_result;
    DCMF_Configure_t dcmf_config;
    DCMF_Memregion_t local_memregion;

    /***************************************************
     * configure MPI
     ***************************************************/

    /* MPI has to be initialized for this implementation to work */
    MPI_Initialized(&mpi_initialized);
    assert(mpi_initialized==1);

    /* MPI has to be thread-safe so that DCMF doesn't explode */
    MPI_Query_thread(&mpi_provided);
    assert(mpi_provided==MPI_THREAD_MULTIPLE);

    /* have to use our own communicator for collectives to be proper */
    mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD);
    assert(mpi_status==0);

    /* get my MPI rank */
    mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&myrank);
    assert(mpi_status==0);

    /* get MPI world size */
    mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size);
    assert(mpi_status==0);

    /* make sure MPI and DCMF agree */
    assert(myrank==DCMF_Messager_rank());
    assert(mpi_size==DCMF_Messager_size());

    /* barrier before DCMF_Messager_configure to make sure MPI is ready everywhere */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==0);

    /***************************************************
     * configure DCMF
     ***************************************************/

    /* to be safe, but perhaps not necessary */
    dcmf_config.thread_level = DCMF_THREAD_MULTIPLE;

#ifdef ACCUMULATE_IMPLEMENTED
    /* interrupts required for accumulate only, Put/Get use DMA
     * if accumulate not used, MPI will query environment for DCMF_INTERRUPTS */
    dcmf_config.interrupts = DCMF_INTERRUPTS_ON;
#endif

    /* reconfigure DCMF with interrupts on */
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Messager_configure(&dcmf_config, &dcmf_config);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* barrier after DCMF_Messager_configure to make sure everyone has the new DCMF config */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==0);

    /***************************************************
     * setup DCMF memregions
     ***************************************************/

    /* allocate memregion list */
    A1D_Memregion_list = malloc( mpi_size * sizeof(DCMF_Memregion_t) );
    assert(A1D_Memregion_list != NULL);

    /* allocate base pointer list */
    A1D_Baseptr_list = malloc( mpi_size * sizeof(void*) );
    assert(A1D_Baseptr_list != NULL);

    /* create memregions */
    bytes_in = -1;
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Memregion_create(&local_memregion,&bytes_out,bytes_in,NULL,0);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* exchange memregions because we don't use symmetry heap */
    mpi_status = MPI_Allgather(&local_memregion,sizeof(DCMF_Memregion_t),MPI_BYTE,
                               A1D_Memregion_list,sizeof(DCMF_Memregion_t),MPI_BYTE,
                               A1D_COMM_WORLD);
    assert(mpi_status==0);

    /* destroy temporary local memregion */
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Memregion_destroy(&local_memregion);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* check for valid memregions */
    DCMF_CriticalSection_enter(0);
    for (i = 0; i < mpi_size; i++)
    {
        dcmf_result = DCMF_Memregion_query(&A1D_Memregion_list[i],
                                           &bytes_out,
                                           &A1D_Baseptr_list[i]);
        assert(dcmf_result==DCMF_SUCCESS);
    }
    DCMF_CriticalSection_exit(0);

#ifdef FLUSH_IMPLEMENTED
    /***************************************************
     * setup flush list(s)
     ***************************************************/

    /* allocate Put list */
    A1D_Put_flush_list = malloc( mpi_size * sizeof(int) );
    assert(A1D_Put_flush_list != NULL);

#ifdef ACCUMULATE_IMPLEMENTED
    /* allocate Acc list */
    A1D_Send_flush_list = malloc( mpi_size * sizeof(int) );
    assert(A1D_Send_flush_list != NULL);
#endif
#endif

    /***************************************************
     * define null callback
     ***************************************************/

    A1D_Nocallback.function = NULL;
    A1D_Nocallback.clientdata = NULL;

    return(0);
}
int A1D_Initialize() { #ifdef DMAPPD_USES_MPI int mpi_initialized, mpi_provided; int mpi_status = MPI_SUCCESS; int namelen; char procname[MPI_MAX_PROCESSOR_NAME]; #endif #ifdef __CRAYXE int pmi_status = PMI_SUCCESS; int nodeid = -1; rca_mesh_coord_t rca_xyz; dmapp_return_t dmapp_status = DMAPP_RC_SUCCESS; dmapp_rma_attrs_ext_t dmapp_config_in, dmapp_config_out; dmapp_jobinfo_t dmapp_info; dmapp_pe_t dmapp_rank = -1; int dmapp_size = -1; #endif int sheapflag = 0; #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"entering A1D_Initialize() \n"); #endif #ifdef DMAPPD_USES_MPI /*************************************************** * * configure MPI * ***************************************************/ /* MPI has to be Initialized for this implementation to work */ MPI_Initialized(&mpi_initialized); assert(mpi_initialized==1); /* MPI has to tolerate threads because A1 supports them */ MPI_Query_thread(&mpi_provided); //assert(mpi_provided>MPI_THREAD_SINGLE); /* have to use our own communicator for collectives to be proper */ mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD); assert(mpi_status==0); /* get my MPI rank */ mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&mpi_rank); assert(mpi_status==0); /* get MPI world size */ mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size); assert(mpi_status==0); /* in a perfect world, this would provide topology information like BG */ MPI_Get_processor_name( procname, &namelen ); printf( "%d: MPI_Get_processor_name = %s\n" , mpi_rank, procname ); fflush( stdout ); /* barrier to make sure MPI is ready everywhere */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); #endif #ifdef __CRAYXE /*************************************************** * * query topology * ***************************************************/ PMI_Get_nid( mpi_rank, &nodeid ); assert(pmi_status==PMI_SUCCESS); rca_get_meshcoord((uint16_t)nodeid, &rca_xyz); printf("%d: rca_get_meshcoord returns (%2u,%2u,%2u)\n", mpi_rank, rca_xyz.mesh_x, rca_xyz.mesh_y, rca_xyz.mesh_z ); #endif #ifdef __CRAYXE /*************************************************** * * configure DMAPP * ***************************************************/ dmapp_config_in.max_outstanding_nb = DMAPP_DEF_OUTSTANDING_NB; /* 512 */ dmapp_config_in.offload_threshold = DMAPP_OFFLOAD_THRESHOLD; /* 4096 */ #ifdef DETERMINISTIC_ROUTING dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; #else dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; #endif dmapp_config_in.max_concurrency = 1; /* not thread-safe */ #ifdef FLUSH_IMPLEMENTED dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_RELAXED; #else dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_STRICT; #endif dmapp_status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out ); assert(dmapp_status==DMAPP_RC_SUCCESS); #ifndef FLUSH_IMPLEMENTED /* without strict PI ordering, we have to flush remote stores with a get packet to force global visibility */ assert( dmapp_config_out.PI_ordering == DMAPP_PI_ORDERING_STRICT); #endif dmapp_status = dmapp_get_jobinfo(&dmapp_info); assert(dmapp_status==DMAPP_RC_SUCCESS); dmapp_rank = dmapp_info.pe; dmapp_size = dmapp_info.npes; A1D_Sheap_desc = dmapp_info.sheap_seg; /* make sure PMI and DMAPP agree */ assert(mpi_rank==dmapp_rank); assert(mpi_size==dmapp_size); #endif /*************************************************** * * setup protocols * 
***************************************************/ #ifdef FLUSH_IMPLEMENTED /* allocate Put list */ A1D_Put_flush_list = malloc( mpi_size * sizeof(int32_t) ); assert(A1D_Put_flush_list != NULL); #endif #ifdef __CRAYXE A1D_Acc_lock = dmapp_sheap_malloc( sizeof(int64_t) ); #endif A1D_Allreduce_issame64((size_t)A1D_Acc_lock, &sheapflag); assert(sheapflag==1); #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"exiting A1D_Initialize() \n"); #endif return(0); }
int main(int argc, char *argv[]) { /********************************************************************************* * INITIALIZE MPI *********************************************************************************/ int world_size = 0, world_rank = -1; int provided = -1; #if defined(USE_MPI_INIT) MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); if (world_rank==0) print_meminfo(stdout, "after MPI_Init"); #else int requested = -1; # if defined(USE_MPI_INIT_THREAD_MULTIPLE) requested = MPI_THREAD_MULTIPLE; # elif defined(USE_MPI_INIT_THREAD_SERIALIZED) requested = MPI_THREAD_SERIALIZED; # elif defined(USE_MPI_INIT_THREAD_FUNNELED) requested = MPI_THREAD_FUNNELED; # else requested = MPI_THREAD_SINGLE; # endif MPI_Init_thread( &argc, &argv, requested, &provided ); MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); if (world_rank==0) print_meminfo(stdout, "after MPI_Init_thread"); if (provided>requested) { if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s, but this is okay. \n", MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) ); } if (provided<requested) { if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s so the test will exit. \n", MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) ); MPI_Abort(MPI_COMM_WORLD, 1); } #endif double t0 = MPI_Wtime(); int is_init = 0; MPI_Initialized(&is_init); if (world_rank==0) printf("MPI %s initialized. \n", (is_init==1 ? "was" : "was not") ); MPI_Query_thread(&provided); if (world_rank==0) printf("MPI thread support is %s. \n", MPI_THREAD_STRING(provided) ); MPI_Comm_size( MPI_COMM_WORLD, &world_size ); if (world_rank==0) printf("MPI test program running on %d ranks. \n", world_size); char procname[MPI_MAX_PROCESSOR_NAME]; int pnlen; MPI_Get_processor_name(procname,&pnlen); printf("%d: processor name = %s\n", world_rank, procname); /********************************************************************************* * SETUP MPI COMMUNICATORS *********************************************************************************/ if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 1 \n"); MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("MPI_Comm_dup of MPI_COMM_WORLD \n"); MPI_Comm comm_world_dup; MPI_Comm_dup(MPI_COMM_WORLD, &comm_world_dup); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_dup"); if (world_rank==0) printf("MPI_Barrier on comm_world_dup \n"); MPI_Barrier( comm_world_dup ); if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into world_reordered \n"); MPI_Comm comm_world_reordered; MPI_Comm_split(MPI_COMM_WORLD, 0, world_size-world_rank, &comm_world_reordered); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_split"); if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into left-right \n"); MPI_Comm comm_world_leftright; int leftright = (world_rank<(world_size/2)); MPI_Comm_split(MPI_COMM_WORLD, leftright, world_rank, &comm_world_leftright); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_split"); if (world_rank==0) printf("MPI_Barrier on comm_world_leftright \n"); MPI_Barrier( comm_world_leftright ); if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into odd-even \n"); MPI_Comm comm_world_oddeven; int oddeven = (world_rank%2); MPI_Comm_split(MPI_COMM_WORLD, oddeven, world_rank, &comm_world_oddeven); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_split"); if (world_rank==0) printf("MPI_Barrier on comm_world_oddeven \n"); MPI_Barrier( comm_world_oddeven ); if (world_rank==0) printf("MPI_Comm_split 
MPI_COMM_WORLD into (world-1) \n"); MPI_Comm comm_world_minus_one; int left_out = world_rank==(world_size/2); MPI_Comm_split(MPI_COMM_WORLD, left_out, world_rank, &comm_world_minus_one); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_split"); if (world_rank==0) printf("MPI_Barrier on comm_world_minus_one \n"); MPI_Barrier( comm_world_minus_one ); if (world_rank==0) printf("MPI_Comm_group of group_world from MPI_COMM_WORLD \n"); MPI_Group group_world; MPI_Comm_group(MPI_COMM_WORLD, &group_world); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_group"); int geomprog_size = (world_size==1) ? 1 : ceil(log2(world_size)); int * geomprog_list = NULL; geomprog_list = (int *) safemalloc( geomprog_size * sizeof(int) ); for (int i=0; i<geomprog_size; i++) geomprog_list[i] = pow(2,i)-1; if (world_rank==0) for (int i=0; i<geomprog_size; i++) if (world_rank==0) printf("geomprog_list[%d] = %d \n", i, geomprog_list[i]); if (world_rank==0) printf("MPI_Group_incl of group_geomprog (geometric progression) from group_world \n"); MPI_Group group_geomprog; MPI_Group_incl(group_world, geomprog_size, geomprog_list, &group_geomprog); MPI_Group_free(&group_world); if (world_rank==0) printf("MPI_Comm_create of comm_geomprog from group_geomprog on MPI_COMM_WORLD \n"); MPI_Comm comm_geomprog; MPI_Comm_create(MPI_COMM_WORLD, group_geomprog, &comm_geomprog); MPI_Group_free(&group_geomprog); if (world_rank==0) print_meminfo(stdout, "after MPI_Comm_create"); if (world_rank==0) printf("MPI_Barrier on comm_geomprog \n"); for (int i=0; i<geomprog_size; i++) if (geomprog_list[i]==world_rank) MPI_Barrier( comm_geomprog ); if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 2 \n"); MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) print_meminfo(stdout, "after MPI communicator creation"); /********************************************************************************* * COLLECTIVES *********************************************************************************/ int max_mem = (argc>1 ? 
atoi(argv[1]) : 32*1024*1024); MPI_Comm test_comm; #if defined(DO_COMM_WORLD) test_comm = MPI_COMM_WORLD; MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 1" ); { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 2" ); { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef DO_COMM_WORLD_JITTER test_comm = MPI_COMM_WORLD; MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "COMM_WORLD_JITTER" ); { int jitter = 0; if ((world_rank%10)==0) jitter++; if ((world_rank%100)==0) jitter++; if ((world_rank%1000)==0) jitter++; if ((world_rank%10000)==0) jitter++; if ((world_rank%100000)==0) jitter++; MPI_Barrier( test_comm ); sleep(jitter); bcast_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); gather_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); allgather_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); scatter_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); alltoall_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); reduce_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); allreduce_only(stdout, test_comm, max_mem); MPI_Barrier( test_comm ); sleep(jitter); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef DO_COMM_WORLD_DUP test_comm = comm_world_dup; MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "COMM_WORLD_DUP" ); { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef DO_WORLD_REORDERED test_comm = comm_world_reordered; MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "WORLD_REORDERED" ); { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef DO_WORLD_MINUS_ONE test_comm = comm_world_minus_one; 
MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "WORLD_MINUS_ONE" ); if (left_out==0) { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #if DO_LEFT_RIGHT test_comm = comm_world_leftright; for (int i=0; i<2; i++) { MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==i) printf("############## %s ##############\n", (i==0 ? "LEFT" : "RIGHT") ); if (leftright==i) { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #if DO_ODD_EVEN test_comm = comm_world_oddeven; for (int i=0; i<2; i++) { MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==i) printf("############## %s ##############\n", (i==0 ? "EVEN" : "ODD") ); if (oddeven==i) { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef DO_GEOM_PROG test_comm = comm_geomprog; MPI_Barrier( MPI_COMM_WORLD ); if (world_rank==0) printf("############## %s ##############\n", "GEOM_PROG" ); for (int i=0; i<geomprog_size; i++) if (geomprog_list[i]==world_rank) { MPI_Barrier( test_comm ); bcast_only(stdout, test_comm, max_mem); gather_only(stdout, test_comm, max_mem); allgather_only(stdout, test_comm, max_mem); scatter_only(stdout, test_comm, max_mem); alltoall_only(stdout, test_comm, max_mem); reduce_only(stdout, test_comm, max_mem); allreduce_only(stdout, test_comm, max_mem); reducescatterblock_only(stdout, test_comm, max_mem); } fflush(stdout); MPI_Barrier( MPI_COMM_WORLD ); #endif if (world_rank==0) print_meminfo(stdout, "after MPI collective tests"); /********************************************************************************* * CLEAN UP AND FINALIZE *********************************************************************************/ for (int i=0; i<geomprog_size; i++) if (geomprog_list[i]==world_rank) MPI_Comm_free(&comm_geomprog); free(geomprog_list); MPI_Comm_free(&comm_world_minus_one); MPI_Comm_free(&comm_world_oddeven); MPI_Comm_free(&comm_world_leftright); MPI_Comm_free(&comm_world_reordered); MPI_Comm_free(&comm_world_dup); MPI_Barrier( MPI_COMM_WORLD ); double t1 = MPI_Wtime(); double dt = t1-t0; if (world_rank==0) printf("TEST FINISHED SUCCESSFULLY IN %lf SECONDS \n", dt); fflush(stdout); if (world_rank==0) print_meminfo(stdout, "before MPI_Finalize"); MPI_Finalize(); return 0; }