Example #1
JNIEXPORT jint JNICALL Java_mpi_MPI_queryThread_1jni(JNIEnv *env, jclass clazz)
{
    int provided;
    int rc = MPI_Query_thread(&provided);
    ompi_java_exceptionCheck(env, rc);
    return provided;
}
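For reference, MPI_Query_thread reports the thread-support level the MPI library is currently providing (the same value MPI_Init_thread returns through its "provided" argument). A minimal standalone sketch of the call, not taken from any of the projects below:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);

    /* Ask the library which level it actually granted. */
    int level;
    MPI_Query_thread(&level);
    if (level < MPI_THREAD_FUNNELED)
        printf("Warning: MPI_THREAD_FUNNELED was not granted\n");

    MPI_Finalize();
    return 0;
}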
Example #2
/** Initialize DALEC.  MPI must be initialized before this can be called.  It
  * is invalid to make DALEC calls before initialization.  Collective on the world
  * group.
  *
  * @return            Zero on success
  */
int PDALEC_Initialize(MPI_Comm user_comm)
{
    int dalec_alive = atomic_fetch_sub_explicit(&(DALECI_GLOBAL_STATE.alive),
                                                1,memory_order_seq_cst);
    if (dalec_alive == 0) {
        /* Initialize, since this is the first call to this function. */
        int mpi_is_init, mpi_is_fin;
        MPI_Initialized(&mpi_is_init);
        MPI_Finalized(&mpi_is_fin);
        if (!mpi_is_init || mpi_is_fin) {
            DALECI_Warning("MPI must be active when calling DALEC_Initialize");
            return DALEC_ERROR_MPI_USAGE;
        }

        /* Always dupe the user communicator for internal usage. */
        /* Do not abort on MPI failure, let user handle if MPI does not abort. */
        int rc = MPI_Comm_dup(user_comm, &DALECI_GLOBAL_STATE.mpi_comm);
        int err = DALECI_Check_MPI("DALEC_Initialize", "MPI_Comm_dup", rc);
        if (err != DALEC_SUCCESS) return err;

        /* Determine what level of threading MPI supports. */
        int mpi_thread_level;
        MPI_Query_thread(&mpi_thread_level);
        DALECI_GLOBAL_STATE.mpi_thread_level = mpi_thread_level;

        return DALEC_SUCCESS;

    } else {
        /* Library has already been initialized. */
        return DALEC_SUCCESS;
    }
}
Example #3
  bool init(int *ac, const char **av, bool useCommWorld)
  {
    int initialized = false;
    MPI_CALL(Initialized(&initialized));

    int provided = 0;
    if (!initialized) {
      /* MPI not initialized by the app - it's up to us */
      MPI_CALL(Init_thread(ac, const_cast<char ***>(&av),
                           MPI_THREAD_MULTIPLE, &provided));
    } else {
      /* MPI was already initialized by the app that called us! */
      MPI_Query_thread(&provided);
    }
    if (provided != MPI_THREAD_MULTIPLE && provided != MPI_THREAD_SERIALIZED) {
      throw std::runtime_error("MPI initialization error: The MPI runtime must"
                               " support either MPI_THREAD_MULTIPLE or"
                               " MPI_THREAD_SERIALIZED.");
    }
    mpiIsThreaded = provided == MPI_THREAD_MULTIPLE;

    if (useCommWorld) {
      world.setTo(MPI_COMM_WORLD);
    }
    return !initialized;
  }
Example #4
	threading_modes threading_mode() const {
	  int provided;
	  MPI_Query_thread(&provided);
	  switch (provided) {
	    case MPI_THREAD_SINGLE:
	      return threading_modes::single;
	    case MPI_THREAD_FUNNELED:
	      return threading_modes::funneled;
	    case MPI_THREAD_SERIALIZED:
	      return threading_modes::serialized;
	    case MPI_THREAD_MULTIPLE:
	      return threading_modes::multiple;
	  }
	  return threading_modes::single;  // make compiler happy
	}
Example #5
int main( int argc, char *argv[] )
{
  int provided, claimed;
 
  /*** Select one of the following
       MPI_Init_thread( 0, 0, MPI_THREAD_SINGLE, &provided );
       MPI_Init_thread( 0, 0, MPI_THREAD_FUNNELED, &provided );
       MPI_Init_thread( 0, 0, MPI_THREAD_SERIALIZED, &provided );
       MPI_Init_thread( 0, 0, MPI_THREAD_MULTIPLE, &provided );
  ***/ 

  MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided );
  MPI_Query_thread( &claimed );
  printf( "Query thread level= %d  Init_thread level= %d\n", claimed, provided );
 
  MPI_Finalize();
}
Example #6
void init_mpi (struct pe_vars * v)
{
	int mpi_provided;

	MPI_Init_thread( NULL, NULL, MPI_THREAD_SERIALIZED, &mpi_provided );
	MPI_Query_thread(&mpi_provided);

	if (strcmp((const char *)MPI_THREAD_STRING(mpi_provided),"WTF") == 0)
		MPI_Abort (MPI_COMM_WORLD, 5);

	MPI_Comm_rank( MPI_COMM_WORLD, &(v->me) );
	MPI_Comm_size( MPI_COMM_WORLD, &(v->npes) );


	v->pairs = v->npes / 2;
	v->nxtpe = ( v->me < v->pairs ) ? ( v->me + v->pairs ) : ( v->me - v->pairs );

	return;
}
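Note that MPI_THREAD_STRING, used above and again in Examples #10 and #14, is not part of the MPI standard; it is a helper defined by these projects to turn a thread-support level into a printable name. A plausible sketch of such a helper (an assumption added for readability, not the projects' actual definition):

/* Hypothetical helper: map an MPI thread-support level to a string.
 * Shown only as a sketch; the real projects define their own version. */
static const char *MPI_THREAD_STRING(int level)
{
    switch (level) {
        case MPI_THREAD_SINGLE:     return "MPI_THREAD_SINGLE";
        case MPI_THREAD_FUNNELED:   return "MPI_THREAD_FUNNELED";
        case MPI_THREAD_SERIALIZED: return "MPI_THREAD_SERIALIZED";
        case MPI_THREAD_MULTIPLE:   return "MPI_THREAD_MULTIPLE";
        default:                    return "WTF";  /* matches the check in Example #6 */
    }
}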
Example #7
int pmrrr
(char *jobz, char *range, int *np, double  *D,
 double *E, double *vl, double *vu, int *il,
 int *iu, int *tryracp, MPI_Comm comm, int *nzp,
 int *offsetp, double *W, double *Z, int *ldz, int *Zsupp)
{
  /* Input parameter */
  int  n      = *np;
  bool onlyW = toupper(jobz[0]) == 'N';
  bool wantZ = toupper(jobz[0]) == 'V';
  bool cntval = toupper(jobz[0]) == 'C';
  bool alleig = toupper(range[0]) == 'A';
  bool valeig = toupper(range[0]) == 'V';
  bool indeig = toupper(range[0]) == 'I';

  /* Check input parameters */
  if(!(onlyW  || wantZ  || cntval)) return 1;
  if(!(alleig || valeig || indeig)) return 1;
  if(n <= 0) return 1;
  if (valeig) {
    if(*vu<=*vl) return 1;
  } else if (indeig) {
    if (*il<1 || *il>n || *iu<*il || *iu>n) return 1;
  }
  
  /* MPI & multithreading info */
  int is_init, is_final;
  MPI_Initialized(&is_init);
  MPI_Finalized(&is_final);
  if (is_init!=1 || is_final==1) {
    fprintf(stderr, "ERROR: MPI is not active! (init=%d, final=%d) \n", 
      is_init, is_final);
    return 1;
  }
  MPI_Comm comm_dup;
  MPI_Comm_dup(comm, &comm_dup);
  int nproc, pid, thread_support;
  MPI_Comm_size(comm_dup, &nproc);
  MPI_Comm_rank(comm_dup, &pid);
  MPI_Query_thread(&thread_support);

  int nthreads;
  if ( !(thread_support == MPI_THREAD_MULTIPLE ||
         thread_support == MPI_THREAD_FUNNELED) ) {
    /* Disable multithreading; note: to support multithreading with 
     * MPI_THREAD_SERIALIZED the code must be changed slightly; this 
     * is not supported at the moment */
    nthreads = 1;
  } else {
    char *ompvar = getenv("PMR_NUM_THREADS");
    if (ompvar == NULL) {
      nthreads = DEFAULT_NUM_THREADS;
    } else {
      nthreads = atoi(ompvar);
    }
  }

#if defined(MVAPICH2_VERSION)
  if (nthreads>1) {
    int mv2_affinity=1;
    char *mv2_string = getenv("MV2_ENABLE_AFFINITY");
    if (mv2_string != NULL) 
      mv2_affinity = atoi(mv2_string);
    if (mv2_affinity!=0) {
      nthreads = 1;
      if (pid==0) {
        fprintf(stderr, "WARNING: PMRRR incurs a significant performance penalty when multithreaded with MVAPICH2 with affinity enabled. The number of threads has been reduced to one; please rerun with MV2_ENABLE_AFFINITY=0 or PMR_NUM_THREADS=1 in the future.\n");
        fflush(stderr);
      }
    }
  }
#endif

  /* If only the maximal number of local eigenvectors is queried,
   * return here if possible */
  *nzp     = 0;
  *offsetp = 0;
  if (cntval) {
    if ( alleig || n < DSTEMR_IF_SMALLER ) {
      *nzp = iceil(n,nproc);
      MPI_Comm_free(&comm_dup);
      return 0;
    } else if (indeig) {
      *nzp = iceil(*iu-*il+1,nproc);
      MPI_Comm_free(&comm_dup);
      return 0;
    }
  }

  /* Check if computation should be done by multiple processes */
  int info;
  if (n < DSTEMR_IF_SMALLER) {
    info = handle_small_cases(jobz, range, np, D, E, vl, vu, il,
			      iu, tryracp, comm, nzp, offsetp, W,
			      Z, ldz, Zsupp);
    MPI_Comm_free(&comm_dup);
    return info;
  }

  /* Allocate memory */
  double *Werr = (double*)malloc(n*sizeof(double)); assert(Werr!=NULL);
  double *Wgap = (double*)malloc(n*sizeof(double)); assert(Wgap!=NULL);
  double *gersch = (double*)malloc(2*n*sizeof(double)); assert(gersch!=NULL);
  int *iblock = (int*)calloc(n,sizeof(int)); assert(iblock!=NULL);
  int *iproc  = (int*)malloc(n*sizeof(int)); assert(iproc!=NULL);
  int *Windex = (int*)malloc(n*sizeof(int)); assert(Windex!=NULL);
  int *isplit = (int*)malloc(n*sizeof(int)); assert(isplit!=NULL);
  int *Zindex = (int*)malloc(n*sizeof(int)); assert(Zindex!=NULL);
  proc_t *procinfo = (proc_t*)malloc(sizeof(proc_t)); assert(procinfo!=NULL);
  in_t *Dstruct = (in_t*)malloc(sizeof(in_t)); assert(Dstruct!=NULL);
  val_t *Wstruct = (val_t*)malloc(sizeof(val_t)); assert(Wstruct!=NULL);
  vec_t *Zstruct = (vec_t*)malloc(sizeof(vec_t)); assert(Zstruct!=NULL);
  tol_t *tolstruct = (tol_t*)malloc(sizeof(tol_t)); assert(tolstruct!=NULL);

  /* Bundle variables into a structures */
  procinfo->pid            = pid;
  procinfo->nproc          = nproc;
  procinfo->comm           = comm_dup;
  procinfo->nthreads       = nthreads;
  procinfo->thread_support = thread_support;

  Dstruct->n      = n;
  Dstruct->D      = D;
  Dstruct->E      = E;
  Dstruct->isplit = isplit;

  Wstruct->n      = n;
  Wstruct->vl     = vl;
  Wstruct->vu     = vu;
  Wstruct->il     = il;
  Wstruct->iu     = iu;
  Wstruct->W      = W;
  Wstruct->Werr   = Werr;
  Wstruct->Wgap   = Wgap;
  Wstruct->Windex = Windex;
  Wstruct->iblock = iblock;
  Wstruct->iproc  = iproc;
  Wstruct->gersch = gersch;

  Zstruct->ldz    = *ldz;
  Zstruct->nz     = 0;
  Zstruct->Z      = Z;
  Zstruct->Zsupp  = Zsupp;
  Zstruct->Zindex = Zindex;

  /* Scale matrix to allowable range, returns 1.0 if not scaled */
  double scale = scale_matrix(Dstruct, Wstruct, valeig);

  /*  Test if matrix warrants more expensive computations which
   *  guarantee high relative accuracy */
  if (*tryracp)
    odrrr(&n, D, E, &info); /* 0 - rel acc */
  else info = -1;

  int i;
  double *Dcopy, *E2copy;
  if (info == 0) {
    /* This case is extremely rare in practice */ 
    tolstruct->split = DBL_EPSILON;
    /* Copy original data needed for refinement later */
    Dcopy = (double*)malloc(n*sizeof(double)); assert(Dcopy!=NULL);
    memcpy(Dcopy, D, n*sizeof(double));  
    E2copy = (double*)malloc(n*sizeof(double)); assert(E2copy!=NULL);
    for (i=0; i<n-1; i++) 
      E2copy[i] = E[i]*E[i];
  } else {
    /* Neg. threshold forces old splitting criterion */
    tolstruct->split = -DBL_EPSILON; 
    *tryracp = 0;
  }

  if (!wantZ) {
    /* Compute eigenvalues to full precision */
    tolstruct->rtol1 = 4.0 * DBL_EPSILON;
    tolstruct->rtol2 = 4.0 * DBL_EPSILON;
  } else {
    /* Do not compute to full accuracy first, but refine later */
    tolstruct->rtol1 = sqrt(DBL_EPSILON);
    tolstruct->rtol1 = fmin(1e-2*MIN_RELGAP, tolstruct->rtol1);
    tolstruct->rtol2 = sqrt(DBL_EPSILON)*5.0E-3;
    tolstruct->rtol2 = fmin(5e-6*MIN_RELGAP, tolstruct->rtol2);
    tolstruct->rtol2 = fmax(4.0 * DBL_EPSILON, tolstruct->rtol2);
  }

  /*  Compute all eigenvalues: sorted by block */
  info = plarre(procinfo,jobz,range,Dstruct,Wstruct,tolstruct,nzp,offsetp);
  assert(info == 0);

  /* If just the number of local eigenvectors is queried */
  if (cntval && valeig) {
    clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex,
	     isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct,
	     tolstruct);
    return 0;
  }

  /* If only eigenvalues are to be computed */
  if (!wantZ) {

    /* Refine to high relative accuracy with respect to input T */
    if (*tryracp) {
      info = 
        refine_to_highrac
        (procinfo, jobz, Dcopy, E2copy, Dstruct, nzp, Wstruct, tolstruct);
      assert(info == 0);
    }

    /* Sort eigenvalues */
    qsort(W, n, sizeof(double), cmp);

    /* Only keep subset ifirst:ilast */
    int ifirst, ilast, isize;
    int iil = *il;
    int iiu = *iu;
    int ifirst_tmp=iil;
    for (i=0; i<nproc; i++) {
      int chunk  = (iiu-iil+1)/nproc + (i < (iiu-iil+1)%nproc);
      int ilast_tmp;
      if (i == nproc-1) {
	ilast_tmp = iiu;
      } else {
	ilast_tmp = ifirst_tmp + chunk - 1;
	ilast_tmp = imin(ilast_tmp, iiu);
      }
      if (i == pid) {
	ifirst    = ifirst_tmp;
	ilast     = ilast_tmp;
	isize     = ilast - ifirst + 1;
	*offsetp = ifirst - iil;
	*nzp      = isize;
      }
      ifirst_tmp = ilast_tmp + 1;
      ifirst_tmp = imin(ifirst_tmp, iiu + 1);
    }
    if (isize > 0) {
      memmove(W, &W[ifirst-1], *nzp * sizeof(double));
    }

    /* If matrix was scaled, rescale eigenvalues */
    invscale_eigenvalues(Wstruct, scale, *nzp);

    clean_up
    (comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex,
     isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct, tolstruct);

    return 0;
  } /* end of only eigenvalues to compute */

  /* Compute eigenvectors */
  info = plarrv(procinfo, Dstruct, Wstruct, Zstruct, tolstruct, 
		nzp, offsetp);
  assert(info == 0);

  /* Refine to high relative accuracy with respect to input matrix */
  if (*tryracp) {
    info = refine_to_highrac(procinfo, jobz, Dcopy, E2copy, 
			     Dstruct, nzp, Wstruct, tolstruct);
    assert(info == 0);
  }

  /* If matrix was scaled, rescale eigenvalues */
  invscale_eigenvalues(Wstruct, scale, n);

  /* Make the first nz elements of W contain the eigenvalues
   * associated with this process */
  int j, im=0;
  for (j=0; j<n; j++) {
    if (iproc[j] == pid) {
      W[im]      = W[j];
      Windex[im] = Windex[j];
      Zindex[im] = Zindex[j];
      im++;
    }
  }

  clean_up(comm_dup, Werr, Wgap, gersch, iblock, iproc, Windex,
	   isplit, Zindex, procinfo, Dstruct, Wstruct, Zstruct,
	   tolstruct);
  if (*tryracp) {
    free(Dcopy);
    free(E2copy);
  }

  return 0;
} /* end pmrrr */
Example #8
int main(int argc, char *argv[])
{

  nptsside = atoi(argv[1]);
  print_node = atoi(argv[2]);
  nodenum = (int) atoi(argv[3]);
  ppnnum = (int) atoi(argv[4]);
  tasktype =  argv[5][0];
  side2 = nptsside / 2.0;
  side4 = nptsside / 4.0;


  int provided, claimed;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided );
  MPI_Query_thread( &claimed );

//  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &nnodes);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  if (my_rank == print_node) {
    printf( "Query thread level= %d  Init_thread level= %d\n", claimed, provided );
    printf( "Defined LEVEL= %d  (ompi_info | grep -i thread) \n", MPI_THREAD_MULTIPLE);
  }
	        

  mpi_chunksize = nptsside/nnodes;

  struct timespec bgn,nd;
  clock_gettime(CLOCK_REALTIME, &bgn);

  #ifdef RC
  scram = rpermute(nptsside, 0);

  MPI_Scatter(scram, mpi_chunksize, MPI_INT, scram, mpi_chunksize, MPI_INT, 0, MPI_COMM_WORLD);

  #else
    findmyrange(nptsside, nnodes, my_rank, myrange);
    scram = rpermute(mpi_chunksize, myrange[0]);
    printf("My range is %d %d \n", myrange[0], myrange[1]);
  #endif

  
  dowork();

  //implied barrier

  clock_gettime(CLOCK_REALTIME, &nd);

  if (my_rank == print_node) {
    printf("Random chunk RC is defined\n");
    printf("time:%g ::  maxiteration:%d ::  Pixel:%d :: threadnum:%d :: mpi_chunksize:%d :: tot_count:%d :: count:%d :: nodenum:%d :: ppnnum:%d :: tasktype:%c\n",
        timediff(bgn,nd),MAXITERS, nptsside,nnodes,mpi_chunksize,tot_count,count,nodenum,ppnnum,tasktype);
    FILE *fp;
    fp = fopen("OMP_MPI_mand.txt","a");
    fprintf(fp, "time:%g :: maxiteration:%d :: Pixel:%d :: threadnum:%d :: mpi_chunksize:%d :: tot_count:%d :: count:%d :: nodenum:%d :: ppnnum:%d :: tasktype:%c\n",
        timediff(bgn,nd),MAXITERS,nptsside,nnodes,mpi_chunksize,tot_count,count,nodenum,ppnnum,tasktype);
    fclose(fp);

  }

  MPI_Finalize();

}
Example #9
FORT_DLL_SPEC void FORT_CALL mpi_query_thread_ ( MPI_Fint *v1, MPI_Fint *ierr ){
    *ierr = MPI_Query_thread( v1 );
}
Example #10
File: scf.c  Project: sg0/gtfock
/// main for SCF
int main (int argc, char **argv)
{
    // init MPI
    int myrank;
    int nprocs;
    int provided;
#if defined (USE_ELEMENTAL)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &myrank );
    ElMPICommSize( MPI_COMM_WORLD, &nprocs );
    MPI_Query_thread(&provided);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
#endif
    if (myrank == 0)  {
        printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided));
    }
#if 0
    char hostname[1024];
    gethostname (hostname, 1024);
    printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname);
#endif

    // create basis set    
    BasisSet_t basis;
    CInt_createBasisSet(&basis);

    // input parameters and load basis set
    int nprow_fock;
    int npcol_fock;
    int nblks_fock;
    int nprow_purif;
    int nshells;
    int natoms;
    int nfunctions;
    int niters;
    if (myrank == 0) {
        if (argc != 8) {
            usage(argv[0]);
            MPI_Finalize();
            exit(0);
        }
        // init parameters
        nprow_fock = atoi(argv[3]);
        npcol_fock = atoi(argv[4]);
        nprow_purif = atoi(argv[5]);
        nblks_fock = atoi(argv[6]);
        niters = atoi(argv[7]);
        assert(nprow_fock * npcol_fock == nprocs);
        assert(nprow_purif * nprow_purif * nprow_purif  <= nprocs);
        assert(niters > 0);       
        CInt_loadBasisSet(basis, argv[1], argv[2]);
        nshells = CInt_getNumShells(basis);
        natoms = CInt_getNumAtoms(basis);
        nfunctions = CInt_getNumFuncs(basis);
        assert(nprow_fock <= nshells && npcol_fock <= nshells);
        assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions);
        printf("Job information:\n");
        char *fname;
        fname = basename(argv[2]);
        printf("  molecule:  %s\n", fname);
        fname = basename(argv[1]);
        printf("  basisset:  %s\n", fname);
        printf("  charge     = %d\n", CInt_getTotalCharge(basis));
        printf("  #atoms     = %d\n", natoms);
        printf("  #shells    = %d\n", nshells);
        printf("  #functions = %d\n", nfunctions);
        printf("  fock build uses   %d (%dx%d) nodes\n",
               nprow_fock * npcol_fock, nprow_fock, npcol_fock);
        printf("  purification uses %d (%dx%dx%d) nodes\n",
               nprow_purif * nprow_purif * nprow_purif,
               nprow_purif, nprow_purif, nprow_purif);
        printf("  #tasks = %d (%dx%d)\n",
               nblks_fock * nblks_fock * nprow_fock * nprow_fock,
               nblks_fock * nprow_fock, nblks_fock * nprow_fock);
        int nthreads = omp_get_max_threads();
        printf("  #nthreads_cpu = %d\n", nthreads);   
    }
    int btmp[8];
    btmp[0] = nprow_fock;
    btmp[1] = npcol_fock;
    btmp[2] = nprow_purif;
    btmp[3] = nblks_fock;
    btmp[4] = niters;
    btmp[5] = natoms;
    btmp[6] = nshells;
    btmp[7] = nfunctions;
    MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD);
    nprow_fock = btmp[0];
    npcol_fock = btmp[1];
    nprow_purif = btmp[2];
    nblks_fock = btmp[3];
    niters = btmp[4];
    natoms = btmp[5];
    nshells = btmp[6];
    nfunctions = btmp[7];

    // broadcast basis set
    void *bsbuf;
    int bsbufsize;
    if (myrank == 0) {
        CInt_packBasisSet(basis, &bsbuf, &bsbufsize);
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        bsbuf = (void *)malloc(bsbufsize);
        assert(bsbuf != NULL);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
        CInt_unpackBasisSet(basis, bsbuf);  
        free(bsbuf);
    }

    // init PFock
    if (myrank == 0) {
        printf("Initializing pfock ...\n");
    }
    PFock_t pfock;
    PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11,
                 MAX_NUM_D, IS_SYMM, &pfock);
    if (myrank == 0) {
        double mem_cpu;
        PFock_getMemorySize(pfock, &mem_cpu);
        printf("  CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0);
        printf("  Done\n");
    }

    // init purif
    purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif);
    init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock);

    // compute SCF
    if (myrank == 0) {
        printf("Computing SCF ...\n");
    }
    int rowstart = purif->srow_purif;
    int rowend = purif->nrows_purif + rowstart - 1;
    int colstart = purif->scol_purif;
    int colend = purif->ncols_purif + colstart - 1;
    double energy0 = -1.0;
    double totaltime = 0.0;
    double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions;
    double diis_flops;

    // set initial guess
    if (myrank == 0) {
        printf("  initialing D ...\n");
    }
    PFock_setNumDenMat(NUM_D, pfock);
    initial_guess(pfock, basis, purif->runpurif,
                  rowstart, rowend, colstart, colend,
                  purif->D_block, purif->ldx);

    MPI_Barrier(MPI_COMM_WORLD);

    // compute nuc energy
    double ene_nuc = CInt_getNucEnergy(basis);
    if (myrank == 0) {
        printf("  nuc energy = %.10f\n", ene_nuc);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    
    // main loop
    double t1, t2, t3, t4;
    for (int iter = 0; iter < niters; iter++) {
        if (myrank == 0) {
            printf("  iter %d\n", iter);
        }
        t3 = MPI_Wtime();

        // fock matrix construction
        t1 = MPI_Wtime();
        fock_build(pfock, basis, purif->runpurif,
                   rowstart, rowend, colstart, colend,
                   purif->ldx, purif->D_block, purif->F_block);
        if (myrank == 0) {
            printf("After fock build \n");
        }

        // compute energy
        double energy = compute_energy(purif, purif->F_block, purif->D_block);

        t2 = MPI_Wtime();
        if (myrank == 0) {
            printf("    fock build takes %.3f secs\n", t2 - t1);
            if (iter > 0) {
                printf("    energy %.10f (%.10f), %le\n",
                       energy + ene_nuc, energy, fabs (energy - energy0));
            }
            else {
                printf("    energy %.10f (%.10f)\n", energy + ene_nuc,
                       energy);
            }
        }
        if (iter > 0 && fabs (energy - energy0) < 1e-11) {
            niters = iter + 1;
            break;
        }
        energy0 = energy;

        // compute DIIS
        t1 = MPI_Wtime();
        compute_diis(pfock, purif, purif->D_block, purif->F_block, iter);
        t2 = MPI_Wtime();

        if (myrank == 0) {
            if (iter > 1) {
                diis_flops = purif_flops * 6.0;
            } else {
                diis_flops = purif_flops * 2.0;
            }
            printf("    diis takes %.3f secs, %.3lf Gflops\n",
                   t2 - t1, diis_flops / (t2 - t1) / 1e9);
        }
        
    #ifdef __SCF_OUT__
        if (myrank == 0) {
            double outbuf[nfunctions];
            char fname[1024];
            sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter);
            FILE *fp = fopen(fname, "w+");
            assert(fp != NULL);
            for (int i = 0; i < nfunctions; i++) {
                PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID,
                             i, i, USE_D_ID, nfunctions - 1,
                             outbuf, nfunctions);
                for (int j = 0; j < nfunctions; j++) {
                    fprintf(fp, "%.10e\n", outbuf[j]);
                }
            }
            fclose(fp);
        }
    #endif
    
        // purification
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        int it = compute_purification(purif, purif->F_block, purif->D_block);
        t2 = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    purification takes %.3f secs,"
                   " %d iterations, %.3f Gflops\n",
                   t2 - t1, it,
                   (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9);
        }
	/*
#if defined(USE_ELEMENTAL)
    ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] );
#else
    GA_Print (pfock->ga_D[USE_D_ID]);
#endif
*/
        t4 = MPI_Wtime ();
        totaltime += t4 - t3;

#ifdef __SCF_TIMING__
        PFock_getStatistics(pfock);
        double purif_timedgemm;
        double purif_timepdgemm;
        double purif_timepass;
        double purif_timetr;
        MPI_Reduce(&purif->timedgemm, &purif_timedgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepass, &purif_timepass,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timetr, &purif_timetr,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    Purification Statistics:\n");
            printf("      average totaltime  = %.3f\n"
                   "      average timetr     = %.3f\n"
                   "      average timedgemm  = %.3f, %.3f Gflops\n"
                   "      average timepdgemm = %.3f, %.3f Gflops\n",
                   purif_timepass / purif->np_purif,
                   purif_timetr / purif->np_purif,
                   purif_timedgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timedgemm / purif->np_purif) / 1e9,
                   purif_timepdgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9);
        }
#endif
    } /* for (iter = 0; iter < NITERATIONS; iter++) */

    if (myrank == 0) {
        printf("  totally takes %.3f secs: %.3f secs/iters\n",
               totaltime, totaltime / niters);
        printf("  Done\n");
    }

    destroy_purif(purif);
    PFock_destroy(pfock);
    CInt_destroyBasisSet(basis);

    MPI_Finalize();

    return 0;
}
Example #11
void myhbwmalloc_init(void)
{
    /* set to NULL before trying to initialize.  if we return before
     * successful creation of the mspace, then it will still be NULL,
     * and we can use that in subsequent library calls to determine
     * that the library failed to initialize. */
    myhbwmalloc_mspace = NULL;

    /* verbose printout? */
    myhbwmalloc_verbose = 0;
    {
        char * env_char = getenv("HBWMALLOC_VERBOSE");
        if (env_char != NULL) {
            myhbwmalloc_verbose = 1;
            printf("hbwmalloc: HBWMALLOC_VERBOSE set\n");
        }
    }

    /* fail hard or soft? */
    myhbwmalloc_hardfail = 1;
    {
        char * env_char = getenv("HBWMALLOC_SOFTFAIL");
        if (env_char != NULL) {
            myhbwmalloc_hardfail = 0;
            printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n");
        }
    }

    /* set the atexit handler that will destroy the mspace and free the numa allocation */
    atexit(myhbwmalloc_final);

    /* detect and configure use of NUMA memory nodes */
    {
        int max_possible_node        = numa_max_possible_node();
        int num_possible_nodes       = numa_num_possible_nodes();
        int max_numa_nodes           = numa_max_node();
        int num_configured_nodes     = numa_num_configured_nodes();
        int num_configured_cpus      = numa_num_configured_cpus();
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_max_possible_node()    = %d\n", max_possible_node);
            printf("hbwmalloc: numa_num_possible_nodes()   = %d\n", num_possible_nodes);
            printf("hbwmalloc: numa_max_node()             = %d\n", max_numa_nodes);
            printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes);
            printf("hbwmalloc: numa_num_configured_cpus()  = %d\n", num_configured_cpus);
        }
        /* FIXME this is a hack.  assumes HBW is only numa node 1. */
        if (num_configured_nodes <= 2) {
            myhbwmalloc_numa_node = num_configured_nodes-1;
        } else {
            fprintf(stderr,"hbwmalloc: we support only 2 numa nodes, not %d\n", num_configured_nodes);
        }

        if (myhbwmalloc_verbose) {
            for (int i=0; i<num_configured_nodes; i++) {
                unsigned max_numa_cpus = numa_num_configured_cpus();
                struct bitmask * mask = numa_bitmask_alloc( max_numa_cpus );
                int rc = numa_node_to_cpus(i, mask);
                if (rc != 0) {
                    fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n");
                } else {
                    printf("hbwmalloc: numa node %d cpu mask:", i);
                    for (unsigned j=0; j<max_numa_cpus; j++) {
                        int bit = numa_bitmask_isbitset(mask,j);
                        printf(" %d", bit);
                    }
                    printf("\n");
                }
                numa_bitmask_free(mask);
            }
            fflush(stdout);
        }
    }

#if 0 /* unused */
    /* see if the user specifies a slab size */
    size_t slab_size_requested = 0;
    {
        char * env_char = getenv("HBWMALLOC_BYTES");
        if (env_char!=NULL) {
            long units = 1L;
            if      ( NULL != strstr(env_char,"G") ) units = 1000000000L;
            else if ( NULL != strstr(env_char,"M") ) units = 1000000L;
            else if ( NULL != strstr(env_char,"K") ) units = 1000L;
            else                                     units = 1L;

            int num_count = strspn(env_char, "0123456789");
            memset( &env_char[num_count], ' ', strlen(env_char)-num_count);
            slab_size_requested = units * atol(env_char);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested);
        }
    }
#endif

    /* see what libnuma says is available */
    size_t myhbwmalloc_slab_size;
    {
        int node = myhbwmalloc_numa_node;
        long long freemem;
        long long maxmem = numa_node_size64(node, &freemem);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n",
                    maxmem, freemem, node);
        }
        myhbwmalloc_slab_size = freemem;
    }

    /* assume threads, disable if MPI knows otherwise, then allow user to override. */
    int multithreaded = 1;
#ifdef HAVE_MPI
    int nprocs = 1;  /* default when MPI is not active */
    {
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        }

        /* give equal portion to every MPI process */
        myhbwmalloc_slab_size /= nprocs;

        /* if the user initializes MPI with MPI_Init or
         * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
         * are no threads at all, which means we can skip the
         * malloc mspace lock.
         *
         * if the user lies to MPI, they deserve any bad thing
         * that comes of it. */
        int provided;
        MPI_Query_thread(&provided);
        if (provided==MPI_THREAD_SINGLE) {
            multithreaded = 0;
        } else {
            multithreaded = 1;
        }

        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %d\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char * env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode( myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab==NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n", myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128*sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;
        myhbwmalloc_mspace = create_mspace_with_base( myhbwmalloc_slab, myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n", myhbwmalloc_slab_size);
        }
    }
}
Example #12
int A1D_Initialize()
{
    int mpi_initialized, mpi_provided;
    int mpi_status;
    int i;
    size_t bytes_in, bytes_out;
    DCMF_Result dcmf_result;
    DCMF_Configure_t dcmf_config;
    DCMF_Memregion_t local_memregion;

    /***************************************************
     *
     * configure MPI
     *
     ***************************************************/

    /* MPI has to be initialized for this implementation to work */
    MPI_Initialized(&mpi_initialized);
    assert(mpi_initialized==1);

    /* MPI has to be thread-safe so that DCMF doesn't explode */
    MPI_Query_thread(&mpi_provided);
    assert(mpi_provided==MPI_THREAD_MULTIPLE);

    /* have to use our own communicator for collectives to be proper */
    mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD);
    assert(mpi_status==0);

    /* get my MPI rank */
    mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&myrank);
    assert(mpi_status==0);

    /* get MPI world size */
    mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size);
    assert(mpi_status==0);

    /* make sure MPI and DCMF agree */
    assert(myrank==DCMF_Messager_rank());
    assert(mpi_size==DCMF_Messager_size());

    /* barrier before DCMF_Messager_configure to make sure MPI is ready everywhere */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==0);

    /***************************************************
     *
     * configure DCMF
     *
     ***************************************************/

    /* to be safe, but perhaps not necessary */
    dcmf_config.thread_level = DCMF_THREAD_MULTIPLE;
#ifdef ACCUMULATE_IMPLEMENTED
    /* interrupts required for accumulate only, Put/Get use DMA
     * if accumulate not used, MPI will query environment for DCMF_INTERRUPTS */
    dcmf_config.interrupts = DCMF_INTERRUPTS_ON;
#endif

    /* reconfigure DCMF with interrupts on */
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Messager_configure(&dcmf_config, &dcmf_config);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* barrier after DCMF_Messager_configure to make sure everyone has the new DCMF config */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==0);

    /***************************************************
     *
     * setup DCMF memregions
     *
     ***************************************************/

    /* allocate memregion list */
    A1D_Memregion_list = malloc( mpi_size * sizeof(DCMF_Memregion_t) );
    assert(A1D_Memregion_list != NULL);

    /* allocate base pointer list */
    A1D_Baseptr_list = malloc( mpi_size * sizeof(void*) );
    assert(A1D_Baseptr_list != NULL);

    /* create memregions */
    bytes_in = -1;
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Memregion_create(&local_memregion,&bytes_out,bytes_in,NULL,0);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* exchange memregions because we don't use symmetry heap */
    mpi_status = MPI_Allgather(&local_memregion,sizeof(DCMF_Memregion_t),MPI_BYTE,
                               A1D_Memregion_list,sizeof(DCMF_Memregion_t),MPI_BYTE,
                               A1D_COMM_WORLD);
    assert(mpi_status==0);

    /* destroy temporary local memregion */
    DCMF_CriticalSection_enter(0);
    dcmf_result = DCMF_Memregion_destroy(&local_memregion);
    assert(dcmf_result==DCMF_SUCCESS);
    DCMF_CriticalSection_exit(0);

    /* check for valid memregions */
    DCMF_CriticalSection_enter(0);
    for (i = 0; i < mpi_size; i++)
    {
        dcmf_result = DCMF_Memregion_query(&A1D_Memregion_list[i],
                                           &bytes_out,
                                           &A1D_Baseptr_list[i]);
        assert(dcmf_result==DCMF_SUCCESS);
    }
    DCMF_CriticalSection_exit(0);

#ifdef FLUSH_IMPLEMENTED
    /***************************************************
     *
     * setup flush list(s)
     *
     ***************************************************/

    /* allocate Put list */
    A1D_Put_flush_list = malloc( mpi_size * sizeof(int) );
    assert(A1D_Put_flush_list != NULL);

  #ifdef ACCUMULATE_IMPLEMENTED
    /* allocate Acc list */
    A1D_Send_flush_list = malloc( mpi_size * sizeof(int) );
    assert(A1D_Send_flush_list != NULL);
  #endif

#endif

    /***************************************************
     *
     * define null callback
     *
     ***************************************************/

    A1D_Nocallback.function = NULL;
    A1D_Nocallback.clientdata = NULL;

    return(0);
}
Example #13
int A1D_Initialize()
{

#ifdef DMAPPD_USES_MPI
    int mpi_initialized, mpi_provided;
    int mpi_status = MPI_SUCCESS;

    int namelen;
    char procname[MPI_MAX_PROCESSOR_NAME];
#endif

#ifdef __CRAYXE
    int                                 pmi_status  = PMI_SUCCESS;
    int                                 nodeid = -1;
    rca_mesh_coord_t                    rca_xyz;

    dmapp_return_t                      dmapp_status = DMAPP_RC_SUCCESS;

    dmapp_rma_attrs_ext_t               dmapp_config_in, dmapp_config_out;

    dmapp_jobinfo_t                     dmapp_info;
    dmapp_pe_t                          dmapp_rank = -1;
    int                                 dmapp_size = -1;
#endif
    int                                 sheapflag = 0;

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Initialize() \n");
#endif

#ifdef DMAPPD_USES_MPI

    /***************************************************
     *
     * configure MPI
     *
     ***************************************************/

    /* MPI has to be Initialized for this implementation to work */
    MPI_Initialized(&mpi_initialized);
    assert(mpi_initialized==1);

    /* MPI has to tolerate threads because A1 supports them */
    MPI_Query_thread(&mpi_provided);
    //assert(mpi_provided>MPI_THREAD_SINGLE);

    /* have to use our own communicator for collectives to be proper */
    mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD);
    assert(mpi_status==0);

    /* get my MPI rank */
    mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&mpi_rank);
    assert(mpi_status==0);

    /* get MPI world size */
    mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size);
    assert(mpi_status==0);

    /* in a perfect world, this would provide topology information like BG */
    MPI_Get_processor_name( procname, &namelen );
    printf( "%d: MPI_Get_processor_name = %s\n" , mpi_rank, procname );
    fflush( stdout );

    /* barrier to make sure MPI is ready everywhere */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==0);

#endif

#ifdef __CRAYXE

    /***************************************************
     *
     * query topology
     *
     ***************************************************/

    pmi_status = PMI_Get_nid( mpi_rank, &nodeid );
    assert(pmi_status==PMI_SUCCESS);

    rca_get_meshcoord((uint16_t)nodeid, &rca_xyz);
    printf("%d: rca_get_meshcoord returns (%2u,%2u,%2u)\n", mpi_rank, rca_xyz.mesh_x, rca_xyz.mesh_y, rca_xyz.mesh_z );

#endif

#ifdef __CRAYXE

    /***************************************************
     *
     * configure DMAPP
     *
     ***************************************************/

    dmapp_config_in.max_outstanding_nb   = DMAPP_DEF_OUTSTANDING_NB; /*  512 */
    dmapp_config_in.offload_threshold    = DMAPP_OFFLOAD_THRESHOLD;  /* 4096 */
#ifdef DETERMINISTIC_ROUTING
    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC;
#else
    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
#endif
    dmapp_config_in.max_concurrency      = 1; /* not thread-safe */
#ifdef FLUSH_IMPLEMENTED
    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_RELAXED;
#else
    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_STRICT;
#endif

    dmapp_status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out );
    assert(dmapp_status==DMAPP_RC_SUCCESS);

#ifndef FLUSH_IMPLEMENTED
    /* without strict PI ordering, we have to flush remote stores with a get packet to force global visibility */
    assert( dmapp_config_out.PI_ordering == DMAPP_PI_ORDERING_STRICT);
#endif

    dmapp_status = dmapp_get_jobinfo(&dmapp_info);
    assert(dmapp_status==DMAPP_RC_SUCCESS);

    dmapp_rank     = dmapp_info.pe;
    dmapp_size     = dmapp_info.npes;
    A1D_Sheap_desc = dmapp_info.sheap_seg;

    /* make sure PMI and DMAPP agree */
    assert(mpi_rank==dmapp_rank);
    assert(mpi_size==dmapp_size);

#endif

    /***************************************************
     *
     * setup protocols
     *
     ***************************************************/

#ifdef FLUSH_IMPLEMENTED
    /* allocate Put list */
    A1D_Put_flush_list = malloc( mpi_size * sizeof(int32_t) );
    assert(A1D_Put_flush_list != NULL);
#endif

#ifdef __CRAYXE
    A1D_Acc_lock = dmapp_sheap_malloc( sizeof(int64_t) );
#endif

    A1D_Allreduce_issame64((size_t)A1D_Acc_lock, &sheapflag);
    assert(sheapflag==1);

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Initialize() \n");
#endif

    return(0);
}
Example #14
int main(int argc, char *argv[])
{
    /*********************************************************************************
     *                            INITIALIZE MPI
     *********************************************************************************/

    int world_size = 0, world_rank = -1;
    int provided = -1;

#if defined(USE_MPI_INIT)

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Init");

#else

    int requested = -1;

#  if defined(USE_MPI_INIT_THREAD_MULTIPLE)
    requested = MPI_THREAD_MULTIPLE;
#  elif defined(USE_MPI_INIT_THREAD_SERIALIZED)
    requested = MPI_THREAD_SERIALIZED;
#  elif defined(USE_MPI_INIT_THREAD_FUNNELED)
    requested = MPI_THREAD_FUNNELED;
#  else
    requested = MPI_THREAD_SINGLE;
#  endif

    MPI_Init_thread( &argc, &argv, requested, &provided );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Init_thread");

    if (provided>requested)
    {
        if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s, but this is okay. \n",
                                  MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) );
    }
    if (provided<requested)
    {
        if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s so the test will exit. \n",
                                  MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) );
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

#endif

    double t0 = MPI_Wtime();

    int is_init = 0;
    MPI_Initialized(&is_init);
    if (world_rank==0) printf("MPI %s initialized. \n", (is_init==1 ? "was" : "was not") );

    MPI_Query_thread(&provided);
    if (world_rank==0) printf("MPI thread support is %s. \n", MPI_THREAD_STRING(provided) );

    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
    if (world_rank==0) printf("MPI test program running on %d ranks. \n", world_size);

    char procname[MPI_MAX_PROCESSOR_NAME];
    int pnlen;
    MPI_Get_processor_name(procname,&pnlen);
    printf("%d: processor name = %s\n", world_rank, procname);

    /*********************************************************************************
     *                            SETUP MPI COMMUNICATORS
     *********************************************************************************/

    if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 1 \n");
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0) printf("MPI_Comm_dup of MPI_COMM_WORLD \n");
    MPI_Comm comm_world_dup;
    MPI_Comm_dup(MPI_COMM_WORLD, &comm_world_dup);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_dup");

    if (world_rank==0) printf("MPI_Barrier on comm_world_dup \n");
    MPI_Barrier( comm_world_dup );

    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into world_reordered \n");
    MPI_Comm comm_world_reordered;
    MPI_Comm_split(MPI_COMM_WORLD, 0, world_size-world_rank, &comm_world_reordered);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into left-right \n");
    MPI_Comm comm_world_leftright;
    int leftright = (world_rank<(world_size/2));
    MPI_Comm_split(MPI_COMM_WORLD, leftright, world_rank, &comm_world_leftright);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_leftright \n");
    MPI_Barrier( comm_world_leftright );

    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into odd-even \n");
    MPI_Comm comm_world_oddeven;
    int oddeven = (world_rank%2);
    MPI_Comm_split(MPI_COMM_WORLD, oddeven, world_rank, &comm_world_oddeven);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_oddeven \n");
    MPI_Barrier( comm_world_oddeven );

    if (world_rank==0) printf("MPI_Comm_split MPI_COMM_WORLD into (world-1) \n");
    MPI_Comm comm_world_minus_one;
    int left_out = world_rank==(world_size/2);
    MPI_Comm_split(MPI_COMM_WORLD, left_out, world_rank, &comm_world_minus_one);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_minus_one \n");
    MPI_Barrier( comm_world_minus_one );

    if (world_rank==0) printf("MPI_Comm_group of group_world from MPI_COMM_WORLD \n");
    MPI_Group group_world;
    MPI_Comm_group(MPI_COMM_WORLD, &group_world);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_group");

    int geomprog_size = (world_size==1) ? 1 : ceil(log2(world_size));

    int * geomprog_list = NULL;
    geomprog_list = (int *) safemalloc( geomprog_size * sizeof(int) );

    for (int i=0; i<geomprog_size; i++)
        geomprog_list[i] = pow(2,i)-1;

    if (world_rank==0)
        for (int i=0; i<geomprog_size; i++)
            if (world_rank==0) printf("geomprog_list[%d] = %d \n", i, geomprog_list[i]);

    if (world_rank==0) printf("MPI_Group_incl of group_geomprog (geometric progression) from group_world \n");
    MPI_Group group_geomprog;
    MPI_Group_incl(group_world, geomprog_size, geomprog_list, &group_geomprog);
    MPI_Group_free(&group_world);

    if (world_rank==0) printf("MPI_Comm_create of comm_geomprog from group_geomprog on MPI_COMM_WORLD \n");
    MPI_Comm comm_geomprog;
    MPI_Comm_create(MPI_COMM_WORLD, group_geomprog, &comm_geomprog);
    MPI_Group_free(&group_geomprog);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_create");

    if (world_rank==0) printf("MPI_Barrier on comm_geomprog \n");
    for (int i=0; i<geomprog_size; i++)
        if (geomprog_list[i]==world_rank)
            MPI_Barrier( comm_geomprog );

    if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 2 \n");
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI communicator creation");

    /*********************************************************************************
     *                            COLLECTIVES
     *********************************************************************************/

    int max_mem = (argc>1 ? atoi(argv[1]) : 32*1024*1024);

    MPI_Comm test_comm;

#if defined(DO_COMM_WORLD)
    test_comm = MPI_COMM_WORLD;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 1" );

    {
    	MPI_Barrier( test_comm );
    	bcast_only(stdout, test_comm, max_mem);
    	gather_only(stdout, test_comm, max_mem);
    	allgather_only(stdout, test_comm, max_mem);
    	scatter_only(stdout, test_comm, max_mem);
    	alltoall_only(stdout, test_comm, max_mem);
    	reduce_only(stdout, test_comm, max_mem);
    	allreduce_only(stdout, test_comm, max_mem);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 2" );

    {
    	MPI_Barrier( test_comm );
    	bcast_only(stdout, test_comm, max_mem);
    	gather_only(stdout, test_comm, max_mem);
    	allgather_only(stdout, test_comm, max_mem);
    	scatter_only(stdout, test_comm, max_mem);
    	alltoall_only(stdout, test_comm, max_mem);
    	reduce_only(stdout, test_comm, max_mem);
    	allreduce_only(stdout, test_comm, max_mem);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_COMM_WORLD_JITTER
    test_comm = MPI_COMM_WORLD;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "COMM_WORLD_JITTER" );

    {
    	int jitter = 0;
    	if ((world_rank%10)==0) jitter++;
    	if ((world_rank%100)==0) jitter++;
    	if ((world_rank%1000)==0) jitter++;
    	if ((world_rank%10000)==0) jitter++;
    	if ((world_rank%100000)==0) jitter++;

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	bcast_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	gather_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	allgather_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	scatter_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	alltoall_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	reduce_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	allreduce_only(stdout, test_comm, max_mem);

    	MPI_Barrier( test_comm );
    	sleep(jitter);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_COMM_WORLD_DUP
    test_comm = comm_world_dup;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "COMM_WORLD_DUP" );

    {
    	MPI_Barrier( test_comm );
    	bcast_only(stdout, test_comm, max_mem);
    	gather_only(stdout, test_comm, max_mem);
    	allgather_only(stdout, test_comm, max_mem);
    	scatter_only(stdout, test_comm, max_mem);
    	alltoall_only(stdout, test_comm, max_mem);
    	reduce_only(stdout, test_comm, max_mem);
    	allreduce_only(stdout, test_comm, max_mem);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_WORLD_REORDERED
    test_comm = comm_world_reordered;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "WORLD_REORDERED" );

    {
    	MPI_Barrier( test_comm );
    	bcast_only(stdout, test_comm, max_mem);
    	gather_only(stdout, test_comm, max_mem);
    	allgather_only(stdout, test_comm, max_mem);
    	scatter_only(stdout, test_comm, max_mem);
    	alltoall_only(stdout, test_comm, max_mem);
    	reduce_only(stdout, test_comm, max_mem);
    	allreduce_only(stdout, test_comm, max_mem);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_WORLD_MINUS_ONE
    test_comm = comm_world_minus_one;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "WORLD_MINUS_ONE" );

    if (left_out==0)
    {
    	MPI_Barrier( test_comm );
    	bcast_only(stdout, test_comm, max_mem);
    	gather_only(stdout, test_comm, max_mem);
    	allgather_only(stdout, test_comm, max_mem);
    	scatter_only(stdout, test_comm, max_mem);
    	alltoall_only(stdout, test_comm, max_mem);
    	reduce_only(stdout, test_comm, max_mem);
    	allreduce_only(stdout, test_comm, max_mem);
    	reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if DO_LEFT_RIGHT
    test_comm = comm_world_leftright;

    for (int i=0; i<2; i++)
    {
        MPI_Barrier( MPI_COMM_WORLD );

        if (world_rank==i)
        	printf("############## %s ##############\n", (i==0 ? "LEFT" : "RIGHT") );

        if (leftright==i)
        {
        	MPI_Barrier( test_comm );
        	bcast_only(stdout, test_comm, max_mem);
        	gather_only(stdout, test_comm, max_mem);
        	allgather_only(stdout, test_comm, max_mem);
        	scatter_only(stdout, test_comm, max_mem);
        	alltoall_only(stdout, test_comm, max_mem);
        	reduce_only(stdout, test_comm, max_mem);
        	allreduce_only(stdout, test_comm, max_mem);
        	reducescatterblock_only(stdout, test_comm, max_mem);
        }
    }
    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if DO_ODD_EVEN
    test_comm = comm_world_oddeven;

    for (int i=0; i<2; i++)
    {
        MPI_Barrier( MPI_COMM_WORLD );

        if (world_rank==i)
        	printf("############## %s ##############\n", (i==0 ? "EVEN" : "ODD") );

        if (oddeven==i)
        {
        	MPI_Barrier( test_comm );
        	bcast_only(stdout, test_comm, max_mem);
        	gather_only(stdout, test_comm, max_mem);
        	allgather_only(stdout, test_comm, max_mem);
        	scatter_only(stdout, test_comm, max_mem);
        	alltoall_only(stdout, test_comm, max_mem);
        	reduce_only(stdout, test_comm, max_mem);
        	allreduce_only(stdout, test_comm, max_mem);
        	reducescatterblock_only(stdout, test_comm, max_mem);
        }
    }
    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_GEOM_PROG
    test_comm = comm_geomprog;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
    	printf("############## %s ##############\n", "GEOM_PROG" );

    for (int i=0; i<geomprog_size; i++)
    	if (geomprog_list[i]==world_rank)
    	{
        	MPI_Barrier( test_comm );
        	bcast_only(stdout, test_comm, max_mem);
        	gather_only(stdout, test_comm, max_mem);
        	allgather_only(stdout, test_comm, max_mem);
        	scatter_only(stdout, test_comm, max_mem);
        	alltoall_only(stdout, test_comm, max_mem);
        	reduce_only(stdout, test_comm, max_mem);
        	allreduce_only(stdout, test_comm, max_mem);
        	reducescatterblock_only(stdout, test_comm, max_mem);
    	}

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

    if (world_rank==0)
        print_meminfo(stdout, "after MPI collective tests");

    /*********************************************************************************
     *                            CLEAN UP AND FINALIZE
     *********************************************************************************/

    for (int i=0; i<geomprog_size; i++)
        if (geomprog_list[i]==world_rank)
            MPI_Comm_free(&comm_geomprog);

    free(geomprog_list);

    MPI_Comm_free(&comm_world_minus_one);
    MPI_Comm_free(&comm_world_oddeven);
    MPI_Comm_free(&comm_world_leftright);
    MPI_Comm_free(&comm_world_reordered);
    MPI_Comm_free(&comm_world_dup);

    MPI_Barrier( MPI_COMM_WORLD );

    double t1 = MPI_Wtime();
    double dt = t1-t0;
    if (world_rank==0)
       printf("TEST FINISHED SUCCESSFULLY IN %lf SECONDS \n", dt);
    fflush(stdout);

    if (world_rank==0)
        print_meminfo(stdout, "before MPI_Finalize");

    MPI_Finalize();

    return 0;
}