Exemple #1
0
/*
 * Monte Carlo estimate of pi.
 *
 * Every rank draws n points in the unit square, counts how many fall inside
 * the quarter circle x*x + y*y < 1, and the per-rank counts are summed onto
 * the root, which prints 4 * hits / samples.
 *
 * argv[1] (optional): number of samples per rank, default 100000.  The value
 * parsed on the root is broadcast so every rank uses the same n.
 */
int main(int argc, char ** argv)
{
    int rc;

    /* These are the desired and available thread support.
       A hybrid code where all MPI calls are made from the main thread can use FUNNELED.
       If threads are making MPI calls, MULTIPLE is appropriate. */
    int requested = MPI_THREAD_FUNNELED, provided;

    /* MPICH2 will be substantially more efficient than OpenMPI 
       for MPI_THREAD_{FUNNELED,SERIALIZED} but this is unlikely
       to be a serious bottleneck. */
    rc = MPI_Init_thread(&argc, &argv, requested, &provided); CHECK_MPI(rc);
    if (provided<requested)
    {
        printf("MPI_Init_thread provided %s when %s was requested.  Exiting. \n",
               MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) );
        exit(1);
    }

    int world_size, world_rank;

    rc = MPI_Comm_size(MPI_COMM_WORLD,&world_size); CHECK_MPI(rc);
    rc = MPI_Comm_rank(MPI_COMM_WORLD,&world_rank); CHECK_MPI(rc);

    int root = 0, count = 1;

    /* Every rank parses its own argv, but the broadcast makes the root's
       value authoritative. */
    long i, n = (argc>1 ? atol(argv[1]) : 100000);
    rc = MPI_Bcast(&n, count, MPI_LONG, root, MPI_COMM_WORLD); CHECK_MPI(rc);

    /* total number of samples over all ranks, computed in long arithmetic */
    const long samples = (long)world_size * n;
    if (world_rank==0)
        printf("%d: using %ld samples.\n", world_rank, samples);

    /* Seed the RNG with something unique to a rank.  The seed is rank+1
       rather than rank because glibc maps a seed of 0 to 1, which would
       make ranks 0 and 1 draw exactly the same sample stream. */
    srand((unsigned)world_rank + 1u);

    long in = 0, total = 0;
    for (i=0;i<n;i++)
    {
        double x = (double)rand()/(double)RAND_MAX;
        double y = (double)rand()/(double)RAND_MAX;
        double z = x*x + y*y;
        if (z<1.0) in++;
    }

    rc = MPI_Reduce(&in, &total, count, MPI_LONG, MPI_SUM, root, MPI_COMM_WORLD); CHECK_MPI(rc);

    /* total is only meaningful on the root, so the estimate is formed there */
    if (world_rank==0)
    {
        double pi = 4.0*(double)total/(double)samples;
        printf("%d: pi = %12.8lf.\n", world_rank, pi);
    }

    MPI_Finalize();
    return 0;
}
Exemple #2
0
/*
 * Start MPI requesting MPI_THREAD_SERIALIZED and fill in the per-process
 * fields of *v:
 *   v->me     rank of this process in MPI_COMM_WORLD
 *   v->npes   size of MPI_COMM_WORLD
 *   v->pairs  npes / 2
 *   v->nxtpe  partner rank: me + pairs for ranks below pairs, me - pairs otherwise
 */
void init_mpi (struct pe_vars * v)
{
	int thread_level;

	MPI_Init_thread( NULL, NULL, MPI_THREAD_SERIALIZED, &thread_level );
	MPI_Query_thread(&thread_level);

	/* "WTF" is presumably what MPI_THREAD_STRING yields for a level it does
	   not recognise -- confirm against the macro's definition. */
	const char * level_name = (const char *)MPI_THREAD_STRING(thread_level);
	if (0 == strcmp(level_name, "WTF"))
		MPI_Abort (MPI_COMM_WORLD, 5);

	MPI_Comm_rank( MPI_COMM_WORLD, &(v->me) );
	MPI_Comm_size( MPI_COMM_WORLD, &(v->npes) );

	v->pairs = v->npes / 2;

	/* lower ranks pair upwards, the remaining ranks pair downwards */
	if ( v->me < v->pairs )
		v->nxtpe = v->me + v->pairs;
	else
		v->nxtpe = v->me - v->pairs;
}
Exemple #3
0
Fichier : scf.c Projet : sg0/gtfock
/// main for SCF
int main (int argc, char **argv)
{
    // init MPI
    int myrank;
    int nprocs;
    int provided;
#if defined (USE_ELEMENTAL)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &myrank );
    ElMPICommSize( MPI_COMM_WORLD, &nprocs );
    MPI_Query_thread(&provided);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
#endif
    if (myrank == 0)  {
        printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided));
    }
#if 0
    char hostname[1024];
    gethostname (hostname, 1024);
    printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname);
#endif

    // create basis set    
    BasisSet_t basis;
    CInt_createBasisSet(&basis);

    // input parameters and load basis set
    int nprow_fock;
    int npcol_fock;
    int nblks_fock;
    int nprow_purif;
    int nshells;
    int natoms;
    int nfunctions;
    int niters;
    if (myrank == 0) {
        if (argc != 8) {
            usage(argv[0]);
            MPI_Finalize();
            exit(0);
        }
        // init parameters
        nprow_fock = atoi(argv[3]);
        npcol_fock = atoi(argv[4]);
        nprow_purif = atoi(argv[5]);
        nblks_fock = atoi(argv[6]);
        niters = atoi(argv[7]);
        assert(nprow_fock * npcol_fock == nprocs);
        assert(nprow_purif * nprow_purif * nprow_purif  <= nprocs);
        assert(niters > 0);       
        CInt_loadBasisSet(basis, argv[1], argv[2]);
        nshells = CInt_getNumShells(basis);
        natoms = CInt_getNumAtoms(basis);
        nfunctions = CInt_getNumFuncs(basis);
        assert(nprow_fock <= nshells && npcol_fock <= nshells);
        assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions);
        printf("Job information:\n");
        char *fname;
        fname = basename(argv[2]);
        printf("  molecule:  %s\n", fname);
        fname = basename(argv[1]);
        printf("  basisset:  %s\n", fname);
        printf("  charge     = %d\n", CInt_getTotalCharge(basis));
        printf("  #atoms     = %d\n", natoms);
        printf("  #shells    = %d\n", nshells);
        printf("  #functions = %d\n", nfunctions);
        printf("  fock build uses   %d (%dx%d) nodes\n",
               nprow_fock * npcol_fock, nprow_fock, npcol_fock);
        printf("  purification uses %d (%dx%dx%d) nodes\n",
               nprow_purif * nprow_purif * nprow_purif,
               nprow_purif, nprow_purif, nprow_purif);
        printf("  #tasks = %d (%dx%d)\n",
               nblks_fock * nblks_fock * nprow_fock * nprow_fock,
               nblks_fock * nprow_fock, nblks_fock * nprow_fock);
        int nthreads = omp_get_max_threads();
        printf("  #nthreads_cpu = %d\n", nthreads);   
    }
    int btmp[8];
    btmp[0] = nprow_fock;
    btmp[1] = npcol_fock;
    btmp[2] = nprow_purif;
    btmp[3] = nblks_fock;
    btmp[4] = niters;
    btmp[5] = natoms;
    btmp[6] = nshells;
    btmp[7] = nfunctions;
    MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD);
    nprow_fock = btmp[0];
    npcol_fock = btmp[1];
    nprow_purif = btmp[2];
    nblks_fock = btmp[3];
    niters = btmp[4];
    natoms = btmp[5];
    nshells = btmp[6];
    nfunctions = btmp[7];

    // broadcast basis set
    void *bsbuf;
    int bsbufsize;
    if (myrank == 0) {
        CInt_packBasisSet(basis, &bsbuf, &bsbufsize);
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        bsbuf = (void *)malloc(bsbufsize);
        assert(bsbuf != NULL);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
        CInt_unpackBasisSet(basis, bsbuf);  
        free(bsbuf);
    }

    // init PFock
    if (myrank == 0) {
        printf("Initializing pfock ...\n");
    }
    PFock_t pfock;
    PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11,
                 MAX_NUM_D, IS_SYMM, &pfock);
    if (myrank == 0) {
        double mem_cpu;
        PFock_getMemorySize(pfock, &mem_cpu);
        printf("  CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0);
        printf("  Done\n");
    }

    // init purif
    purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif);
    init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock);

    // compute SCF
    if (myrank == 0) {
        printf("Computing SCF ...\n");
    }
    int rowstart = purif->srow_purif;
    int rowend = purif->nrows_purif + rowstart - 1;
    int colstart = purif->scol_purif;
    int colend = purif->ncols_purif + colstart - 1;
    double energy0 = -1.0;
    double totaltime = 0.0;
    double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions;
    double diis_flops;

    // set initial guess
    if (myrank == 0) {
        printf("  initialing D ...\n");
    }
    PFock_setNumDenMat(NUM_D, pfock);
    initial_guess(pfock, basis, purif->runpurif,
                  rowstart, rowend, colstart, colend,
                  purif->D_block, purif->ldx);

    MPI_Barrier(MPI_COMM_WORLD);

    // compute nuc energy
    double ene_nuc = CInt_getNucEnergy(basis);
    if (myrank == 0) {
        printf("  nuc energy = %.10f\n", ene_nuc);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    
    // main loop
    double t1, t2, t3, t4;
    for (int iter = 0; iter < niters; iter++) {
        if (myrank == 0) {
            printf("  iter %d\n", iter);
        }
        t3 = MPI_Wtime();

        // fock matrix construction
        t1 = MPI_Wtime();
        fock_build(pfock, basis, purif->runpurif,
                   rowstart, rowend, colstart, colend,
                   purif->ldx, purif->D_block, purif->F_block);
        if (myrank == 0) {
            printf("After fock build \n");
        }

        // compute energy
        double energy = compute_energy(purif, purif->F_block, purif->D_block);

        t2 = MPI_Wtime();
        if (myrank == 0) {
            printf("    fock build takes %.3f secs\n", t2 - t1);
            if (iter > 0) {
                printf("    energy %.10f (%.10f), %le\n",
                       energy + ene_nuc, energy, fabs (energy - energy0));
            }
            else {
                printf("    energy %.10f (%.10f)\n", energy + ene_nuc,
                       energy);
            }
        }
        if (iter > 0 && fabs (energy - energy0) < 1e-11) {
            niters = iter + 1;
            break;
        }
        energy0 = energy;

        // compute DIIS
        t1 = MPI_Wtime();
        compute_diis(pfock, purif, purif->D_block, purif->F_block, iter);
        t2 = MPI_Wtime();

        if (myrank == 0) {
            if (iter > 1) {
                diis_flops = purif_flops * 6.0;
            } else {
                diis_flops = purif_flops * 2.0;
            }
            printf("    diis takes %.3f secs, %.3lf Gflops\n",
                   t2 - t1, diis_flops / (t2 - t1) / 1e9);
        }
        
    #ifdef __SCF_OUT__
        if (myrank == 0) {
            double outbuf[nfunctions];
            char fname[1024];
            sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter);
            FILE *fp = fopen(fname, "w+");
            assert(fp != NULL);
            for (int i = 0; i < nfunctions; i++) {
                PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID,
                             i, i, USE_D_ID, nfunctions - 1,
                             outbuf, nfunctions);
                for (int j = 0; j < nfunctions; j++) {
                    fprintf(fp, "%.10e\n", outbuf[j]);
                }
            }
            fclose(fp);
        }
    #endif
    
        // purification
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        int it = compute_purification(purif, purif->F_block, purif->D_block);
        t2 = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    purification takes %.3f secs,"
                   " %d iterations, %.3f Gflops\n",
                   t2 - t1, it,
                   (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9);
        }
	/*
#if defined(USE_ELEMENTAL)
    ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] );
#else
    GA_Print (pfock->ga_D[USE_D_ID]);
#endif
*/
        t4 = MPI_Wtime ();
        totaltime += t4 - t3;

#ifdef __SCF_TIMING__
        PFock_getStatistics(pfock);
        double purif_timedgemm;
        double purif_timepdgemm;
        double purif_timepass;
        double purif_timetr;
        MPI_Reduce(&purif->timedgemm, &purif_timedgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepass, &purif_timepass,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timetr, &purif_timetr,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    Purification Statistics:\n");
            printf("      average totaltime  = %.3f\n"
                   "      average timetr     = %.3f\n"
                   "      average timedgemm  = %.3f, %.3f Gflops\n"
                   "      average timepdgemm = %.3f, %.3f Gflops\n",
                   purif_timepass / purif->np_purif,
                   purif_timetr / purif->np_purif,
                   purif_timedgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timedgemm / purif->np_purif) / 1e9,
                   purif_timepdgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9);
        }
#endif
    } /* for (iter = 0; iter < NITERATIONS; iter++) */

    if (myrank == 0) {
        printf("  totally takes %.3f secs: %.3f secs/iters\n",
               totaltime, totaltime / niters);
        printf("  Done\n");
    }

    destroy_purif(purif);
    PFock_destroy(pfock);
    CInt_destroyBasisSet(basis);

    MPI_Finalize();

    return 0;
}
Exemple #4
0
/*
 * Runs the collective tests once on `comm`: a barrier followed by the
 * bcast, gather, allgather, scatter, alltoall, reduce, allreduce and
 * reduce-scatter-block tests, each reporting to stdout with `max_mem`
 * passed through unchanged.  Must be called by every member of `comm`.
 */
static void run_collective_suite(MPI_Comm comm, int max_mem)
{
    MPI_Barrier( comm );
    bcast_only(stdout, comm, max_mem);
    gather_only(stdout, comm, max_mem);
    allgather_only(stdout, comm, max_mem);
    scatter_only(stdout, comm, max_mem);
    alltoall_only(stdout, comm, max_mem);
    reduce_only(stdout, comm, max_mem);
    allreduce_only(stdout, comm, max_mem);
    reducescatterblock_only(stdout, comm, max_mem);
}

/*
 * MPI test driver.
 *
 * Initializes MPI (plain MPI_Init or MPI_Init_thread, chosen at compile
 * time), derives several communicators from MPI_COMM_WORLD (dup, reordered,
 * left/right halves, odd/even, world minus one rank, geometric progression
 * of ranks), optionally runs the collective tests on each one depending on
 * the DO_* macros, then frees everything and finalizes.
 *
 * argv[1] (optional): max_mem value handed to the collective tests,
 * default 32*1024*1024.
 */
int main(int argc, char *argv[])
{
    /*********************************************************************************
     *                            INITIALIZE MPI
     *********************************************************************************/

    int world_size = 0, world_rank = -1;
    int provided = -1;

#if defined(USE_MPI_INIT)

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Init");

#else

    int requested = -1;

#  if defined(USE_MPI_INIT_THREAD_MULTIPLE)
    requested = MPI_THREAD_MULTIPLE;
#  elif defined(USE_MPI_INIT_THREAD_SERIALIZED)
    requested = MPI_THREAD_SERIALIZED;
#  elif defined(USE_MPI_INIT_THREAD_FUNNELED)
    requested = MPI_THREAD_FUNNELED;
#  else
    requested = MPI_THREAD_SINGLE;
#  endif

    MPI_Init_thread( &argc, &argv, requested, &provided );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Init_thread");

    /* more thread support than requested is harmless; less is fatal */
    if (provided>requested)
    {
        if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s, but this is okay. \n",
                                  MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) );
    }
    if (provided<requested)
    {
        if (world_rank==0) printf("MPI_Init_thread returned %s instead of %s so the test will exit. \n",
                                  MPI_THREAD_STRING(provided), MPI_THREAD_STRING(requested) );
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

#endif

    double t0 = MPI_Wtime();

    int is_init = 0;
    MPI_Initialized(&is_init);
    if (world_rank==0) printf("MPI %s initialized. \n", (is_init==1 ? "was" : "was not") );

    MPI_Query_thread(&provided);
    if (world_rank==0) printf("MPI thread support is %s. \n", MPI_THREAD_STRING(provided) );

    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
    if (world_rank==0) printf("MPI test program running on %d ranks. \n", world_size);

    char procname[MPI_MAX_PROCESSOR_NAME];
    int pnlen;
    MPI_Get_processor_name(procname,&pnlen);
    printf("%d: processor name = %s\n", world_rank, procname);

    /*********************************************************************************
     *                            SETUP MPI COMMUNICATORS
     *********************************************************************************/

    if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 1 \n");
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0) printf("MPI_Comm_dup of MPI_COMM_WORLD \n");
    MPI_Comm comm_world_dup;
    MPI_Comm_dup(MPI_COMM_WORLD, &comm_world_dup);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_dup");

    if (world_rank==0) printf("MPI_Barrier on comm_world_dup \n");
    MPI_Barrier( comm_world_dup );

    /* single colour, key decreasing with world rank: same members, reversed order */
    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into world_reordered \n");
    MPI_Comm comm_world_reordered;
    MPI_Comm_split(MPI_COMM_WORLD, 0, world_size-world_rank, &comm_world_reordered);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    /* leftright is 1 for the lower half of the ranks and 0 for the rest */
    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into left-right \n");
    MPI_Comm comm_world_leftright;
    int leftright = (world_rank<(world_size/2));
    MPI_Comm_split(MPI_COMM_WORLD, leftright, world_rank, &comm_world_leftright);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_leftright \n");
    MPI_Barrier( comm_world_leftright );

    /* oddeven is 0 for even world ranks and 1 for odd ones */
    if (world_rank==0) printf("MPI_Comm_split of MPI_COMM_WORLD into odd-even \n");
    MPI_Comm comm_world_oddeven;
    int oddeven = (world_rank%2);
    MPI_Comm_split(MPI_COMM_WORLD, oddeven, world_rank, &comm_world_oddeven);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_oddeven \n");
    MPI_Barrier( comm_world_oddeven );

    /* the middle rank gets a communicator of its own; everyone else shares one */
    if (world_rank==0) printf("MPI_Comm_split MPI_COMM_WORLD into (world-1) \n");
    MPI_Comm comm_world_minus_one;
    int left_out = world_rank==(world_size/2);
    MPI_Comm_split(MPI_COMM_WORLD, left_out, world_rank, &comm_world_minus_one);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_split");

    if (world_rank==0) printf("MPI_Barrier on comm_world_minus_one \n");
    MPI_Barrier( comm_world_minus_one );

    if (world_rank==0) printf("MPI_Comm_group of group_world from MPI_COMM_WORLD \n");
    MPI_Group group_world;
    MPI_Comm_group(MPI_COMM_WORLD, &group_world);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_group");

    int geomprog_size = (world_size==1) ? 1 : ceil(log2(world_size));

    int * geomprog_list = NULL;
    geomprog_list = (int *) safemalloc( geomprog_size * sizeof(int) );

    /* Ranks 0, 1, 3, 7, ... (2^i - 1).  The entries are distinct, so this
       rank matches at most one of them; remember whether it did. */
    int in_geomprog = 0;
    for (int i=0; i<geomprog_size; i++)
    {
        geomprog_list[i] = (1 << i) - 1;
        if (geomprog_list[i]==world_rank)
            in_geomprog = 1;
    }

    if (world_rank==0)
        for (int i=0; i<geomprog_size; i++)
            printf("geomprog_list[%d] = %d \n", i, geomprog_list[i]);

    if (world_rank==0) printf("MPI_Group_incl of group_geomprog (geometric progression) from group_world \n");
    MPI_Group group_geomprog;
    MPI_Group_incl(group_world, geomprog_size, geomprog_list, &group_geomprog);
    MPI_Group_free(&group_world);

    if (world_rank==0) printf("MPI_Comm_create of comm_geomprog from group_geomprog on MPI_COMM_WORLD \n");
    MPI_Comm comm_geomprog;
    MPI_Comm_create(MPI_COMM_WORLD, group_geomprog, &comm_geomprog);
    MPI_Group_free(&group_geomprog);

    if (world_rank==0)
        print_meminfo(stdout, "after MPI_Comm_create");

    /* only members of the group hold a usable comm_geomprog */
    if (world_rank==0) printf("MPI_Barrier on comm_geomprog \n");
    if (in_geomprog)
        MPI_Barrier( comm_geomprog );

    if (world_rank==0) printf("MPI_Barrier on MPI_COMM_WORLD 2 \n");
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        print_meminfo(stdout, "after MPI communicator creation");

    /*********************************************************************************
     *                            COLLECTIVES
     *********************************************************************************/

    int max_mem = (argc>1 ? atoi(argv[1]) : 32*1024*1024);

    MPI_Comm test_comm;

#if defined(DO_COMM_WORLD)
    test_comm = MPI_COMM_WORLD;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 1" );

    run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "MPI_COMM_WORLD - pass 2" );

    run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_COMM_WORLD_JITTER
    test_comm = MPI_COMM_WORLD;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "COMM_WORLD_JITTER" );

    {
        /* ranks divisible by higher powers of ten sleep longer before each test */
        int jitter = 0;
        if ((world_rank%10)==0) jitter++;
        if ((world_rank%100)==0) jitter++;
        if ((world_rank%1000)==0) jitter++;
        if ((world_rank%10000)==0) jitter++;
        if ((world_rank%100000)==0) jitter++;

        MPI_Barrier( test_comm );
        sleep(jitter);
        bcast_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        gather_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        allgather_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        scatter_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        alltoall_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        reduce_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        allreduce_only(stdout, test_comm, max_mem);

        MPI_Barrier( test_comm );
        sleep(jitter);
        reducescatterblock_only(stdout, test_comm, max_mem);
    }

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_COMM_WORLD_DUP
    test_comm = comm_world_dup;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "COMM_WORLD_DUP" );

    run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_WORLD_REORDERED
    test_comm = comm_world_reordered;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "WORLD_REORDERED" );

    run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_WORLD_MINUS_ONE
    test_comm = comm_world_minus_one;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "WORLD_MINUS_ONE" );

    /* the excluded rank sits this one out */
    if (left_out==0)
        run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if DO_LEFT_RIGHT
    test_comm = comm_world_leftright;

    for (int i=0; i<2; i++)
    {
        /* leftright==1 marks the lower ("left") half, so the pass labelled
           LEFT has to select colour 1 and the pass labelled RIGHT colour 0. */
        const int side = (i==0) ? 1 : 0;
        /* lowest world rank of that half prints the header */
        const int announcer = (side==1) ? 0 : world_size/2;

        MPI_Barrier( MPI_COMM_WORLD );

        if (world_rank==announcer)
            printf("############## %s ##############\n", (i==0 ? "LEFT" : "RIGHT") );

        if (leftright==side)
            run_collective_suite(test_comm, max_mem);
    }
    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if DO_ODD_EVEN
    test_comm = comm_world_oddeven;

    for (int i=0; i<2; i++)
    {
        MPI_Barrier( MPI_COMM_WORLD );

        /* world rank 0 is the first even rank, world rank 1 the first odd one */
        if (world_rank==i)
            printf("############## %s ##############\n", (i==0 ? "EVEN" : "ODD") );

        if (oddeven==i)
            run_collective_suite(test_comm, max_mem);
    }
    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#ifdef DO_GEOM_PROG
    test_comm = comm_geomprog;

    MPI_Barrier( MPI_COMM_WORLD );

    if (world_rank==0)
        printf("############## %s ##############\n", "GEOM_PROG" );

    if (in_geomprog)
        run_collective_suite(test_comm, max_mem);

    fflush(stdout);
    MPI_Barrier( MPI_COMM_WORLD );
#endif

    if (world_rank==0)
        print_meminfo(stdout, "after MPI collective tests");

    /*********************************************************************************
     *                            CLEAN UP AND FINALIZE
     *********************************************************************************/

    /* ranks outside the group have no comm_geomprog to free */
    if (in_geomprog)
        MPI_Comm_free(&comm_geomprog);

    free(geomprog_list);

    MPI_Comm_free(&comm_world_minus_one);
    MPI_Comm_free(&comm_world_oddeven);
    MPI_Comm_free(&comm_world_leftright);
    MPI_Comm_free(&comm_world_reordered);
    MPI_Comm_free(&comm_world_dup);

    MPI_Barrier( MPI_COMM_WORLD );

    double t1 = MPI_Wtime();
    double dt = t1-t0;
    if (world_rank==0)
       printf("TEST FINISHED SUCCESSFULLY IN %lf SECONDS \n", dt);
    fflush(stdout);

    if (world_rank==0)
        print_meminfo(stdout, "before MPI_Finalize");

    MPI_Finalize();

    return 0;
}