Example #1
FORT_DLL_SPEC void FORT_CALL mpi_alltoallw_ ( void*v1, MPI_Fint v2[], MPI_Fint v3[], MPI_Fint v4[], void*v5, MPI_Fint v6[], MPI_Fint v7[], MPI_Fint v8[], MPI_Fint *v9, MPI_Fint *ierr ){
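    /* MPICH-generated Fortran shim: v2/v3 and v6/v7 are the Fortran
     * INTEGER count/displacement arrays, v4/v8 the datatype handle
     * arrays, and *v9 the communicator handle; the C result code is
     * returned through ierr rather than as a return value. */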

#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif
    if (v1 == MPIR_F_MPI_IN_PLACE) v1 = MPI_IN_PLACE;
    *ierr = MPI_Alltoallw( v1, v2, v3, (MPI_Datatype *)(v4), v5, v6, v7, (MPI_Datatype *)(v8), (MPI_Comm)(*v9) );
}
Example #2
FC_FUNC( mpi_alltoallw , MPI_ALLTOALLW )
           ( void *sendbuf, int *sendcounts, int *sdispls, int *sendtypes,
             void *recvbuf, int *recvcounts, int *rdispls, int *recvtypes,
             int *comm, int *ierror )
{
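  /* Note: passing the int arrays straight through as the datatype
   * arguments (and *comm as the communicator) only compiles where MPI
   * handles are ints, as in MPICH-derived implementations; elsewhere
   * MPI_Type_f2c / MPI_Comm_f2c conversions would be needed. */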

  *ierror=MPI_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
			recvbuf, recvcounts, rdispls, recvtypes,
			*comm);

}
Example #3
#include "mpi.h"
#include "mpitest.h"   /* MTest_* harness from the MPICH test suite */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main( int argc, char **argv )
{

    MPI_Comm comm;
    int      *sbuf, *rbuf;
    int      rank, size;
    int      *sendcounts, *recvcounts, *rdispls, *sdispls;
    int      i, j, *p, err;
    MPI_Datatype *sendtypes, *recvtypes;
    
    MTest_Init( &argc, &argv );
    err = 0;
    
    while (MTestGetIntracommGeneral( &comm, 2, 1 )) {
      if (comm == MPI_COMM_NULL) continue;

      /* Create the buffer */
      MPI_Comm_size( comm, &size );
      MPI_Comm_rank( comm, &rank );
      sbuf = (int *)malloc( size * size * sizeof(int) );
      rbuf = (int *)malloc( size * size * sizeof(int) );
      if (!sbuf || !rbuf) {
	fprintf( stderr, "Could not allocated buffers!\n" );
	MPI_Abort( comm, 1 );
      }
      
      /* Load up the buffers */
      for (i=0; i<size*size; i++) {
	sbuf[i] = i + 100*rank;
	rbuf[i] = -i;
      }
      
      /* Create and load the arguments to alltoallw */
      sendcounts = (int *)malloc( size * sizeof(int) );
      recvcounts = (int *)malloc( size * sizeof(int) );
      rdispls    = (int *)malloc( size * sizeof(int) );
      sdispls    = (int *)malloc( size * sizeof(int) );
      sendtypes  = (MPI_Datatype *)malloc( size * sizeof(MPI_Datatype) );
      recvtypes  = (MPI_Datatype *)malloc( size * sizeof(MPI_Datatype) );
      if (!sendcounts || !recvcounts || !rdispls || !sdispls || !sendtypes || !recvtypes) {
	fprintf( stderr, "Could not allocate arg items!\n" );
	MPI_Abort( comm, 1 );
      }
      /* Note that process 0 sends no data (sendcounts[0] = 0) */
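      /* Rank r sends i ints to process i and receives r ints from each
       * process.  Alltoallw displacements are in bytes, hence the
       * sizeof(int) factors; sdispls places the block destined for
       * process i at the triangular offset i*(i+1)/2 ints, so the send
       * blocks never overlap. */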
      for (i=0; i<size; i++) {
	sendcounts[i] = i;
	recvcounts[i] = rank;
	rdispls[i]    = i * rank * sizeof(int);
	sdispls[i]    = (((i+1) * (i))/2) * sizeof(int);
        sendtypes[i] = recvtypes[i] = MPI_INT;
      }
      MPI_Alltoallw( sbuf, sendcounts, sdispls, sendtypes,
		     rbuf, recvcounts, rdispls, recvtypes, comm );
      
      /* Check rbuf */
      for (i=0; i<size; i++) {
	p = rbuf + rdispls[i]/sizeof(int);
	for (j=0; j<rank; j++) {
	  if (p[j] != i * 100 + (rank*(rank+1))/2 + j) {
	    fprintf( stderr, "[%d] got %d expected %d for %dth\n",
		     rank, p[j],(i*(i+1))/2 + j, j );
	    err++;
	  }
	}
      }

      free(sendtypes);
      free(sdispls);
      free(sendcounts);
      free(sbuf);

#if MTEST_HAVE_MIN_MPI_VERSION(2,2)
      /* check MPI_IN_PLACE, added in MPI-2.2 */
      free( rbuf );
      rbuf = (int *)malloc( size * (2 * size) * sizeof(int) );
      if (!rbuf) {
        fprintf( stderr, "Could not reallocate rbuf!\n" );
        MPI_Abort( comm, 1 );
      }

      /* Load up the buffers */
      for (i = 0; i < size; i++) {
        /* alltoallw displs are in bytes, not in type extents */
        rdispls[i]    = i * (2 * size) * sizeof(int);
        recvtypes[i]  = MPI_INT;
        recvcounts[i] = i + rank;
      }
      memset(rbuf, -1, size * (2 * size) * sizeof(int));
      for (i=0; i < size; i++) {
        p = rbuf + (rdispls[i] / sizeof(int));
        for (j = 0; j < recvcounts[i]; ++j) {
          p[j] = 100 * rank + 10 * i + j;
        }
      }

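      /* MPI_IN_PLACE: the send arguments are ignored and each block of
       * rbuf serves as both source and destination, so after the call
       * block i holds the data that rank i had stored for this rank. */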
      MPI_Alltoallw( MPI_IN_PLACE, NULL, NULL, NULL,
                     rbuf, recvcounts, rdispls, recvtypes, comm );

      /* Check rbuf */
      for (i=0; i<size; i++) {
        p = rbuf + (rdispls[i] / sizeof(int));
        for (j=0; j<recvcounts[i]; j++) {
          int expected = 100 * i + 10 * rank + j;
          if (p[j] != expected) {
            fprintf(stderr, "[%d] got %d expected %d for block=%d, element=%dth\n",
                    rank, p[j], expected, i, j);
            ++err;
          }
        }
      }
#endif

      free(recvtypes);
      free(rdispls);
      free(recvcounts);
      free(rbuf);
      MTestFreeComm( &comm );
    }

    MTest_Finalize( err );
    MPI_Finalize();
    return 0;
}
Example #4
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* minimal definitions assumed from the surrounding test harness so the
 * snippet compiles on its own */
#define NUM_INTS (2)

#define my_assert(cond_)                                                  \
    do {                                                                  \
        if (!(cond_)) {                                                   \
            fprintf(stderr, "assertion (%s) failed, aborting\n", #cond_); \
            MPI_Abort(MPI_COMM_WORLD, 1);                                 \
        }                                                                 \
    } while (0)

int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }
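    /* Every call below aliases its send and receive buffers without
     * MPI_IN_PLACE, which the MPI standard forbids.  With
     * MPI_ERRORS_RETURN installed, each call is expected to fail locally
     * before any communication happens (hence the rank == 0 guard on the
     * rooted collectives), so a returned MPI_SUCCESS counts as an error. */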

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Allgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT, sbuf, rcounts, rdispls,
                       MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallv(sbuf, scounts, sdispls, MPI_INT, sbuf, scounts, sdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallw(sbuf, scounts, sdispls, types, sbuf, scounts, sdispls, types, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS == MPI_Reduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Scan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Exscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (sbuf)
        free(sbuf);
    if (rbuf)
        free(rbuf);
    if (scounts)
        free(scounts);
    if (rcounts)
        free(rcounts);
    if (sdispls)
        free(sdispls);
    if (rdispls)
        free(rdispls);
    if (types)
        free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}
Example #5
void Transpose(float *localA, float *localB, int M, int N, MPI_Comm comm)
/* transpose MxN matrix A that is block distributed (1-D) on  
   processes of comm onto block distributed matrix B  */
{
  int i, j, extent, myrank, p, n[2], m[2];
  int lasti, lastj;
  int *sendcounts, *recvcounts;
  int *sdispls, *rdispls;
  MPI_Datatype xtype[2][2], stype[2][2], *sendtypes, *recvtypes;

  MTestPrintfMsg( 2, "M = %d, N = %d\n", M, N );

  /* compute parameters */
  MPI_Comm_size(comm, &p);
  MPI_Comm_rank(comm, &myrank);
  extent = sizeof(float);

  /* allocate arrays */
  sendcounts = (int *)malloc(p*sizeof(int));
  recvcounts = (int *)malloc(p*sizeof(int));
  sdispls    = (int *)malloc(p*sizeof(int));
  rdispls    = (int *)malloc(p*sizeof(int));
  sendtypes  = (MPI_Datatype *)malloc(p*sizeof(MPI_Datatype));
  recvtypes  = (MPI_Datatype *)malloc(p*sizeof(MPI_Datatype));

  /* compute block sizes */
  m[0] = M/p;
  m[1] = M - (p-1)*(M/p);
  n[0] = N/p;
  n[1] = N - (p-1)*(N/p);

  /* compute types */
  for (i=0; i <= 1; i++)
      for (j=0; j <= 1; j++) {
	  xtype[i][j] = transpose_type(N, m[i], n[j], MPI_FLOAT);
	  stype[i][j] = submatrix_type(M, m[i], n[j], MPI_FLOAT);
      }
  
  /* prepare collective operation arguments */
  /* the last process along each dimension owns the (possibly larger)
     remainder block, so index the block types with lasti/lastj */
  lasti = myrank == p-1;
  for (j=0; j < p; j++) {
    lastj         = j == p-1;
    sendcounts[j] = 1;
    sdispls[j]    = j*n[0]*extent;
    sendtypes[j]  = xtype[lasti][lastj];
    recvcounts[j] = 1;
    rdispls[j]    = j*m[0]*extent;
    recvtypes[j]  = stype[lastj][lasti];
  }
  
  /* communicate */
  MTestPrintfMsg( 2, "Begin Alltoallw...\n" ); 
  /* -- Note that the book incorrectly uses &localA and &localB 
     as arguments to MPI_Alltoallw */
  MPI_Alltoallw(localA, sendcounts, sdispls, sendtypes, 
                localB, recvcounts, rdispls, recvtypes, comm);
  MTestPrintfMsg( 2, "Done with Alltoallw\n" ); 

  /* Free buffers */
  free( sendcounts );
  free( recvcounts );
  free( sdispls );
  free( rdispls );
  free( sendtypes );
  free( recvtypes );

  /* Free datatypes */
  for (i=0; i <= 1; i++)
      for (j=0; j <= 1; j++) {
	  MPI_Type_free( &xtype[i][j] );
	  MPI_Type_free( &stype[i][j] );
      }
}
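The two datatype constructors used above, transpose_type and submatrix_type, are not shown here. What follows is a minimal sketch of plausible implementations, assuming row-major float storage; the real helpers accompany this Transpose routine in the MPICH test suite and may differ in detail.

/* Sketch only: assumed shapes of the two helper constructors. */

MPI_Datatype transpose_type(int N, int m, int n, MPI_Datatype type)
/* datatype that reads an m x n block in transposed order out of a
   row-major array whose rows hold N elements */
{
    MPI_Datatype subrow, subrow1, submatrix;
    MPI_Aint lb, extent;

    /* one column of the block: m elements with a stride of a full row */
    MPI_Type_vector(m, 1, N, type, &subrow);
    /* shrink its extent to one element so consecutive columns abut */
    MPI_Type_get_extent(type, &lb, &extent);
    MPI_Type_create_resized(subrow, 0, extent, &subrow1);
    /* n such columns side by side give the transposed block */
    MPI_Type_contiguous(n, subrow1, &submatrix);
    MPI_Type_commit(&submatrix);
    MPI_Type_free(&subrow);
    MPI_Type_free(&subrow1);
    return submatrix;
}

MPI_Datatype submatrix_type(int M, int m, int n, MPI_Datatype type)
/* datatype for an m x n submatrix of a row-major array whose rows
   hold M elements */
{
    MPI_Datatype submatrix;

    MPI_Type_vector(m, n, M, type, &submatrix);
    MPI_Type_commit(&submatrix);
    return submatrix;
}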
Example #6
/* Avery Ching and Kenin Coloma's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;

    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */

    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;

    int *alltoallw_disps;

    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;

    char *cb_buf = NULL;

    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;   /* client perspective */
    ADIO_Offset *client_comm_sz_arr;    /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;       /* client perspective */

    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;

    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif

    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call. Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * continuing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *)
            ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2, ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved? */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */

            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    if ((cb_enable == ADIOI_HINT_DISABLE || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype,
                                    ADIO_EXPLICIT_OFFSET, off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype,
                                     ADIO_EXPLICIT_OFFSET, off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                    0, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                     0, status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                                 offset, status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                                  offset, status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *)
            ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;

    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count,
                          datatype, offset, my_mem_view_state_arr,
                          agg_file_view_state_arr, client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                             client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req; this could be moved
         * to a more appropriate section */

        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif


    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    alltoallw_counts = client_alltoallw_counts = (int *)
        ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr,
                                 client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */

            client_comm_requests = (MPI_Request *)
                ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));

            for (i = 0; i < fd->hints->cb_nodes; i++) {
                clients_agg_count = 0;
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));

#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;

                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                                size, client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr,
                                         client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);

                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;

                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif

                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = (MPI_Status *)
                        ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif

                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }

            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr, agg_comm_dtype_arr);

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
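                /* read case: aggregators scatter the file data staged in
                 * cb_buf back to the clients; non-aggregators participate
                 * with the zero-count MPI_BYTE placeholder types set up
                 * earlier */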
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr,
                              buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr,
                              cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                                 client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req; this could be
             * moved to a more appropriate section */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr,
                                     client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
        }

        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

    }

    /* Clean up */

    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }


    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* mark the system file position as unknown */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}