/* Fortran binding for MPI_Alltoallw: converts Fortran arguments (handles
 * passed as MPI_Fint) to their C forms and forwards to the C routine,
 * storing the return code through ierr. */
FORT_DLL_SPEC void FORT_CALL mpi_alltoallw_ ( void*v1, MPI_Fint v2[], MPI_Fint v3[], MPI_Fint v4[], void*v5, MPI_Fint v6[], MPI_Fint v7[], MPI_Fint v8[], MPI_Fint *v9, MPI_Fint *ierr )
{
#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    /* Lazily initialize the Fortran support layer on first use. */
    if (MPIR_F_NeedInit) {
        mpirinitf_();
        MPIR_F_NeedInit = 0;
    }
#endif

    /* Map the Fortran MPI_IN_PLACE sentinel onto the C constant. */
    void *sendbuf = (v1 == MPIR_F_MPI_IN_PLACE) ? MPI_IN_PLACE : v1;

    *ierr = MPI_Alltoallw( sendbuf, v2, v3, (MPI_Datatype *)(v4),
                           v5, v6, v7, (MPI_Datatype *)(v8),
                           (MPI_Comm)(*v9) );
}
/* Thin Fortran-callable stub for MPI_Alltoallw: every argument arrives by
 * reference from Fortran, so the communicator is dereferenced and the rest
 * are forwarded unchanged; the C return code is stored through ierror. */
FC_FUNC( mpi_alltoallw , MPI_ALLTOALLW )
     ( void *sendbuf, int *sendcounts, int *sdispls, int *sendtypes,
       void *recvbuf, int *recvcounts, int *rdispls, int *recvtypes,
       int *comm, int *ierror )
{
    *ierror = MPI_Alltoallw(sendbuf, sendcounts, sdispls, sendtypes,
                            recvbuf, recvcounts, rdispls, recvtypes,
                            *comm);
}
int main( int argc, char **argv ) { MPI_Comm comm; int *sbuf, *rbuf; int rank, size; int *sendcounts, *recvcounts, *rdispls, *sdispls; int i, j, *p, err; MPI_Datatype *sendtypes, *recvtypes; MTest_Init( &argc, &argv ); err = 0; while (MTestGetIntracommGeneral( &comm, 2, 1 )) { if (comm == MPI_COMM_NULL) continue; /* Create the buffer */ MPI_Comm_size( comm, &size ); MPI_Comm_rank( comm, &rank ); sbuf = (int *)malloc( size * size * sizeof(int) ); rbuf = (int *)malloc( size * size * sizeof(int) ); if (!sbuf || !rbuf) { fprintf( stderr, "Could not allocated buffers!\n" ); MPI_Abort( comm, 1 ); } /* Load up the buffers */ for (i=0; i<size*size; i++) { sbuf[i] = i + 100*rank; rbuf[i] = -i; } /* Create and load the arguments to alltoallv */ sendcounts = (int *)malloc( size * sizeof(int) ); recvcounts = (int *)malloc( size * sizeof(int) ); rdispls = (int *)malloc( size * sizeof(int) ); sdispls = (int *)malloc( size * sizeof(int) ); sendtypes = (MPI_Datatype *)malloc( size * sizeof(MPI_Datatype) ); recvtypes = (MPI_Datatype *)malloc( size * sizeof(MPI_Datatype) ); if (!sendcounts || !recvcounts || !rdispls || !sdispls || !sendtypes || !recvtypes) { fprintf( stderr, "Could not allocate arg items!\n" ); MPI_Abort( comm, 1 ); } /* Note that process 0 sends no data (sendcounts[0] = 0) */ for (i=0; i<size; i++) { sendcounts[i] = i; recvcounts[i] = rank; rdispls[i] = i * rank * sizeof(int); sdispls[i] = (((i+1) * (i))/2) * sizeof(int); sendtypes[i] = recvtypes[i] = MPI_INT; } MPI_Alltoallw( sbuf, sendcounts, sdispls, sendtypes, rbuf, recvcounts, rdispls, recvtypes, comm ); /* Check rbuf */ for (i=0; i<size; i++) { p = rbuf + rdispls[i]/sizeof(int); for (j=0; j<rank; j++) { if (p[j] != i * 100 + (rank*(rank+1))/2 + j) { fprintf( stderr, "[%d] got %d expected %d for %dth\n", rank, p[j],(i*(i+1))/2 + j, j ); err++; } } } free(sendtypes); free(sdispls); free(sendcounts); free(sbuf); #if MTEST_HAVE_MIN_MPI_VERSION(2,2) /* check MPI_IN_PLACE, added in MPI-2.2 */ free( rbuf ); rbuf = 
(int *)malloc( size * (2 * size) * sizeof(int) ); if (!rbuf) { fprintf( stderr, "Could not reallocate rbuf!\n" ); MPI_Abort( comm, 1 ); } /* Load up the buffers */ for (i = 0; i < size; i++) { /* alltoallw displs are in bytes, not in type extents */ rdispls[i] = i * (2 * size) * sizeof(int); recvtypes[i] = MPI_INT; recvcounts[i] = i + rank; } memset(rbuf, -1, size * (2 * size) * sizeof(int)); for (i=0; i < size; i++) { p = rbuf + (rdispls[i] / sizeof(int)); for (j = 0; j < recvcounts[i]; ++j) { p[j] = 100 * rank + 10 * i + j; } } MPI_Alltoallw( MPI_IN_PLACE, NULL, NULL, NULL, rbuf, recvcounts, rdispls, recvtypes, comm ); /* Check rbuf */ for (i=0; i<size; i++) { p = rbuf + (rdispls[i] / sizeof(int)); for (j=0; j<recvcounts[i]; j++) { int expected = 100 * i + 10 * rank + j; if (p[j] != expected) { fprintf(stderr, "[%d] got %d expected %d for block=%d, element=%dth\n", rank, p[j], expected, i, j); ++err; } } } #endif free(recvtypes); free(rdispls); free(recvcounts); free(rbuf); MTestFreeComm( &comm ); } MTest_Finalize( err ); MPI_Finalize(); return 0; }
/* Error-checking test: every collective below aliases sbuf as both the send
 * and the receive buffer WITHOUT using MPI_IN_PLACE.  A compliant MPI (with
 * MPI_ERRORS_RETURN installed) must fail such calls, so a call that returns
 * MPI_SUCCESS counts as a test failure (errs++). */
int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);
    comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    /* errors must be returned (not abort) so the MPI_SUCCESS checks below work */
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    for (i = 0; i < size; ++i) {
        /* NOTE(review): indexes 2*i and 2*i+1 into buffers of NUM_INTS*size
         * ints — this initialization appears to assume NUM_INTS >= 2 and only
         * fills the first 2*size slots; confirm against NUM_INTS' definition */
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    /* rooted collectives: only the root (rank 0) passes the aliased buffer pair */
    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    /* non-rooted collectives: every rank passes the aliased buffers */
    if (MPI_SUCCESS == MPI_Allgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Allgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT,
                       sbuf, rcounts, rdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallv(sbuf, scounts, sdispls, MPI_INT,
                      sbuf, scounts, sdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallw(sbuf, scounts, sdispls, types,
                      sbuf, scounts, sdispls, types, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Reduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Reduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Scan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Exscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (sbuf)
        free(sbuf);
    if (rbuf)
        free(rbuf);
    if (scounts)
        free(scounts);
    if (rcounts)
        free(rcounts);
    if (sdispls)
        free(sdispls);
    if (rdispls)
        free(rdispls);
    if (types)
        free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}
void Transpose(float *localA, float *localB, int M, int N, MPI_Comm comm)
/* transpose MxN matrix A that is block distributed (1-D) on
   processes of comm onto block distributed matrix B */
{
    int bi, bj, k, extent, myrank, p;
    int mblk[2], nblk[2];       /* [0] = interior block size, [1] = last rank's */
    int mine, last;             /* 0/1 selectors into the 2x2 datatype tables */
    int *sendcounts, *recvcounts;
    int *sdispls, *rdispls;
    MPI_Datatype xtype[2][2], stype[2][2], *sendtypes, *recvtypes;

    MTestPrintfMsg( 2, "M = %d, N = %d\n", M, N );

    /* compute parameters */
    MPI_Comm_size(comm, &p);
    MPI_Comm_rank(comm, &myrank);
    extent = sizeof(float);

    /* allocate arrays */
    sendcounts = (int *)malloc(p*sizeof(int));
    recvcounts = (int *)malloc(p*sizeof(int));
    sdispls    = (int *)malloc(p*sizeof(int));
    rdispls    = (int *)malloc(p*sizeof(int));
    sendtypes  = (MPI_Datatype *)malloc(p*sizeof(MPI_Datatype));
    recvtypes  = (MPI_Datatype *)malloc(p*sizeof(MPI_Datatype));

    /* compute block sizes; the last rank absorbs the remainder rows/cols */
    mblk[0] = M/p;
    mblk[1] = M - (p-1)*(M/p);
    nblk[0] = N/p;
    nblk[1] = N - (p-1)*(N/p);

    /* build the 2x2 family of datatypes (regular/last x regular/last) */
    for (bi = 0; bi < 2; bi++) {
        for (bj = 0; bj < 2; bj++) {
            xtype[bi][bj] = transpose_type(N, mblk[bi], nblk[bj], MPI_FLOAT);
            stype[bi][bj] = submatrix_type(M, mblk[bi], nblk[bj], MPI_FLOAT);
        }
    }

    /* prepare collective operation arguments; displacements are in bytes */
    mine = (myrank == p-1);
    for (k = 0; k < p; k++) {
        last = (k == p-1);
        sendcounts[k] = 1;
        sdispls[k]    = k*nblk[0]*extent;
        sendtypes[k]  = xtype[mine][last];
        recvcounts[k] = 1;
        rdispls[k]    = k*mblk[0]*extent;
        recvtypes[k]  = stype[last][mine];
    }

    /* communicate */
    MTestPrintfMsg( 2, "Begin Alltoallw...\n" );
    /* -- Note that the book incorrectly uses &localA and &localB as
       arguments to MPI_Alltoallw */
    MPI_Alltoallw( localA, sendcounts, sdispls, sendtypes,
                   localB, recvcounts, rdispls, recvtypes, comm );
    MTestPrintfMsg( 2, "Done with Alltoallw\n" );

    /* Free buffers */
    free( sendcounts );
    free( recvcounts );
    free( sdispls );
    free( rdispls );
    free( sendtypes );
    free( recvtypes );

    /* Free datatypes */
    for (bi = 0; bi < 2; bi++) {
        for (bj = 0; bj < 2; bj++) {
            MPI_Type_free( &xtype[bi][bj] );
            MPI_Type_free( &stype[bi][bj] );
        }
    }
}
/* Avery Ching and Kenin Columa's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 *
 * Overview (from the code below): rdwr selects ADIOI_READ or ADIOI_WRITE.
 * Aggregators (fd->is_agg) stage data through cb_buf; clients exchange
 * per-round amounts/datatypes with aggregators via Exch_data_amounts and
 * either point-to-point (cb_alltoall disabled) or MPI_Alltoallw.  The loop
 * runs until all nprocs_for_coll aggregators report done.  *error_code is
 * set by the ADIO read/write helpers.
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;
    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */
    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;
    int *alltoallw_disps;
    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;
    char *cb_buf = NULL;
    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;           /* client perspective */
    ADIO_Offset *client_comm_sz_arr;            /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;               /* client perspective */
    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;
    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif
    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call. Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * completing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *) ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2,
                      ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved? */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */
            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* No collective buffering needed (or allowed): fall back to independent
     * contiguous/strided I/O and return immediately. */
    if ((cb_enable == ADIOI_HINT_DISABLE ||
         (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                                    off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                                     off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0,
                                    status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0,
                                     status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset,
                                 status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset,
                                  status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *) ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *) ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;

    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count, datatype,
                          offset, my_mem_view_state_arr, agg_file_view_state_arr,
                          client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *) ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *) ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
        /* first round: build the aggregated file datatype and per-client
         * communication datatypes/sizes; total staged bytes -> buffered_io_size */
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr,
                             &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req. to an
         * appropriate section */
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif

    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* one 2*nprocs allocation split in half: first nprocs entries are the
     * client counts, last nprocs entries the aggregator counts */
    alltoallw_counts = client_alltoallw_counts = (int *) ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr, client_comm_sz_arr,
                                 &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */
            client_comm_requests =
                (MPI_Request *) ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                /* NOTE(review): clients_agg_count is reset to 0 on EVERY loop
                 * iteration, so client_comm_requests index 0 is reused and the
                 * Waitall counts below see at most 1; upstream ROMIO resets it
                 * once before the loop — verify against the original source. */
                clients_agg_count = 0;
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));
#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif
            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    /* NOTE(review): early return — the allocations and
                     * datatypes freed in the cleanup section below are leaked
                     * on this error path */
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size,
                            agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;
                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size,
                                client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr, client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);
                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }
                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size,
                            agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size,
                            client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif
                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses =
                        (MPI_Status *) ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    /* NOTE(review): early return leaks the cleanup-section
                     * allocations, same as the read path */
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr,
                                    agg_comm_dtype_arr);
            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, buf, agg_alltoallw_counts,
                              alltoallw_disps, agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, cb_buf, client_alltoallw_counts,
                              alltoallw_disps, client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr,
                                 &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req. to an
             * appropriate section */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr, client_comm_sz_arr,
                                     &agg_comm_requests, &aggs_client_count);
        }

        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts,
                          &aggregators_done);
    }

    /* Clean up */
    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }

    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }

    /* NOTE(review): agg_comm_dtype_arr entries with agg_comm_sz_arr[i] > 0
     * were also freed at the bottom of the while loop for the previous round;
     * verify the sizes left by the final Exch_data_amounts justify this
     * second free pass. */
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* set it to null. */

#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}