/* Generic implementation of IreadStrided: carries out the "nonblocking"
 * request by calling the blocking ReadStrided immediately, then records
 * the result in the request object.  The request is never actually
 * queued. */
void ADIOI_FAKE_IreadStrided(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request,
                             int *error_code)
{
    ADIO_Status status;
#ifdef HAVE_STATUS_SET_BYTES
    int typesize;
#endif

    /* Allocate and initialize the request object.  queued = 0 because
     * the operation completes synchronously below. */
    *request = ADIOI_Malloc_request();
    (*request)->optype = ADIOI_READ;
    (*request)->fd = fd;
    (*request)->datatype = datatype;
    (*request)->queued = 0;
    (*request)->handle = 0;

    /* Call the blocking function.  It will create an error code
     * if necessary. */
    ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                     offset, &status, error_code);

    fd->async_count++;

#ifdef HAVE_STATUS_SET_BYTES
    if (*error_code == MPI_SUCCESS) {
        MPI_Type_size(datatype, &typesize);
        /* widen before multiplying: count * typesize is an int * int
         * product and overflows for transfers of 2 GB or more */
        (*request)->nbytes = (MPI_Offset)count * (MPI_Offset)typesize;
    }
#endif
}
/* Generic implementation of IreadStrided: performs the read by invoking
 * the blocking ReadStrided routine right away, then hands back an
 * already-completed request object describing the transfer. */
void ADIOI_FAKE_IreadStrided(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Request *request,
                             int *error_code)
{
    ADIO_Status status;
    int dtype_size;
    MPI_Offset byte_count = 0;

    /* The blocking routine generates an error code if needed. */
    ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                     offset, &status, error_code);

    /* On success, report the full request size; on failure the
     * completed request carries zero bytes. */
    if (*error_code == MPI_SUCCESS) {
        MPI_Type_size(datatype, &dtype_size);
        byte_count = (MPI_Offset)count * (MPI_Offset)dtype_size;
    }

    MPIO_Completed_request_create(&fd, byte_count, error_code, request);
}
/*@
    MPI_File_read_shared - Read using shared file pointer

Input Parameters:
. fh - file handle (handle)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)
. status - status object (Status)

.N fortran
@*/
int MPI_File_read_shared(MPI_File fh, void *buf, int count,
                         MPI_Datatype datatype, MPI_Status *status)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
#ifndef PRINT_ERR_MSG
    static char myname[] = "MPI_FILE_READ_SHARED";
#endif
    int datatype_size, incr;
    ADIO_Offset off, shared_fp;

    /* Validate the file handle.  With PRINT_ERR_MSG the check prints and
     * aborts; otherwise ADIOI_TEST_FILE_HANDLE performs the equivalent
     * test with an error return. */
#ifdef PRINT_ERR_MSG
    if ((fh <= (MPI_File) 0) || (fh->cookie != ADIOI_FILE_COOKIE)) {
        FPRINTF(stderr, "MPI_File_read_shared: Invalid file handle\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
#else
    ADIOI_TEST_FILE_HANDLE(fh, myname);
#endif

    /* count must be nonnegative */
    if (count < 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_shared: Invalid count argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_COUNT_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    if (datatype == MPI_DATATYPE_NULL) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_shared: Invalid datatype\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_TYPE, MPIR_ERR_TYPE_NULL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    MPI_Type_size(datatype, &datatype_size);

    /* zero-size request: nothing to read */
    if (count*datatype_size == 0) return MPI_SUCCESS;

    /* a request must cover a whole number of etypes */
    if ((count*datatype_size) % fh->etype_size != 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_shared: Only an integral number of etypes can be accessed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_ETYPE_FRACTIONAL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    /* shared file pointers are not supported on these file systems */
    if ((fh->file_system == ADIO_PIOFS) || (fh->file_system == ADIO_PVFS)) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_shared: Shared file pointer not supported on PIOFS and PVFS\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_NO_SHARED_FP, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    /* fetch the shared file pointer and advance it by this request's
     * size in etypes */
    incr = (count*datatype_size)/fh->etype_size;
    ADIO_Get_shared_fp(fh, incr, &shared_fp, &error_code);
    if (error_code != MPI_SUCCESS) {
        FPRINTF(stderr, "MPI_File_read_shared: Error! Could not access shared file pointer.\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* contiguous or strided? */
    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and shared_fp to bytes */
        bufsize = datatype_size * count;
        off = fh->disp + fh->etype_size * shared_fp;

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request. On NFS,
           locking is done in the ADIO_ReadContig.*/

        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);

        ADIO_ReadContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                        off, status, &error_code);

        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }
    else
        ADIO_ReadStrided(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         shared_fp, status, &error_code);
    /* For strided and atomic mode, locking is done in ADIO_ReadStrided */

    return error_code;
}
/* ADIOI_GEN_ReadStridedColl: generic two-phase collective read. */
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code)
{
/* Uses a generalized version of the extended two-phase method described
   in "An Extended Two-Phase Method for Accessing Sections of
   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
   Scientific Programming, (5)4:301--317, Winter 1996.
   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_Access *my_req;
    /* array of nprocs structures, one for each other process in
       whose file domain this process's request lies */

    ADIOI_Access *others_req;
    /* array of nprocs structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
        *fd_end = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int *buf_idx = NULL;

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Count bufsize, size;
#endif

    /* persistent-file-realm mode: delegate the whole operation to the
       alternative collective implementation and return */
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
                             file_ptr_type, offset, status, error_code);
        return;
    }

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* number of aggregators, cb_nodes, is stored in the hints */
    nprocs_for_coll = fd->hints->cb_nodes;
    orig_fp = fd->fp_ind;   /* saved so it can be restored on the
                               non-aggregating fallback path below */

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be
           accessed. e.g., if start_offset=0 and 100 bytes to be read,
           end_offset=99 */

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

#ifdef RDCOLL_DEBUG
        for (i=0; i<contig_access_count; i++) {
            DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n",
                        myrank, offset_list[i], len_list[i]);
        }
#endif

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */

        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* are the accesses of different processes interleaved?  The second
           clause guards against zero-size requests, where end < start. */
        for (i=1; i<nprocs; i++)
            if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
        /* This is a rudimentary check for interleaving, but should suffice
           for the moment. */
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
        || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO)))
    {
        /* don't do aggregation */
        if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
            /* the offset/length lists were only allocated when the
               interleave check ran above */
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                ADIO_ReadContig(fd, buf, count, datatype,
                                ADIO_EXPLICIT_OFFSET,
                                off, status, error_code);
            }
            else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                 0, status, error_code);
        }
        else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);

        return;
    }

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     *
     */
    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
                            nprocs_for_coll, &min_st_offset,
                            &fd_start, &fd_end,
                            fd->hints->min_fdomain_size, &fd_size,
                            fd->hints->striping_unit);

    /* calculate where the portions of the access requests of this process
     * are located in terms of the file domains.  this could be on the same
     * process or on other processes.  this function fills in:
     * count_my_req_procs - number of processes (including this one) for which
     *     this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self).  indexed by rank.
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                      min_st_offset, fd_start, fd_end, fd_size,
                      nprocs, &count_my_req_procs,
                      &count_my_req_per_proc, &my_req,
                      &buf_idx);

    /* perform a collective communication in order to distribute the
     * data calculated above.  fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain.
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i lie in this process's file domain.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs,
                          count_my_req_per_proc, my_req,
                          nprocs, myrank, &count_others_req_procs,
                          &others_req);

    /* my_req[] and count_my_req_per_proc aren't needed at this point, so
     * let's free the memory
     */
    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i<nprocs; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);

    /* read data in sizes of no more than ADIOI_Coll_bufsize,
     * communicate, and fill user buf.
     */
    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
                        others_req, offset_list,
                        len_list, contig_access_count, min_st_offset,
                        fd_size, fd_start, fd_end, buf_idx, error_code);

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    for (i=0; i<nprocs; i++) {
        if (others_req[i].count) {
            ADIOI_Free(others_req[i].offsets);
            ADIOI_Free(others_req[i].lens);
            ADIOI_Free(others_req[i].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Type_size_x(datatype, &size);
    bufsize = size * count;
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf
       during collective I/O. */
#endif

    fd->fp_sys_posn = -1;   /* set it to null. */
}
/*@
    MPI_File_read_at - Read using explicit offset

Input Parameters:
. fh - file handle (handle)
. offset - file offset (nonnegative integer)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)
. status - status object (Status)

.N fortran
@*/
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
                     int count, MPI_Datatype datatype, MPI_Status *status)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
#ifndef PRINT_ERR_MSG
    static char myname[] = "MPI_FILE_READ_AT";
#endif
    int datatype_size;
    ADIO_Offset off;
#ifdef MPI_hpux
    int fl_xmpi;

    /* HP-UX instrumentation hooks bracket the whole call */
    HPMP_IO_START(fl_xmpi, BLKMPIFILEREADAT, TRDTBLOCK, fh, datatype, count);
#endif /* MPI_hpux */

    /* Validate the file handle (inline abort with PRINT_ERR_MSG,
     * macro-based error return otherwise). */
#ifdef PRINT_ERR_MSG
    if ((fh <= (MPI_File) 0) || (fh->cookie != ADIOI_FILE_COOKIE)) {
        FPRINTF(stderr, "MPI_File_read_at: Invalid file handle\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
#else
    ADIOI_TEST_FILE_HANDLE(fh, myname);
#endif

    /* explicit offsets must be nonnegative */
    if (offset < 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Invalid offset argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_OFFSET_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    if (count < 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Invalid count argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_COUNT_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    if (datatype == MPI_DATATYPE_NULL) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Invalid datatype\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_TYPE, MPIR_ERR_TYPE_NULL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    MPI_Type_size(datatype, &datatype_size);

    /* zero-size request: nothing to read */
    if (count*datatype_size == 0) {
#ifdef MPI_hpux
        HPMP_IO_END(fl_xmpi, fh, datatype, count);
#endif /* MPI_hpux */
        return MPI_SUCCESS;
    }

    /* a request must cover a whole number of etypes */
    if ((count*datatype_size) % fh->etype_size != 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Only an integral number of etypes can be accessed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_ETYPE_FRACTIONAL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    /* reads are not permitted on write-only files */
    if (fh->access_mode & MPI_MODE_WRONLY) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Can't read from a file opened with MPI_MODE_WRONLY\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_MODE_WRONLY, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    /* explicit-offset functions are disallowed in sequential mode */
    if (fh->access_mode & MPI_MODE_SEQUENTIAL) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_read_at: Can't use this function because file was opened with MPI_MODE_SEQUENTIAL\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_AMODE_SEQ, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    /* contiguous or strided? */
    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and offset to bytes */
        bufsize = datatype_size * count;
        off = fh->disp + fh->etype_size * offset;

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request. Locking
           doesn't work on PIOFS and PVFS, and on NFS it is done in the
           ADIO_ReadContig.*/

        if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) &&
            (fh->file_system != ADIO_NFS) && (fh->file_system != ADIO_PVFS))
            ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);

        ADIO_ReadContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                        off, status, &error_code);

        if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) &&
            (fh->file_system != ADIO_NFS) && (fh->file_system != ADIO_PVFS))
            ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }
    else
        ADIO_ReadStrided(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         offset, status, &error_code);
    /* For strided and atomic mode, locking is done in ADIO_ReadStrided */

#ifdef MPI_hpux
    HPMP_IO_END(fl_xmpi, fh, datatype, count);
#endif /* MPI_hpux */
    return error_code;
}
/* MPIOI_File_read: common implementation behind the MPI_File_read*
 * entry points.  Resolves the handle, validates arguments (the
 * MPIO_CHECK_* macros may set error_code and jump/return on failure),
 * and dispatches to the contiguous or strided ADIO read path.
 * myname carries the caller's routine name for error reporting. */
int MPIOI_File_read(MPI_File mpi_fh, MPI_Offset offset, int file_ptr_type,
                    void *buf, int count, MPI_Datatype datatype,
                    char *myname, MPI_Status *status)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
    int datatype_size;
    ADIO_File fh;
    ADIO_Offset off;

    MPID_CS_ENTER();
    MPIR_Nest_incr();

    fh = MPIO_File_resolve(mpi_fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(fh, myname, error_code);
    MPIO_CHECK_COUNT(fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(fh, datatype, myname, error_code);

    /* explicit offsets must be nonnegative */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPI_Type_size(datatype, &datatype_size);

    /* zero-size request: report zero bytes and succeed */
    if (count*datatype_size == 0) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        error_code = MPI_SUCCESS;
        goto fn_exit;
    }

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_READABLE(fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    /* complete any outstanding split-collective/deferred operation first */
    ADIOI_TEST_DEFERRED(fh, myname, &error_code);

    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and offset to bytes */
        bufsize = datatype_size * count;

        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fh->disp + fh->etype_size * offset;
        }
        else /* ADIO_INDIVIDUAL */ {
            off = fh->fp_ind;
        }

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request. Locking
           doesn't work on PIOFS and PVFS, and on NFS it is done in the
           ADIO_ReadContig. */
        if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) &&
            (fh->file_system != ADIO_NFS) && (fh->file_system != ADIO_PVFS) &&
            (fh->file_system != ADIO_PVFS2))
            ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);

        ADIO_ReadContig(fh, buf, count, datatype, file_ptr_type,
                        off, status, &error_code);

        if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) &&
            (fh->file_system != ADIO_NFS) && (fh->file_system != ADIO_PVFS) &&
            (fh->file_system != ADIO_PVFS2))
            ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }
    else {
        ADIO_ReadStrided(fh, buf, count, datatype, file_ptr_type,
                         offset, status, &error_code);
        /* For strided and atomic mode, locking is done in ADIO_ReadStrided */
    }

fn_exit:
    MPIR_Nest_decr();
    MPID_CS_EXIT();

    return error_code;
}
/* MPIOI_File_read: common implementation behind the MPI_File_read*
 * entry points (large-count, external32-aware variant).  Validation is
 * done by the MPIO_CHECK_* macros, which may set error_code and branch
 * on failure.  When the file uses external32 representation, the read
 * lands in a temporary buffer and is converted into the user's buffer
 * afterwards. */
int MPIOI_File_read(MPI_File fh, MPI_Offset offset, int file_ptr_type,
                    void *buf, int count, MPI_Datatype datatype,
                    char *myname, MPI_Status *status)
{
    int error_code, buftype_is_contig, filetype_is_contig;
    MPI_Count datatype_size;
    ADIO_File adio_fh;
    ADIO_Offset off, bufsize;
    void *xbuf=NULL, *e32_buf=NULL;

    ROMIO_THREAD_CS_ENTER();

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);

    /* explicit offsets must be nonnegative */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    /* zero-size request: report zero bytes and succeed */
    if (count*datatype_size == 0) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        error_code = MPI_SUCCESS;
        goto fn_exit;
    }

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_READABLE(adio_fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig);

    /* complete any outstanding deferred-open work first */
    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    /* external32: read raw bytes into a scratch buffer, convert later */
    xbuf = buf;
    if (adio_fh->is_external32) {
        MPI_Aint e32_size = 0;
        error_code = MPIU_datatype_full_size(datatype, &e32_size);
        if (error_code != MPI_SUCCESS)
            goto fn_exit;

        e32_buf = ADIOI_Malloc(e32_size*count);
        xbuf = e32_buf;
    }

    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and offset to bytes */
        bufsize = datatype_size * count;
        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = adio_fh->disp + adio_fh->etype_size * offset;
        }
        else /* ADIO_INDIVIDUAL */ {
            off = adio_fh->fp_ind;
        }

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request.
         */
        if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) {
            ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize);
        }

        ADIO_ReadContig(adio_fh, xbuf, count, datatype, file_ptr_type,
                        off, status, &error_code);

        if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) {
            ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize);
        }
    }
    else {
        ADIO_ReadStrided(adio_fh, xbuf, count, datatype, file_ptr_type,
                         offset, status, &error_code);
        /* For strided and atomic mode, locking is done in ADIO_ReadStrided */
    }

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

    /* NOTE(review): the external32 conversion runs even if the read
       reported an error — presumably harmless, but confirm intended. */
    if (e32_buf != NULL) {
        error_code = MPIU_read_external32_conversion_fn(buf, datatype,
                                                        count, e32_buf);
        ADIOI_Free(e32_buf);
    }

fn_exit:
    ROMIO_THREAD_CS_EXIT();

    return error_code;
}
/* ADIOI_GEN_ReadStridedColl: two-phase collective read (older variant:
 * cb_nodes is fetched from the info object and the interleave check
 * alone decides between collective and independent I/O). */
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code)
{
/* Uses a generalized version of the extended two-phase method described
   in "An Extended Two-Phase Method for Accessing Sections of
   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
   Scientific Programming, (5)4:301--317, Winter 1996.
   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_Access *my_req;
    /* array of nprocs structures, one for each other process in
       whose file domain this process's request lies */

    ADIOI_Access *others_req;
    /* array of nprocs structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int *len_list, contig_access_count, interleave_count, info_flag;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    int buftype_is_contig, *buf_idx;
    ADIO_Offset *offset_list, start_offset, end_offset, *st_offsets, orig_fp;
    ADIO_Offset *fd_start, *fd_end, fd_size, min_st_offset, *end_offsets;
    ADIO_Offset off;
    char *value;
#ifdef HAVE_STATUS_SET_BYTES
    int bufsize, size;
#endif

#ifdef PROFILE
    MPE_Log_event(13, 0, "start computation");
#endif

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* the number of processes that actually perform I/O, nprocs_for_coll,
       is stored in the info object. it is either = nprocs or a smaller
       number set by the user. get it from info. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    MPI_Info_get(fd->info, "cb_nodes", MPI_MAX_INFO_VAL, value, &info_flag);
    nprocs_for_coll = atoi(value);
    ADIOI_Free(value);

    /* For this process's request, calculate the list of offsets and
       lengths in the file and determine the start and end offsets. */

    /* Note: end_offset points to the last byte-offset that will be
       accessed. e.g., if start_offset=0 and 100 bytes to be read,
       end_offset=99 */

    orig_fp = fd->fp_ind;   /* saved for the noncollective fallback below */
    ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                          &offset_list, &len_list, &start_offset,
                          &end_offset, &contig_access_count);

/*  for (i=0; i<contig_access_count; i++) {
        FPRINTF(stderr, "rank %d off %ld len %d\n", myrank, offset_list[i],
                len_list[i]);
    }*/

    /* each process communicates its start and end offsets to other
       processes. The result is an array each of start and end offsets
       stored in order of process rank. */

    st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
    end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

    MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                  ADIO_OFFSET, fd->comm);
    MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                  ADIO_OFFSET, fd->comm);

    /* are the accesses of different processes interleaved?
       NOTE(review): unlike later revisions of this routine, there is no
       guard for zero-size requests (where end_offset < start_offset) —
       presumably benign here, but worth confirming. */
    interleave_count = 0;
    for (i=1; i<nprocs; i++)
        if (st_offsets[i] < end_offsets[i-1]) interleave_count++;
    /* This is a rudimentary check for interleaving, but should suffice
       for the moment. */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    if (!interleave_count) {
        /* no interleaving of requests. noncollective is good enough */

        ADIOI_Free(offset_list);
        ADIOI_Free(len_list);
        ADIOI_Free(st_offsets);
        ADIOI_Free(end_offsets);

        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                ADIO_ReadContig(fd, buf, count, datatype,
                                ADIO_EXPLICIT_OFFSET,
                                off, status, error_code);
            }
            else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                 0, status, error_code);
        }
        else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);

        return;
    }

    /* Divide the I/O workload among "nprocs_for_coll" processes. This is
       done by (logically) dividing the file into file domains (FDs); each
       process may directly access only its own file domain. */

    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
                            nprocs_for_coll, &min_st_offset,
                            &fd_start, &fd_end, &fd_size);

    /* calculate what portions of the access requests of this process are
       located in the file domains of other processes */

    ADIOI_Calc_my_req(offset_list, len_list, contig_access_count,
                      min_st_offset, fd_start, fd_end, fd_size,
                      nprocs, nprocs_for_coll,
                      &count_my_req_procs,
                      &count_my_req_per_proc, &my_req,
                      &buf_idx);

    /* based on everyone's my_req, calculate what requests of other
       processes lie in this process's file domain.
       count_others_req_procs = number of processes whose requests lie in
       this process's file domain (including this process itself)
       count_others_req_per_proc[i] indicates how many separate contiguous
       requests of proc. i lie in this process's file domain. */

    ADIOI_Calc_others_req(fd, count_my_req_procs,
                          count_my_req_per_proc, my_req,
                          nprocs, myrank, nprocs_for_coll,
                          &count_others_req_procs, &others_req);

    /* my_req and its per-process counts are no longer needed */
    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i<nprocs_for_coll; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);

    /* read data in sizes of no more than ADIOI_Coll_bufsize,
       communicate, and fill user buf. */

    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, nprocs_for_coll,
                        myrank, others_req, offset_list,
                        len_list, contig_access_count, min_st_offset,
                        fd_size, fd_start, fd_end, buf_idx, error_code);

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */

    for (i=0; i<nprocs; i++) {
        if (others_req[i].count) {
            ADIOI_Free(others_req[i].offsets);
            ADIOI_Free(others_req[i].lens);
            ADIOI_Free(others_req[i].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Type_size(datatype, &size);
    bufsize = size * count;
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf
       during collective I/O. */
#endif

    fd->fp_sys_posn = -1;   /* set it to null. */
}
/*@
    MPI_File_read_shared - Read using shared file pointer

Input Parameters:
. fh - file handle (handle)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)
. status - status object (Status)

.N fortran
@*/
int MPI_File_read_shared(MPI_File fh, void *buf, int count,
                         MPI_Datatype datatype, MPI_Status *status)
{
    int error_code, buftype_is_contig, filetype_is_contig;
    static char myname[] = "MPI_FILE_READ_SHARED";
    MPI_Count datatype_size;
    ADIO_Offset off, shared_fp, incr, bufsize;
    ADIO_File adio_fh;
    void *xbuf=NULL, *e32_buf=NULL;

    ROMIO_THREAD_CS_ENTER();

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    /* zero-size request: report zero bytes and succeed */
    if (count*datatype_size == 0) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        error_code = MPI_SUCCESS;
        goto fn_exit;
    }

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_READABLE(adio_fh, myname, error_code);
    MPIO_CHECK_FS_SUPPORTS_SHARED(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig);

    /* complete any outstanding deferred-open work first */
    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    /* fetch the shared file pointer and advance it by this request's
       size in etypes */
    incr = (count*datatype_size)/adio_fh->etype_size;
    ADIO_Get_shared_fp(adio_fh, incr, &shared_fp, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    /* external32: read raw bytes into a scratch buffer, convert later */
    xbuf = buf;
    if (adio_fh->is_external32) {
        MPI_Aint e32_size = 0;
        error_code = MPIU_datatype_full_size(datatype, &e32_size);
        if (error_code != MPI_SUCCESS)
            goto fn_exit;

        e32_buf = ADIOI_Malloc(e32_size*count);
        xbuf = e32_buf;
    }

    /* contiguous or strided? */
    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and shared_fp to bytes */
        bufsize = datatype_size * count;
        off = adio_fh->disp + adio_fh->etype_size * shared_fp;

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request. On NFS,
           locking is done in the ADIO_ReadContig.*/

        if ((adio_fh->atomicity) && (adio_fh->file_system != ADIO_NFS))
            ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize);

        ADIO_ReadContig(adio_fh, xbuf, count, datatype, ADIO_EXPLICIT_OFFSET,
                        off, status, &error_code);

        if ((adio_fh->atomicity) && (adio_fh->file_system != ADIO_NFS))
            ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize);
    }
    else {
        ADIO_ReadStrided(adio_fh, xbuf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         shared_fp, status, &error_code);
        /* For strided and atomic mode, locking is done in ADIO_ReadStrided */
    }

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

    if (e32_buf != NULL) {
        error_code = MPIU_read_external32_conversion_fn(buf, datatype,
                                                        count, e32_buf);
        ADIOI_Free(e32_buf);
    }

fn_exit:
    ROMIO_THREAD_CS_EXIT();

    return error_code;
}
/* wrapper function for ADIO_WriteStrided and ADIO_ReadStrided. Used * by new 2 phase code to pass an arbitrary file type directly to * WriteStrided call without affecting existing code. For the new 2 * phase code, we really only need to set a custom_ftype, and we can * assume that this uses MPI_BYTE for the etype, and disp is 0 */ void ADIOI_IOFiletype(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, MPI_Datatype custom_ftype, int rdwr, ADIO_Status * status, int *error_code) { MPI_Datatype user_filetype; MPI_Datatype user_etype; ADIO_Offset user_disp; int user_ind_wr_buffer_size; int user_ind_rd_buffer_size; int f_is_contig, m_is_contig; int user_ds_read, user_ds_write; MPI_Aint f_extent; MPI_Count f_size; int f_ds_percent; /* size/extent */ #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5006, 0, NULL); else MPE_Log_event(5008, 0, NULL); #endif MPI_Type_extent(custom_ftype, &f_extent); MPI_Type_size_x(custom_ftype, &f_size); f_ds_percent = 100 * f_size / f_extent; /* temporarily store file view information */ user_filetype = fd->filetype; user_etype = fd->etype; user_disp = fd->disp; user_ds_read = fd->hints->ds_read; user_ds_write = fd->hints->ds_write; /* temporarily override the independent I/O datasieve buffer size */ user_ind_wr_buffer_size = fd->hints->ind_wr_buffer_size; user_ind_rd_buffer_size = fd->hints->ind_rd_buffer_size; /* set new values for temporary file view */ fd->filetype = custom_ftype; fd->etype = MPI_BYTE; /* set new values for independent I/O datasieve buffer size */ fd->hints->ind_wr_buffer_size = fd->hints->cb_buffer_size; fd->hints->ind_rd_buffer_size = fd->hints->cb_buffer_size; /* decide whether or not to do datasieving */ #ifdef DEBUG printf("f_ds_percent = %d cb_ds_threshold = %d\n", f_ds_percent, fd->hints->cb_ds_threshold); #endif if (f_ds_percent >= fd->hints->cb_ds_threshold) { fd->hints->ds_read = ADIOI_HINT_ENABLE; fd->hints->ds_write = ADIOI_HINT_ENABLE; } else 
{ fd->hints->ds_read = ADIOI_HINT_DISABLE; fd->hints->ds_write = ADIOI_HINT_DISABLE; } /* flatten the new filetype since the strided calls expect it to * have been flattened in set file view. in the two phase code, * the datatype passed down should always be MPI_BYTE, and * therefore contiguous, but just for completeness sake, we'll * check the memory datatype anyway */ ADIOI_Datatype_iscontig(custom_ftype, &f_is_contig); ADIOI_Datatype_iscontig(datatype, &m_is_contig); if (!f_is_contig) ADIOI_Flatten_datatype(custom_ftype); /* make appropriate Read/Write calls. Let ROMIO figure out file * system specific stuff. */ if (f_is_contig && m_is_contig) { fd->disp = 0; if (rdwr == ADIOI_READ) ADIO_ReadContig(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); else ADIO_WriteContig(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } else { fd->disp = offset; if (rdwr == ADIOI_READ) ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, 0, status, error_code); else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, 0, status, error_code); } /* restore the user specified file view to cover our tracks */ fd->filetype = user_filetype; fd->etype = user_etype; fd->disp = user_disp; fd->hints->ds_read = user_ds_read; fd->hints->ds_write = user_ds_write; fd->hints->ind_wr_buffer_size = user_ind_wr_buffer_size; fd->hints->ind_rd_buffer_size = user_ind_rd_buffer_size; #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5007, 0, NULL); else MPE_Log_event(5009, 0, NULL); #endif }
/* Avery Ching and Kenin Columa's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 *
 * Top-level driver for the persistent-file-realm two-phase collective.
 * Phases visible below:
 *   1. gather per-process [start,end] byte bounds and detect interleaving;
 *   2. if collective buffering is off/unneeded, fall through to independent
 *      contig/strided I/O and return;
 *   3. otherwise compute file realms, exchange file views, and loop:
 *      aggregators do file I/O through ADIOI_IOFiletype while data moves
 *      between clients and aggregators either by point-to-point
 *      (cb_alltoall disabled) or by MPI_Alltoallw;
 *   4. free datatypes/buffers and fill in status.
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];       /* this process's [start, end] bytes */
    ADIO_Offset *all_st_end_offsets = NULL;     /* gathered pairs, 2 per rank */
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;              /* cb_read or cb_write hint, depending on rdwr */
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;
    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */
    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;   /* bytes this aggregator moves per round */
    int *alltoallw_disps;
    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;
    char *cb_buf = NULL;        /* aggregator's collective buffer */
    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;   /* client perspective */
    ADIO_Offset *client_comm_sz_arr;    /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;       /* client perspective */
    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;
    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;  /* requests an aggregator posted to clients */
    int clients_agg_count = 0;  /* requests a client posted to aggregators */

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif
    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call.  Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * completing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *)
            ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2,
                      ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved?
             * (rank i starts before rank i-1 ends, and i's range is
             * non-empty) */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */

            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* Fallback path: collective buffering disabled, or AUTO with no
     * interleaving (and persistent file realms not forced) -> do plain
     * independent I/O and return early. */
    if ((cb_enable == ADIOI_HINT_DISABLE ||
         (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype,
                                    ADIO_EXPLICIT_OFFSET, off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype,
                                     ADIO_EXPLICIT_OFFSET, off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                    0, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                     0, status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                                 offset, status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                                  offset, status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) ||
        (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *)
            ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;

    /* every rank learns the aggregators' file views; aggregators also learn
     * every client's view */
    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count, datatype,
                          offset, my_mem_view_state_arr,
                          agg_file_view_state_arr, client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *) ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));

    /* first round of aggregator requests: what each client owes/gets */
    if (fd->is_agg) {
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr,
                             &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req. to an
         * appropriate section */
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank,
                                       (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif

    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* one allocation carries both count arrays: client counts in the first
     * half, aggregator counts in the second half */
    alltoallw_counts = client_alltoallw_counts = (int *)
        ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr, client_comm_sz_arr,
                                 &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts,
                      &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */

            /* NOTE(review): client_comm_requests is reallocated each loop
             * iteration but only freed in the clients_agg_count branches
             * below — if clients_agg_count stays 0 it appears to leak;
             * confirm against upstream. */
            client_comm_requests = (MPI_Request *)
                ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));

            for (i = 0; i < fd->hints->cb_nodes; i++) {
                /* NOTE(review): clients_agg_count is reset to 0 INSIDE this
                 * loop, so every posted request lands at index 0 and the
                 * count after the loop is at most 1.  Looks suspicious —
                 * verify against upstream ROMIO (reset before the loop?). */
                clients_agg_count = 0;
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));
#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif

            if (rdwr == ADIOI_READ) {
                /* read phase: aggregator reads file data into cb_buf, then
                 * ships it to the clients */
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    /* NOTE(review): early return on error leaves every
                     * buffer/datatype allocated above unfreed. */
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;
                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                                size, client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr,
                                         client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);

                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
                /* write phase: clients ship data to the aggregator's cb_buf,
                 * which is then written to the file */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif

                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = (MPI_Status *)
                        ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    /* NOTE(review): early return on error leaks allocations
                     * (see read case above). */
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr,
                                    agg_comm_dtype_arr);

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                /* distribute file data from aggregator buffers into the
                 * clients' memory buffers */
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr,
                              buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                /* gather clients' memory data into aggregator buffers */
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr,
                              cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr,
                                 &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req. to an
             * appropriate section */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr, client_comm_sz_arr,
                                     &agg_comm_requests, &aggs_client_count);
        }
        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts,
                          &aggregators_done);
    }

    /* Clean up */
    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            /* dead branch kept from the original; frees each realm type
             * individually */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }

    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }

    /* NOTE(review): agg_comm_dtype_arr entries with sz > 0 were already
     * freed at the bottom of the while loop — confirm this second pass
     * cannot double-free (presumably sz is 0 for all i once the loop
     * exits). */
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* set it to null. */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}