/* Generic implementation of IwriteStrided calls the blocking WriteStrided
 * immediately. */
void ADIOI_FAKE_IwriteStrided(ADIO_File fd, const void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Request *request,
                              int *error_code)
{
    ADIO_Status status;
    int typesize;
    MPI_Offset nbytes = 0;

    /* Call the blocking function.  It will create an error code
     * if necessary. */
    ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                      offset, &status, error_code);

    if (*error_code == MPI_SUCCESS) {
        MPI_Type_size(datatype, &typesize);
        nbytes = (MPI_Offset) count * (MPI_Offset) typesize;
    }
    MPIO_Completed_request_create(&fd, nbytes, error_code, request);
}
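/* Illustrative sketch (not part of the ADIO driver): from the application's
 * side, the fallback above still honors the usual nonblocking contract with
 * the standard MPI_Request/MPI_Wait interface -- MPI_File_iwrite returns a
 * request that is simply already complete, so the matching wait finishes
 * immediately.  Assumes <mpi.h>; the function name is made up for
 * illustration. */
static void example_iwrite_then_wait(MPI_File fh, int *data, int n)
{
    MPI_Request req;
    MPI_Status st;

    /* On a driver that routes to ADIOI_FAKE_IwriteStrided, the data has
     * already been written by the time this call returns. */
    MPI_File_iwrite(fh, data, n, MPI_INT, &req);

    /* Completes immediately against the pre-completed request. */
    MPI_Wait(&req, &st);
}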
void ADIOI_GEN_WriteStridedColl(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* Uses a generalized version of the extended two-phase method described in "An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, (5)4:301--317, Winter 1996. http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_Access *my_req; /* array of nprocs access structures, one for each other process in * whose file domain this process's request lies */ ADIOI_Access *others_req; /* array of nprocs access structures, one for each other process * whose request lies in this process's file domain. */ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; int contig_access_count = 0, interleave_count = 0, buftype_is_contig; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, *fd_end = NULL, *end_offsets = NULL; MPI_Aint *buf_idx = NULL; ADIO_Offset *len_list = NULL; int old_error, tmp_error; if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { /* Cast away const'ness as the below function is used for read * and write */ ADIOI_IOStridedColl(fd, (char *) buf, count, ADIOI_WRITE, datatype, file_ptr_type, offset, status, error_code); return; } MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); /* the number of processes that actually perform I/O, nprocs_for_coll, * is stored in the hints off the ADIO_File structure */ nprocs_for_coll = fd->hints->cb_nodes; orig_fp = fd->fp_ind; /* only check for interleaving if cb_write isn't disabled */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and * lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count); /* each process communicates its start and end offsets to other * processes. The result is an array each of start and end offsets stored * in order of process rank. */ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * 2 * sizeof(ADIO_Offset)); end_offsets = st_offsets + nprocs; MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); /* are the accesses of different processes interleaved? */ for (i = 1; i < nprocs; i++) if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i])) interleave_count++; /* This is a rudimentary check for interleaving, but should suffice * for the moment. 
*/ } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); if (fd->hints->cb_write == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) { /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(st_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } /* Divide the I/O workload among "nprocs_for_coll" processes. This is done by (logically) dividing the file into file domains (FDs); each process may directly access only its own file domain. */ ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); /* calculate what portions of the access requests of this process are located in what file domains */ ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, fd_end, fd_size, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* based on everyone's my_req, calculate what requests of other processes lie in this process's file domain. count_others_req_procs = number of processes whose requests lie in this process's file domain (including this process itself) count_others_req_per_proc[i] indicates how many separate contiguous requests of proc. i lie in this process's file domain. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); ADIOI_Free(count_my_req_per_proc); ADIOI_Free(my_req[0].offsets); ADIOI_Free(my_req); /* exchange data and write in sizes of no more than coll_bufsize. */ /* Cast away const'ness for the below function */ ADIOI_Exch_and_write(fd, (char *) buf, datatype, nprocs, myrank, others_req, offset_list, len_list, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buf_idx, error_code); /* If this collective write is followed by an independent write, * it's possible to have those subsequent writes on other processes * race ahead and sneak in before the read-modify-write completes. * We carry out a collective communication at the end here so no one * can start independent i/o before collective I/O completes. 
* * need to do some gymnastics with the error codes so that if something * went wrong, all processes report error, but if a process has a more * specific error code, we can still have that process report the * additional information */ old_error = *error_code; if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; /* optimization: if only one process performing i/o, we can perform * a less-expensive Bcast */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); #endif if (fd->hints->cb_nodes == 1) MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); else { tmp_error = *error_code; MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm); } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); #endif #ifdef AGGREGATION_PROFILE MPE_Log_event(5012, 0, NULL); #endif if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) *error_code = old_error; /* free all memory allocated for collective I/O */ ADIOI_Free(others_req[0].offsets); ADIOI_Free(others_req[0].mem_ptrs); ADIOI_Free(others_req); ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(st_offsets); ADIOI_Free(fd_start); #ifdef HAVE_STATUS_SET_BYTES if (status) { MPI_Count bufsize, size; /* Don't set status if it isn't needed */ MPI_Type_size_x(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); } /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ #ifdef AGGREGATION_PROFILE MPE_Log_event(5013, 0, NULL); #endif }
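/* Simplified sketch of the file-domain idea used in ADIOI_GEN_WriteStridedColl
 * above (hedged: the real ADIOI_Calc_file_domains also honors
 * min_fdomain_size and striping_unit, which this ignores).  The aggregate
 * byte range [min_st_offset, max_end_offset] is cut into nprocs_for_coll
 * roughly equal, contiguous file domains; aggregator i may touch the file
 * only inside [fd_start[i], fd_end[i]]. */
static void example_even_file_domains(ADIO_Offset min_st_offset,
                                      ADIO_Offset max_end_offset,
                                      int nprocs_for_coll,
                                      ADIO_Offset *fd_start, /* out: nprocs_for_coll entries */
                                      ADIO_Offset *fd_end)   /* out: nprocs_for_coll entries */
{
    ADIO_Offset total = max_end_offset - min_st_offset + 1;
    ADIO_Offset fd_size = (total + nprocs_for_coll - 1) / nprocs_for_coll;
    int i;

    for (i = 0; i < nprocs_for_coll; i++) {
        fd_start[i] = min_st_offset + i * fd_size;
        fd_end[i] = fd_start[i] + fd_size - 1;
    }
    /* e.g., 1000 bytes over 4 aggregators: fd_size = 250, giving domains
     * [0,249], [250,499], [500,749], [750,999]. */
    if (fd_end[nprocs_for_coll - 1] > max_end_offset)
        fd_end[nprocs_for_coll - 1] = max_end_offset;
}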
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                                   MPI_Datatype datatype, int file_ptr_type,
                                   ADIO_Offset offset, ADIO_Status *status,
                                   int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
     * in "An Extended Two-Phase Method for Accessing Sections of
     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
     * Scientific Programming, (5)4:301--317, Winter 1996.
     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
     */

    ADIOI_Access *my_req;
    /* array of nprocs access structures, one for each other process,
     * describing the part of this process's request that falls to that
     * process */

    ADIOI_Access *others_req;
    /* array of nprocs access structures, one for each other process whose
     * request will be written by this process. */

    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset orig_fp, start_offset, end_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int **buf_idx = NULL, *striping_info = NULL;
    int old_error, tmp_error;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    orig_fp = fd->fp_ind;

    /* I/O pattern identification, done only if cb_write isn't disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
         * lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be
         * accessed.  e.g., if start_offset=0 and 100 bytes are to be
         * accessed, end_offset=99 */

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

        /* each process communicates its start and end offsets to the other
         * processes.  The result is one array of start offsets and one of
         * end offsets, stored in order of process rank. */
        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* are the accesses of different processes interleaved? */
        for (i = 1; i < nprocs; i++)
            if ((st_offsets[i] < end_offsets[i - 1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
        /* This is a rudimentary check for interleaving, but should suffice
         * for the moment. */

        /* Two typical access patterns can benefit from collective write.
         * 1) the processes are interleaved, and
         * 2) the request size is small.
*/ if (interleave_count > 0) { do_collect = 1; } else { do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, len_list, nprocs); } } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); /* Decide if collective I/O should be done */ if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) || fd->hints->cb_write == ADIOI_HINT_DISABLE) { /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else { ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } return; } /* Get Lustre hints information */ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); /* calculate what portions of the access requests of this process are * located in which process */ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, striping_info, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* based on everyone's my_req, calculate what requests of other processes * will be accessed by this process. * count_others_req_procs = number of processes whose requests (including * this process itself) will be accessed by this process * count_others_req_per_proc[i] indicates how many separate contiguous * requests of proc. i will be accessed by this process. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); ADIOI_Free(count_my_req_per_proc); /* exchange data and write in sizes of no more than stripe_size. */ ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank, others_req, my_req, offset_list, len_list, contig_access_count, striping_info, buf_idx, error_code); /* If this collective write is followed by an independent write, * it's possible to have those subsequent writes on other processes * race ahead and sneak in before the read-modify-write completes. * We carry out a collective communication at the end here so no one * can start independent i/o before collective I/O completes. 
* * need to do some gymnastics with the error codes so that if something * went wrong, all processes report error, but if a process has a more * specific error code, we can still have that process report the * additional information */ old_error = *error_code; if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; /* optimization: if only one process performing i/o, we can perform * a less-expensive Bcast */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); #endif if (fd->hints->cb_nodes == 1) MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); else { tmp_error = *error_code; MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm); } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); #endif if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) *error_code = old_error; if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); /* free all memory allocated for collective I/O */ /* free others_req */ for (i = 0; i < nprocs; i++) { if (others_req[i].count) { ADIOI_Free(others_req[i].offsets); ADIOI_Free(others_req[i].lens); ADIOI_Free(others_req[i].mem_ptrs); } } ADIOI_Free(others_req); /* free my_req here */ for (i = 0; i < nprocs; i++) { if (my_req[i].count) { ADIOI_Free(my_req[i].offsets); ADIOI_Free(my_req[i].lens); } } ADIOI_Free(my_req); for (i = 0; i < nprocs; i++) { ADIOI_Free(buf_idx[i]); } ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); ADIOI_Free(striping_info); #ifdef HAVE_STATUS_SET_BYTES if (status) { MPI_Count bufsize, size; /* Don't set status if it isn't needed */ MPI_Type_size_x(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); } /* This is a temporary way of filling in status. The right way is to * keep track of how much data was actually written during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ }
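/* Sketch of the error-code "gymnastics" used at the end of both collective
 * write paths above: every process must learn that *something* failed, while
 * a process that has a more specific error keeps it.  MPI error codes are
 * not meaningfully ordered, so the reduction is done on the coarse
 * MPI_ERR_IO value and the specific code is restored locally afterwards.
 * (The real code additionally short-circuits to a Bcast when only one
 * aggregator performs I/O.) */
static void example_collective_error_reduce(MPI_Comm comm, int *error_code)
{
    int old_error = *error_code;
    int tmp_error;

    /* collapse any local failure to a common value for the reduction */
    if (*error_code != MPI_SUCCESS)
        *error_code = MPI_ERR_IO;

    tmp_error = *error_code;
    /* MPI_SUCCESS compares below the error classes, so MPI_MAX propagates
     * "someone failed" to every process */
    MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, comm);

    /* keep the more specific local error, if there was one */
    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
        *error_code = old_error;
}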
/*@ MPI_File_write - Write using individual file pointer Input Parameters: . fh - file handle (handle) . buf - initial address of buffer (choice) . count - number of elements in buffer (nonnegative integer) . datatype - datatype of each buffer element (handle) Output Parameters: . status - status object (Status) .N fortran @*/ int MPI_File_write(MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status) { int error_code, bufsize, buftype_is_contig, filetype_is_contig; #ifndef PRINT_ERR_MSG static char myname[] = "MPI_FILE_WRITE"; #endif int datatype_size; ADIO_Offset off; #ifdef MPI_hpux int fl_xmpi; HPMP_IO_START(fl_xmpi, BLKMPIFILEWRITE, TRDTBLOCK, fh, datatype, count); #endif /* MPI_hpux */ #ifdef PRINT_ERR_MSG if ((fh <= (MPI_File) 0) || (fh->cookie != ADIOI_FILE_COOKIE)) { FPRINTF(stderr, "MPI_File_write: Invalid file handle\n"); MPI_Abort(MPI_COMM_WORLD, 1); } #else ADIOI_TEST_FILE_HANDLE(fh, myname); #endif if (count < 0) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "MPI_File_write: Invalid count argument\n"); MPI_Abort(MPI_COMM_WORLD, 1); #else error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_COUNT_ARG, myname, (char *) 0, (char *) 0); return ADIOI_Error(fh, error_code, myname); #endif } if (datatype == MPI_DATATYPE_NULL) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "MPI_File_write: Invalid datatype\n"); MPI_Abort(MPI_COMM_WORLD, 1); #else error_code = MPIR_Err_setmsg(MPI_ERR_TYPE, MPIR_ERR_TYPE_NULL, myname, (char *) 0, (char *) 0); return ADIOI_Error(fh, error_code, myname); #endif } MPI_Type_size(datatype, &datatype_size); if (count*datatype_size == 0) { #ifdef MPI_hpux HPMP_IO_END(fl_xmpi, fh, datatype, count); #endif /* MPI_hpux */ return MPI_SUCCESS; } if ((count*datatype_size) % fh->etype_size != 0) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "MPI_File_write: Only an integral number of etypes can be accessed\n"); MPI_Abort(MPI_COMM_WORLD, 1); #else error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_ETYPE_FRACTIONAL, myname, (char *) 0, (char *) 0); return ADIOI_Error(fh, error_code, myname); #endif } if (fh->access_mode & MPI_MODE_SEQUENTIAL) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "MPI_File_write: Can't use this function because file was opened with MPI_MODE_SEQUENTIAL\n"); MPI_Abort(MPI_COMM_WORLD, 1); #else error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION, MPIR_ERR_AMODE_SEQ, myname, (char *) 0, (char *) 0); return ADIOI_Error(fh, error_code, myname); #endif } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig); /* contiguous or strided? */ if (buftype_is_contig && filetype_is_contig) { bufsize = datatype_size * count; /* if atomic mode requested, lock (exclusive) the region, because there could be a concurrent noncontiguous request. 
Locking doesn't work on PIOFS and PVFS, and on NFS it is done in the ADIO_WriteContig.*/ off = fh->fp_ind; if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) && (fh->file_system != ADIO_PVFS) && (fh->file_system != ADIO_NFS)) ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize); ADIO_WriteContig(fh, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, &error_code); if ((fh->atomicity) && (fh->file_system != ADIO_PIOFS) && (fh->file_system != ADIO_PVFS) && (fh->file_system != ADIO_NFS)) ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize); } else ADIO_WriteStrided(fh, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, &error_code); /* For strided and atomic mode, locking is done in ADIO_WriteStrided */ #ifdef MPI_hpux HPMP_IO_END(fl_xmpi, fh, datatype, count); #endif /* MPI_hpux */ return error_code; }
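/* Illustrative application-level use of the routine above (assumes <mpi.h>;
 * error checking omitted for brevity; the function name is made up).  Each
 * rank positions its individual file pointer and writes its own disjoint
 * block. */
static void example_individual_write(MPI_Comm comm, const char *filename)
{
    MPI_File fh;
    MPI_Status status;
    int rank, i, data[100];

    MPI_Comm_rank(comm, &rank);
    /* fill the buffer with something recognizable */
    for (i = 0; i < 100; i++)
        data[i] = rank;

    MPI_File_open(comm, (char *) filename, MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);
    /* with the default file view, seek offsets are in bytes; each rank
     * writes 100 ints at a disjoint location */
    MPI_File_seek(fh, (MPI_Offset) rank * 100 * sizeof(int), MPI_SEEK_SET);
    MPI_File_write(fh, data, 100, MPI_INT, &status);
    MPI_File_close(&fh);
}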
/*@
    MPI_File_write_shared - Write using shared file pointer

Input Parameters:
. fh - file handle (handle)
. buf - initial address of buffer (choice)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. status - status object (Status)

.N fortran
@*/
int MPI_File_write_shared(MPI_File fh, void *buf, int count,
                          MPI_Datatype datatype, MPI_Status *status)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
#ifndef PRINT_ERR_MSG
    static char myname[] = "MPI_FILE_WRITE_SHARED";
#endif
    int datatype_size, incr;
    ADIO_Offset off, shared_fp;

#ifdef PRINT_ERR_MSG
    if ((fh <= (MPI_File) 0) || (fh->cookie != ADIOI_FILE_COOKIE)) {
        FPRINTF(stderr, "MPI_File_write_shared: Invalid file handle\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
#else
    ADIOI_TEST_FILE_HANDLE(fh, myname);
#endif

    if (count < 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_write_shared: Invalid count argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_COUNT_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    if (datatype == MPI_DATATYPE_NULL) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_write_shared: Invalid datatype\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_TYPE, MPIR_ERR_TYPE_NULL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    MPI_Type_size(datatype, &datatype_size);
    if (count*datatype_size == 0) return MPI_SUCCESS;

    if ((count*datatype_size) % fh->etype_size != 0) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_write_shared: Only an integral number of etypes can be accessed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_ETYPE_FRACTIONAL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    if ((fh->file_system == ADIO_PIOFS) || (fh->file_system == ADIO_PVFS)) {
#ifdef PRINT_ERR_MSG
        FPRINTF(stderr, "MPI_File_write_shared: Shared file pointer not supported on PIOFS and PVFS\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_NO_SHARED_FP, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#endif
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    incr = (count*datatype_size)/fh->etype_size;
    ADIO_Get_shared_fp(fh, incr, &shared_fp, &error_code);
    if (error_code != MPI_SUCCESS) {
        FPRINTF(stderr, "MPI_File_write_shared: Error! Could not access shared file pointer.\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* contiguous or strided? */
    if (buftype_is_contig && filetype_is_contig) {
        /* convert bufcount and shared_fp to bytes */
        bufsize = datatype_size * count;
        off = fh->disp + fh->etype_size * shared_fp;

        /* if atomic mode requested, lock (exclusive) the region, because
         * there could be a concurrent noncontiguous request.  On NFS,
         * locking is done in the ADIO_WriteContig. */
        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);

        ADIO_WriteContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         off, status, &error_code);

        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }
    else
        ADIO_WriteStrided(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                          shared_fp, status, &error_code);
    /* For strided and atomic mode, locking is done in ADIO_WriteStrided */

    return error_code;
}
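/* Illustrative use of the shared-file-pointer routine above (assumes <mpi.h>
 * and <stdio.h>; the function name is made up).  All ranks append records
 * through the one shared pointer; the relative order of the records is
 * whatever order ADIO_Get_shared_fp serializes the pointer updates in, i.e.
 * it is not guaranteed to follow rank order (MPI_File_write_ordered would
 * give that). */
static void example_shared_append(MPI_File fh, int rank)
{
    MPI_Status status;
    char record[64];
    int len;

    len = snprintf(record, sizeof(record), "rank %d was here\n", rank);
    /* each call reserves `len` etypes at the shared pointer (the etype is
     * MPI_BYTE under the default view) before the data is written */
    MPI_File_write_shared(fh, record, len, MPI_CHAR, &status);
}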
int MPIOI_File_write(MPI_File fh, MPI_Offset offset, int file_ptr_type, const void *buf, int count, MPI_Datatype datatype, char *myname, MPI_Status *status) { int error_code, bufsize, buftype_is_contig, filetype_is_contig; int datatype_size; ADIO_Offset off; ADIO_File adio_fh; void *e32buf=NULL; const void *xbuf=NULL; MPIU_THREAD_CS_ENTER(ALLFUNC,); adio_fh = MPIO_File_resolve(fh); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code); MPIO_CHECK_COUNT(adio_fh, count, myname, error_code); MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code); if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "**iobadoffset", 0); error_code = MPIO_Err_return_file(adio_fh, error_code); goto fn_exit; } /* --END ERROR HANDLING-- */ MPI_Type_size(datatype, &datatype_size); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code); /* --END ERROR HANDLING-- */ if (count*datatype_size == 0) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif error_code = MPI_SUCCESS; goto fn_exit; } /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code); MPIO_CHECK_WRITABLE(adio_fh, myname, error_code); MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code); /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig); ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code); xbuf = buf; if (adio_fh->is_external32) { error_code = MPIU_external32_buffer_setup(buf, count, datatype, &e32buf); if (error_code != MPI_SUCCESS) goto fn_exit; xbuf = e32buf; } if (buftype_is_contig && filetype_is_contig) { /* convert bufcount and offset to bytes */ bufsize = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = adio_fh->disp + adio_fh->etype_size * offset; } else /* ADIO_INDIVIDUAL */ { off = adio_fh->fp_ind; } /* if atomic mode requested, lock (exclusive) the region, because there could be a concurrent noncontiguous request. Locking doesn't work on PIOFS and PVFS, and on NFS it is done in the ADIO_WriteContig. */ if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) { ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize); } ADIO_WriteContig(adio_fh, xbuf, count, datatype, file_ptr_type, off, status, &error_code); if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) { ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize); } } else { /* For strided and atomic mode, locking is done in ADIO_WriteStrided */ ADIO_WriteStrided(adio_fh, xbuf, count, datatype, file_ptr_type, offset, status, &error_code); } /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) error_code = MPIO_Err_return_file(adio_fh, error_code); /* --END ERROR HANDLING-- */ fn_exit: if (e32buf!= NULL) ADIOI_Free(e32buf); MPIU_THREAD_CS_EXIT(ALLFUNC,); return error_code; }
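/* Worked illustration of the explicit-offset conversion performed above on
 * the contiguous fast path: the user-supplied offset is in etypes relative
 * to the file view and becomes an absolute byte offset via
 *     off = disp + etype_size * offset.
 * For example, with disp = 1024 bytes and etype = MPI_DOUBLE (etype_size
 * = 8), MPI_File_write_at(fh, 10, ...) lands at byte 1024 + 8 * 10 = 1104.
 * Hedged sketch, not the ROMIO routine itself. */
static MPI_Offset example_view_offset_to_bytes(MPI_Offset disp,
                                               int etype_size,
                                               MPI_Offset offset_in_etypes)
{
    return disp + (MPI_Offset) etype_size * offset_in_etypes;
}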
/*@
    MPI_File_write_shared - Write using shared file pointer

Input Parameters:
. fh - file handle (handle)
. buf - initial address of buffer (choice)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. status - status object (Status)

.N fortran
@*/
int MPI_File_write_shared(MPI_File mpi_fh, void *buf, int count,
                          MPI_Datatype datatype, MPI_Status *status)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
    static char myname[] = "MPI_FILE_WRITE_SHARED";
    int datatype_size, incr;
    ADIO_Offset off, shared_fp;
    ADIO_File fh;

    MPIU_THREAD_SINGLE_CS_ENTER("io");
    MPIR_Nest_incr();

    fh = MPIO_File_resolve(mpi_fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(fh, myname, error_code);
    MPIO_CHECK_COUNT(fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(fh, datatype, myname, error_code);
    /* --END ERROR HANDLING-- */

    MPI_Type_size(datatype, &datatype_size);

    if (count*datatype_size == 0) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        error_code = MPI_SUCCESS;
        goto fn_exit;
    }

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_FS_SUPPORTS_SHARED(fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    ADIOI_TEST_DEFERRED(fh, myname, &error_code);

    incr = (count*datatype_size)/fh->etype_size;
    ADIO_Get_shared_fp(fh, incr, &shared_fp, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                                          myname, __LINE__, MPI_ERR_INTERN,
                                          "**iosharedfailed", 0);
        error_code = MPIO_Err_return_file(fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    if (buftype_is_contig && filetype_is_contig) {
        /* convert bufcount and shared_fp to bytes */
        bufsize = datatype_size * count;
        off = fh->disp + fh->etype_size * shared_fp;

        /* if atomic mode requested, lock (exclusive) the region, because
         * there could be a concurrent noncontiguous request.  On NFS,
         * locking is done in the ADIO_WriteContig. */
        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);

        ADIO_WriteContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         off, status, &error_code);

        if ((fh->atomicity) && (fh->file_system != ADIO_NFS))
            ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }
    else {
        ADIO_WriteStrided(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                          shared_fp, status, &error_code);
        /* For strided and atomic mode, locking is done in ADIO_WriteStrided */
    }

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(fh, error_code);
    /* --END ERROR HANDLING-- */

fn_exit:
    MPIR_Nest_decr();
    MPIU_THREAD_SINGLE_CS_EXIT("io");

    return error_code;
}
/* wrapper function for ADIO_WriteStrided and ADIO_ReadStrided. Used * by new 2 phase code to pass an arbitrary file type directly to * WriteStrided call without affecting existing code. For the new 2 * phase code, we really only need to set a custom_ftype, and we can * assume that this uses MPI_BYTE for the etype, and disp is 0 */ void ADIOI_IOFiletype(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, MPI_Datatype custom_ftype, int rdwr, ADIO_Status * status, int *error_code) { MPI_Datatype user_filetype; MPI_Datatype user_etype; ADIO_Offset user_disp; int user_ind_wr_buffer_size; int user_ind_rd_buffer_size; int f_is_contig, m_is_contig; int user_ds_read, user_ds_write; MPI_Aint f_extent; MPI_Count f_size; int f_ds_percent; /* size/extent */ #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5006, 0, NULL); else MPE_Log_event(5008, 0, NULL); #endif MPI_Type_extent(custom_ftype, &f_extent); MPI_Type_size_x(custom_ftype, &f_size); f_ds_percent = 100 * f_size / f_extent; /* temporarily store file view information */ user_filetype = fd->filetype; user_etype = fd->etype; user_disp = fd->disp; user_ds_read = fd->hints->ds_read; user_ds_write = fd->hints->ds_write; /* temporarily override the independent I/O datasieve buffer size */ user_ind_wr_buffer_size = fd->hints->ind_wr_buffer_size; user_ind_rd_buffer_size = fd->hints->ind_rd_buffer_size; /* set new values for temporary file view */ fd->filetype = custom_ftype; fd->etype = MPI_BYTE; /* set new values for independent I/O datasieve buffer size */ fd->hints->ind_wr_buffer_size = fd->hints->cb_buffer_size; fd->hints->ind_rd_buffer_size = fd->hints->cb_buffer_size; /* decide whether or not to do datasieving */ #ifdef DEBUG printf("f_ds_percent = %d cb_ds_threshold = %d\n", f_ds_percent, fd->hints->cb_ds_threshold); #endif if (f_ds_percent >= fd->hints->cb_ds_threshold) { fd->hints->ds_read = ADIOI_HINT_ENABLE; fd->hints->ds_write = ADIOI_HINT_ENABLE; } else { fd->hints->ds_read = ADIOI_HINT_DISABLE; fd->hints->ds_write = ADIOI_HINT_DISABLE; } /* flatten the new filetype since the strided calls expect it to * have been flattened in set file view. in the two phase code, * the datatype passed down should always be MPI_BYTE, and * therefore contiguous, but just for completeness sake, we'll * check the memory datatype anyway */ ADIOI_Datatype_iscontig(custom_ftype, &f_is_contig); ADIOI_Datatype_iscontig(datatype, &m_is_contig); if (!f_is_contig) ADIOI_Flatten_datatype(custom_ftype); /* make appropriate Read/Write calls. Let ROMIO figure out file * system specific stuff. */ if (f_is_contig && m_is_contig) { fd->disp = 0; if (rdwr == ADIOI_READ) ADIO_ReadContig(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); else ADIO_WriteContig(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } else { fd->disp = offset; if (rdwr == ADIOI_READ) ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, 0, status, error_code); else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, 0, status, error_code); } /* restore the user specified file view to cover our tracks */ fd->filetype = user_filetype; fd->etype = user_etype; fd->disp = user_disp; fd->hints->ds_read = user_ds_read; fd->hints->ds_write = user_ds_write; fd->hints->ind_wr_buffer_size = user_ind_wr_buffer_size; fd->hints->ind_rd_buffer_size = user_ind_rd_buffer_size; #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5007, 0, NULL); else MPE_Log_event(5009, 0, NULL); #endif }
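/* Sketch of the density test used in ADIOI_IOFiletype above to decide
 * whether data sieving pays off: the "fullness" of the aggregated filetype
 * is its size as a percentage of its extent.  A sparse type (small size,
 * large extent) means the sieve buffer would mostly hold holes, so sieving
 * is disabled below cb_ds_threshold.  The vector type here is just an
 * assumed example. */
static int example_filetype_density_percent(void)
{
    MPI_Datatype vec;
    MPI_Count size;
    MPI_Aint lb, extent;
    int percent;

    /* 10 blocks of 2 ints, stride 8 ints: size = 80 bytes, extent = 296 bytes */
    MPI_Type_vector(10, 2, 8, MPI_INT, &vec);
    MPI_Type_commit(&vec);

    MPI_Type_size_x(vec, &size);
    MPI_Type_get_extent(vec, &lb, &extent);
    percent = (int) (100 * size / extent);      /* ~27% for this example */

    MPI_Type_free(&vec);
    return percent;     /* compare against fd->hints->cb_ds_threshold */
}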
/* Avery Ching and Kenin Columa's reworked two-phase algorithm. Key features * - persistent file domains * - an option to use alltoall instead of point-to-point */ void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { ADIO_Offset min_st_offset = 0, max_end_offset = 0; ADIO_Offset st_end_offset[2]; ADIO_Offset *all_st_end_offsets = NULL; int filetype_is_contig, buftype_is_contig, is_contig; ADIO_Offset off; int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll; int cb_enable; ADIO_Offset bufsize; MPI_Aint extent; #ifdef DEBUG2 MPI_Aint bufextent; #endif MPI_Count size; int agg_rank; ADIO_Offset agg_disp; /* aggregated file offset */ MPI_Datatype agg_dtype; /* aggregated file datatype */ int aggregators_done = 0; ADIO_Offset buffered_io_size = 0; int *alltoallw_disps; int *alltoallw_counts; int *client_alltoallw_counts; int *agg_alltoallw_counts; char *cb_buf = NULL; MPI_Datatype *client_comm_dtype_arr; /* aggregator perspective */ MPI_Datatype *agg_comm_dtype_arr; /* client perspective */ ADIO_Offset *client_comm_sz_arr; /* aggregator perspective */ ADIO_Offset *agg_comm_sz_arr; /* client perspective */ /* file views for each client and aggregator */ view_state *client_file_view_state_arr = NULL; view_state *agg_file_view_state_arr = NULL; /* mem views for local process */ view_state *my_mem_view_state_arr = NULL; MPI_Status *agg_comm_statuses = NULL; MPI_Request *agg_comm_requests = NULL; MPI_Status *client_comm_statuses = NULL; MPI_Request *client_comm_requests = NULL; int aggs_client_count = 0; int clients_agg_count = 0; MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); #ifdef DEBUG fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank); #endif #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5010, 0, NULL); else MPE_Log_event(5012, 0, NULL); #endif /* I need to check if there are any outstanding nonblocking writes * to the file, which could potentially interfere with the writes * taking place in this collective write call. Since this is not * likely to be common, let me do the simplest thing possible here: * Each process completes all pending nonblocking operations before * completing. */ nprocs_for_coll = fd->hints->cb_nodes; if (rdwr == ADIOI_READ) cb_enable = fd->hints->cb_read; else cb_enable = fd->hints->cb_write; /* only check for interleaving if cb_read isn't disabled */ if (cb_enable != ADIOI_HINT_DISABLE) { /* find the starting and ending byte of my I/O access */ ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset, &st_end_offset[0], &st_end_offset[1]); /* allocate an array of start/end pairs */ all_st_end_offsets = (ADIO_Offset *) ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset)); MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2, ADIO_OFFSET, fd->comm); min_st_offset = all_st_end_offsets[0]; max_end_offset = all_st_end_offsets[1]; for (i = 1; i < nprocs; i++) { /* are the accesses of different processes interleaved? */ if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) && (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1])) interleave_count++; /* This is a rudimentary check for interleaving, but should * suffice for the moment. 
*/ min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset); max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset); } } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if ((cb_enable == ADIOI_HINT_DISABLE || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO))) && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) { if (cb_enable != ADIOI_HINT_DISABLE) { ADIOI_Free(all_st_end_offsets); } if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; if (rdwr == ADIOI_READ) ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); else ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else { if (rdwr == ADIOI_READ) ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } } else { if (rdwr == ADIOI_READ) ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } return; } MPI_Type_extent(datatype, &extent); #ifdef DEBUG2 bufextent = extent * count; #endif MPI_Type_size_x(datatype, &size); bufsize = size * (MPI_Count) count; /* Calculate file realms */ if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL)) ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset); my_mem_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state)); agg_file_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state)); client_comm_sz_arr = (ADIO_Offset *) ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset)); if (fd->is_agg) { client_file_view_state_arr = (view_state *) ADIOI_Calloc(1, nprocs * sizeof(view_state)); } else { client_file_view_state_arr = NULL; } /* Alltoallw doesn't like a null array even if the counts are * zero. If you do not include this code, it will fail. */ client_comm_dtype_arr = (MPI_Datatype *) ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype)); if (!fd->is_agg) for (i = 0; i < nprocs; i++) client_comm_dtype_arr[i] = MPI_BYTE; ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count, datatype, offset, my_mem_view_state_arr, agg_file_view_state_arr, client_file_view_state_arr); agg_comm_sz_arr = (ADIO_Offset *) ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset)); agg_comm_dtype_arr = (MPI_Datatype *) ADIOI_Malloc(nprocs * sizeof(MPI_Datatype)); if (fd->is_agg) { ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr, client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype); buffered_io_size = 0; for (i = 0; i < nprocs; i++) { if (client_comm_sz_arr[i] > 0) buffered_io_size += client_comm_sz_arr[i]; } } #ifdef USE_PRE_REQ else { /* Example use of ADIOI_Build_client_pre_req. 
to an * appropriate section */ for (i = 0; i < fd->hints->cb_nodes; i++) { agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes]; #ifdef AGGREGATION_PROFILE MPE_Log_event(5040, 0, NULL); #endif ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes, &(my_mem_view_state_arr[agg_rank]), &(agg_file_view_state_arr[agg_rank]), 2 * 1024 * 1024, 64 * 1024); #ifdef AGGREGATION_PROFILE MPE_Log_event(5041, 0, NULL); #endif } } #endif if (fd->is_agg) cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size); alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int)); alltoallw_counts = client_alltoallw_counts = (int *) ADIOI_Calloc(2 * nprocs, sizeof(int)); agg_alltoallw_counts = &alltoallw_counts[nprocs]; if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) { /* aggregators pre-post all Irecv's for incoming data from clients */ if ((fd->is_agg) && (rdwr == ADIOI_WRITE)) post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf, client_comm_dtype_arr, client_comm_sz_arr, &agg_comm_requests, &aggs_client_count); } /* Aggregators send amounts for data requested to clients */ Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr, client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done); #ifdef DEBUG fprintf(stderr, "client_alltoallw_counts[ "); for (i = 0; i < nprocs; i++) { fprintf(stderr, "%d ", client_alltoallw_counts[i]); } fprintf(stderr, "]\n"); fprintf(stderr, "agg_alltoallw_counts[ "); for (i = 0; i < nprocs; i++) { fprintf(stderr, "%d ", agg_alltoallw_counts[i]); } fprintf(stderr, "]\n"); #endif /* keep looping while aggregators still have I/O to do */ while (aggregators_done != nprocs_for_coll) { if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) { /* clients should build datatypes for local memory locations * for data communication with aggregators and post * communication as the datatypes are built */ client_comm_requests = (MPI_Request *) ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request)); for (i = 0; i < fd->hints->cb_nodes; i++) { clients_agg_count = 0; agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes]; if (agg_comm_sz_arr[agg_rank] > 0) { ADIOI_Build_client_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes, &(my_mem_view_state_arr[agg_rank]), &(agg_file_view_state_arr[agg_rank]), agg_comm_sz_arr[agg_rank], &(agg_comm_dtype_arr[agg_rank])); #ifdef AGGREGATION_PROFILE if (i == 0) MPE_Log_event(5038, 0, NULL); #endif post_client_comm(fd, rdwr, agg_rank, buf, agg_comm_dtype_arr[agg_rank], agg_alltoallw_counts[agg_rank], &client_comm_requests[clients_agg_count]); clients_agg_count++; } } #ifdef AGGREGATION_PROFILE if (!clients_agg_count) MPE_Log_event(5039, 0, NULL); #endif if (rdwr == ADIOI_READ) { if (fd->is_agg && buffered_io_size) { ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype, ADIOI_READ, status, error_code); if (*error_code != MPI_SUCCESS) return; MPI_Type_free(&agg_dtype); } #ifdef DEBUG fprintf(stderr, "expecting from [agg](disp,size,cnt)="); for (i = 0; i < nprocs; i++) { MPI_Type_size_x(agg_comm_dtype_arr[i], &size); fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size, agg_alltoallw_counts[i]); if (i != nprocs - 1) fprintf(stderr, ","); } fprintf(stderr, "]\n"); if (fd->is_agg) { fprintf(stderr, "sending to [client](disp,size,cnt)="); for (i = 0; i < nprocs; i++) { if (fd->is_agg) MPI_Type_size_x(client_comm_dtype_arr[i], &size); else size = -1; fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size, client_alltoallw_counts[i]); if (i != nprocs 
- 1) fprintf(stderr, ","); } fprintf(stderr, "\n"); } fflush(NULL); #endif /* aggregators post all Isends for outgoing data to clients */ if (fd->is_agg) post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf, client_comm_dtype_arr, client_comm_sz_arr, &agg_comm_requests, &aggs_client_count); if (fd->is_agg && aggs_client_count) { #ifdef MPI_STATUSES_IGNORE agg_comm_statuses = MPI_STATUSES_IGNORE; #else agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status)); #endif MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses); #ifdef AGGREGATION_PROFILE MPE_Log_event(5033, 0, NULL); #endif ADIOI_Free(agg_comm_requests); #ifndef MPI_STATUSES_IGNORE ADIOI_Free(agg_comm_statuses); #endif } if (clients_agg_count) { #ifdef MPI_STATUSES_IGNORE client_comm_statuses = MPI_STATUSES_IGNORE; #else client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status)); #endif MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses); #ifdef AGGREGATION_PROFILE MPE_Log_event(5039, 0, NULL); #endif ADIOI_Free(client_comm_requests); #ifndef MPI_STATUSES_IGNORE ADIOI_Free(client_comm_statuses); #endif } #ifdef DEBUG2 fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size); if (fd->is_agg && buffered_io_size) { fprintf(stderr, "buf = ["); for (i = 0; i < bufextent; i++) fprintf(stderr, "%c", ((char *) buf)[i]); fprintf(stderr, "]\n"); fprintf(stderr, "cb_buf = ["); for (i = 0; i < buffered_io_size; i++) fprintf(stderr, "%c", cb_buf[i]); fprintf(stderr, "]\n"); fflush(NULL); } #endif } else { /* Write Case */ #ifdef DEBUG fprintf(stderr, "sending to [agg](disp,size,cnt)="); for (i = 0; i < nprocs; i++) { MPI_Type_size_x(agg_comm_dtype_arr[i], &size); fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size, agg_alltoallw_counts[i]); if (i != nprocs - 1) fprintf(stderr, ","); } fprintf(stderr, "]\n"); fprintf(stderr, "expecting from [client](disp,size,cnt)="); for (i = 0; i < nprocs; i++) { if (fd->is_agg) MPI_Type_size_x(client_comm_dtype_arr[i], &size); else size = -1; fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], size, client_alltoallw_counts[i]); if (i != nprocs - 1) fprintf(stderr, ","); } fprintf(stderr, "\n"); fflush(NULL); #endif #ifdef DEBUG fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size); #endif if (clients_agg_count) { #ifdef MPI_STATUSES_IGNORE client_comm_statuses = MPI_STATUSES_IGNORE; #else client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status)); #endif MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses); #ifdef AGGREGATION_PROFILE MPE_Log_event(5039, 0, NULL); #endif ADIOI_Free(client_comm_requests); #ifndef MPI_STATUSES_IGNORE ADIOI_Free(client_comm_statuses); #endif } #ifdef DEBUG2 if (bufextent) { fprintf(stderr, "buf = ["); for (i = 0; i < bufextent; i++) fprintf(stderr, "%c", ((char *) buf)[i]); fprintf(stderr, "]\n"); } #endif if (fd->is_agg && buffered_io_size) { ADIOI_Assert(aggs_client_count != 0); /* make sure we actually have the data to write out */ #ifdef MPI_STATUSES_IGNORE agg_comm_statuses = MPI_STATUSES_IGNORE; #else agg_comm_statuses = (MPI_Status *) ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status)); #endif MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses); #ifdef AGGREGATION_PROFILE MPE_Log_event(5033, 0, NULL); #endif ADIOI_Free(agg_comm_requests); #ifndef MPI_STATUSES_IGNORE ADIOI_Free(agg_comm_statuses); #endif #ifdef DEBUG2 fprintf(stderr, "cb_buf = ["); for (i = 0; i < buffered_io_size; i++) fprintf(stderr, "%c", cb_buf[i]); 
fprintf(stderr, "]\n"); fflush(NULL); #endif ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype, ADIOI_WRITE, status, error_code); if (*error_code != MPI_SUCCESS) return; MPI_Type_free(&agg_dtype); } } } else { /* Alltoallw version of everything */ ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr, agg_file_view_state_arr, agg_comm_sz_arr, agg_comm_dtype_arr); if (rdwr == ADIOI_READ) { if (fd->is_agg && buffered_io_size) { ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype, ADIOI_READ, status, error_code); if (*error_code != MPI_SUCCESS) return; MPI_Type_free(&agg_dtype); } #ifdef AGGREGATION_PROFILE MPE_Log_event(5032, 0, NULL); #endif MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps, client_comm_dtype_arr, buf, agg_alltoallw_counts, alltoallw_disps, agg_comm_dtype_arr, fd->comm); #ifdef AGGREGATION_PROFILE MPE_Log_event(5033, 0, NULL); #endif } else { /* Write Case */ #ifdef AGGREGATION_PROFILE MPE_Log_event(5032, 0, NULL); #endif MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps, agg_comm_dtype_arr, cb_buf, client_alltoallw_counts, alltoallw_disps, client_comm_dtype_arr, fd->comm); #ifdef AGGREGATION_PROFILE MPE_Log_event(5033, 0, NULL); #endif if (fd->is_agg && buffered_io_size) { ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype, ADIOI_WRITE, status, error_code); if (*error_code != MPI_SUCCESS) return; MPI_Type_free(&agg_dtype); } } } /* Free (uncommit) datatypes for reuse */ if (fd->is_agg) { if (buffered_io_size > 0) { for (i = 0; i < nprocs; i++) { if (client_comm_sz_arr[i] > 0) MPI_Type_free(&client_comm_dtype_arr[i]); } } } for (i = 0; i < nprocs; i++) { if (agg_comm_sz_arr[i] > 0) MPI_Type_free(&agg_comm_dtype_arr[i]); } /* figure out next set up requests */ if (fd->is_agg) { ADIOI_Build_agg_reqs(fd, rdwr, nprocs, client_file_view_state_arr, client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype); buffered_io_size = 0; for (i = 0; i < nprocs; i++) { if (client_comm_sz_arr[i] > 0) buffered_io_size += client_comm_sz_arr[i]; } } #ifdef USE_PRE_REQ else { /* Example use of ADIOI_Build_client_pre_req. to an * appropriate section */ for (i = 0; i < fd->hints->cb_nodes; i++) { agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes]; #ifdef AGGREGATION_PROFILE MPE_Log_event(5040, 0, NULL); #endif ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes, &(my_mem_view_state_arr[agg_rank]), &(agg_file_view_state_arr[agg_rank]), 2 * 1024 * 1024, 64 * 1024); #ifdef AGGREGATION_PROFILE MPE_Log_event(5041, 0, NULL); #endif } } #endif /* aggregators pre-post all Irecv's for incoming data from * clients. 
if nothing is needed, agg_comm_requests is not * allocated */ if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) { if ((fd->is_agg) && (rdwr == ADIOI_WRITE)) post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf, client_comm_dtype_arr, client_comm_sz_arr, &agg_comm_requests, &aggs_client_count); } /* Aggregators send amounts for data requested to clients */ Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr, client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done); } /* Clean up */ if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) { /* AAR, FSIZE, and User provided uniform File realms */ if (1) { MPI_Type_free(&fd->file_realm_types[0]); } else { for (i = 0; i < fd->hints->cb_nodes; i++) { ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig); MPI_Type_free(&fd->file_realm_types[i]); } } ADIOI_Free(fd->file_realm_types); ADIOI_Free(fd->file_realm_st_offs); } if (fd->is_agg) { if (buffered_io_size > 0) MPI_Type_free(&agg_dtype); for (i = 0; i < nprocs; i++) { MPI_Type_free(&client_comm_dtype_arr[i]); ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices); ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens); ADIOI_Free(client_file_view_state_arr[i].flat_type_p); } ADIOI_Free(client_file_view_state_arr); ADIOI_Free(cb_buf); } for (i = 0; i < nprocs; i++) if (agg_comm_sz_arr[i] > 0) MPI_Type_free(&agg_comm_dtype_arr[i]); ADIOI_Free(client_comm_sz_arr); ADIOI_Free(client_comm_dtype_arr); ADIOI_Free(my_mem_view_state_arr); ADIOI_Free(agg_file_view_state_arr); ADIOI_Free(agg_comm_sz_arr); ADIOI_Free(agg_comm_dtype_arr); ADIOI_Free(alltoallw_disps); ADIOI_Free(alltoallw_counts); ADIOI_Free(all_st_end_offsets); #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is * to keep track of how much data was actually read and placed in * buf during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ #ifdef AGGREGATION_PROFILE if (rdwr == ADIOI_READ) MPE_Log_event(5011, 0, NULL); else MPE_Log_event(5013, 0, NULL); #endif }
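/* Sketch of the rudimentary interleave test applied near the top of
 * ADIOI_IOStridedColl: with the gathered (start,end) pairs packed as
 * all_st_end_offsets[2*i] / all_st_end_offsets[2*i+1], process i is counted
 * as interleaved when its (non-empty) range starts before the previous
 * process's range ends.  For ranges (0,99), (50,149), (200,299) this returns
 * 1: only the second range overlaps its predecessor. */
static int example_count_interleaving(const ADIO_Offset *all_st_end_offsets,
                                      int nprocs)
{
    int i, interleave_count = 0;

    for (i = 1; i < nprocs; i++) {
        if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
            (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
            interleave_count++;
    }
    return interleave_count;
}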