/*
 * MPIOI_File_iread - validate the arguments of a nonblocking read and
 * start it on the resolved ADIO file.
 *
 * fh            - MPI file handle, resolved through MPIO_File_resolve()
 * offset        - offset counted in etypes; it is checked for negativity
 *                 and converted to bytes only when file_ptr_type is
 *                 ADIO_EXPLICIT_OFFSET
 * file_ptr_type - ADIO_EXPLICIT_OFFSET, or another pointer type, in which
 *                 case the contiguous path starts at adio_fh->fp_ind
 * buf           - destination buffer
 * count         - number of datatype elements to read
 * datatype      - datatype of each buffer element
 * myname        - routine name handed to the error-reporting macros
 * request       - receives the request object for the operation
 *
 * Returns the error code left behind by the last check or ADIO call.
 */
int MPIOI_File_iread(MPI_File fh, MPI_Offset offset, int file_ptr_type,
                     void *buf, int count, MPI_Datatype datatype,
                     char *myname, MPI_Request *request)
{
    int error_code;
    int buftype_is_contig, filetype_is_contig;
    MPI_Count datatype_size;
    MPI_Offset nbytes = 0;
    ADIO_Offset byte_off, lock_len;
    ADIO_Status status;
    ADIO_File adio_fh;

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);

    if (offset < 0 && file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_READABLE(adio_fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig);

    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    /* Anything that is not contiguous on both sides goes to the strided
       routine, which receives the offset unconverted. */
    if (!buftype_is_contig || !filetype_is_contig) {
        ADIO_IreadStrided(adio_fh, buf, count, datatype, file_ptr_type,
                          offset, request, &error_code);
        goto fn_exit;
    }

    /* Contiguous case: express the length and the start position in bytes. */
    lock_len = datatype_size * count;
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET) {
        byte_off = adio_fh->fp_ind;
    } else {
        byte_off = adio_fh->disp + adio_fh->etype_size * offset;
    }

    if (!(adio_fh->atomicity)) {
        ADIO_IreadContig(adio_fh, buf, count, datatype, file_ptr_type,
                         byte_off, request, &error_code);
        goto fn_exit;
    }

    /* Atomic mode: to keep strict atomicity semantics with other concurrent
       operations, take an exclusive lock (when the file system supports
       locks), do a blocking read, and hand back an already completed
       request. */
    if (ADIO_Feature(adio_fh, ADIO_LOCKS)) {
        ADIOI_WRITE_LOCK(adio_fh, byte_off, SEEK_SET, lock_len);
    }

    ADIO_ReadContig(adio_fh, buf, count, datatype, file_ptr_type,
                    byte_off, &status, &error_code);

    if (ADIO_Feature(adio_fh, ADIO_LOCKS)) {
        ADIOI_UNLOCK(adio_fh, byte_off, SEEK_SET, lock_len);
    }

    /* nbytes stays 0 unless the blocking read succeeded */
    if (error_code == MPI_SUCCESS) {
        nbytes = count * datatype_size;
    }
    MPIO_Completed_request_create(&adio_fh, nbytes, &error_code, request);

fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);

    return error_code;
}
/*@
    MPI_File_iread_shared - Nonblocking read using shared file pointer

Input Parameters:
. fh - file handle (handle)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)
. request - request object (handle)

.N fortran
@*/
int MPI_File_iread_shared(MPI_File fh, void *buf, int count,
                          MPI_Datatype datatype, MPI_Request *request)
{
    int error_code, buftype_is_contig, filetype_is_contig;
    ADIO_Offset bufsize;
    ADIO_File adio_fh;
    static char myname[] = "MPI_FILE_IREAD_SHARED";
    MPI_Count datatype_size, incr;
    MPI_Status status;
    ADIO_Offset off, shared_fp;
    MPI_Offset nbytes = 0;

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_FS_SUPPORTS_SHARED(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig);

    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    /* The shared file pointer is advanced by the size of this access,
       expressed in etypes; the pointer value to read from comes back in
       shared_fp. */
    incr = (count * datatype_size) / adio_fh->etype_size;
    ADIO_Get_shared_fp(adio_fh, incr, &shared_fp, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        /* note: ADIO_Get_shared_fp should have set up error code already? */
        /* shared_fp is not usable after a failure, so report the error and
           leave instead of issuing a read at an indeterminate position. */
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    if (buftype_is_contig && filetype_is_contig) {
        /* convert count and shared_fp to bytes */
        bufsize = datatype_size * count;
        off = adio_fh->disp + adio_fh->etype_size * shared_fp;

        if (!(adio_fh->atomicity)) {
            ADIO_IreadContig(adio_fh, buf, count, datatype,
                             ADIO_EXPLICIT_OFFSET, off, request, &error_code);
        } else {
            /* to maintain strict atomicity semantics with other concurrent
               operations, lock (exclusive) and call blocking routine */
            if (adio_fh->file_system != ADIO_NFS) {
                ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize);
            }

            ADIO_ReadContig(adio_fh, buf, count, datatype,
                            ADIO_EXPLICIT_OFFSET, off, &status, &error_code);

            if (adio_fh->file_system != ADIO_NFS) {
                ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize);
            }

            /* nbytes stays 0 unless the blocking read succeeded */
            if (error_code == MPI_SUCCESS) {
                nbytes = count * datatype_size;
            }
            MPIO_Completed_request_create(&adio_fh, nbytes, &error_code, request);
        }
    } else {
        /* the strided routine takes the shared pointer value unconverted */
        ADIO_IreadStrided(adio_fh, buf, count, datatype,
                          ADIO_EXPLICIT_OFFSET, shared_fp, request, &error_code);
    }

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);

    return error_code;
}
/*
 * ADIOI_Iread_and_exch_l1_begin - start one iteration (index vars->m) of
 * the read-and-exchange loop of a nonblocking collective read.
 *
 * When vars->m has reached vars->ntimes the loop is over and control goes
 * to ADIOI_Iread_and_exch_reset().  Otherwise this routine works out which
 * of the off-len pairs in others_req[] fall inside the file window read in
 * this iteration, records per-process counts and send sizes, builds the
 * variable block for ADIOI_R_Iexchange_data(), and then either posts the
 * nonblocking contiguous read of the window (if some request falls in the
 * window) or proceeds straight to the data exchange.
 */
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIOI_R_Iexchange_data_vars *red_vars = NULL;
    ADIOI_Access *others_req;
    ADIOI_Access *req;
    ADIO_File fd;
    ADIO_Offset size, real_size, for_next_iter;
    ADIO_Offset real_off, req_off;
    ADIO_Offset window_end, avail;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    int nprocs, req_len, flag;
    int i, j;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;
    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* Read a buffer of size coll_bufsize (or less), then go through all of
       others_req and check whether any request is satisfied by the current
       read. */

    /* Since MPI guarantees that displacements in filetypes are in
       monotonically nondecreasing order, a pointer (curr_offlen_ptr) to the
       current off-len pair is kept for each process in others_req, and the
       scan continues only from there.  Filetypes such as the following are
       still a problem (1, 2, 3 are not process numbers; they just label
       three chunks of data specified by a filetype):

           1  -------!--
           2    -----!----
           3       --!-----

       where ! marks where the current read_size limit cuts through the
       filetype.  This is resolved by reading up to !, but filling the
       communication buffer only for 1.  The portion left over for 2 is
       copied into a tmp_buf for use in the next iteration, i.e. 2 and 3 are
       satisfied in the next iteration.  This simplifies filling in the
       user's buffer at the other end, because only one off-len pair with
       incomplete data is sent.  The individual offsets and lens need not be
       sent along with the data either, because the data travels in a
       particular order. */

    /* off       = start offset in the file of the data actually read in
                   this iteration
       size      = size of the data read, corresponding to off
       real_off  = off minus whatever data was retained in memory from the
                   previous iteration (cases 2 and 3 above)
       real_size = size plus the extra corresponding to real_off
       req_off   = offset in the file of a particular contiguous request,
                   minus what was satisfied in the previous iteration
       req_len   = length corresponding to req_off */

    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    /* first byte past the data held in read_buf for this iteration */
    window_end = real_off + real_size;

    for (i = 0; i < nprocs; i++) {
        send_size[i] = 0;
        count[i] = 0;
    }
    for_next_iter = 0;

    for (i = 0; i < nprocs; i++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, i, others_req[i].count);
#endif
        req = &others_req[i];
        if (!req->count)
            continue;

        start_pos[i] = curr_offlen_ptr[i];
        for (j = curr_offlen_ptr[i]; j < req->count; j++) {
            if (partial_send[i]) {
                /* this request may have been partially satisfied in the
                   previous iteration. */
                req_off = req->offsets[j] + partial_send[i];
                req_len = req->lens[j] - partial_send[i];
                partial_send[i] = 0;
                /* modify the off-len pair to reflect this change */
                req->offsets[j] = req_off;
                req->lens[j] = req_len;
            } else {
                req_off = req->offsets[j];
                req_len = req->lens[j];
            }

            /* requests starting at or beyond the window wait for a later
               iteration */
            if (req_off >= window_end)
                break;

            count[i]++;
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
            MPI_Address(read_buf + req_off - real_off, &(req->mem_ptrs[j]));
            ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));

            /* bytes of this request that are covered by the window */
            avail = window_end - req_off;
            send_size[i] += (int)(ADIOI_MIN(avail, (ADIO_Offset)(unsigned)req_len));

            if (avail < (ADIO_Offset)(unsigned)req_len) {
                partial_send[i] = (int)avail;
                if ((j + 1 < req->count) && (req->offsets[j + 1] < window_end)) {
                    /* this is the case illustrated in the figure above;
                       max because it must cover requests from different
                       processes */
                    for_next_iter = ADIOI_MAX(for_next_iter,
                                              window_end - req->offsets[j + 1]);
                }
                break;
            }
        }
        curr_offlen_ptr[i] = j;
    }
    vars->for_next_iter = for_next_iter;

    /* flag is set when at least one process has a request in this window */
    flag = 0;
    for (i = 0; i < nprocs; i++) {
        if (count[i]) {
            flag = 1;
            break;
        }
    }

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(1, sizeof(*red_vars));
    nbc_req->data.rd.red_vars = red_vars;

    /* file, ranks and iteration */
    red_vars->fd = vars->fd;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->iter = vars->m;

    /* user buffer description */
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;

    /* this process's own access list */
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->contig_access_count = vars->contig_access_count;

    /* per-process exchange bookkeeping */
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->others_req = vars->others_req;

    /* file domain layout */
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;

    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (!flag) {
        /* nothing to read in this window: go directly to the exchange */
        ADIOI_R_Iexchange_data(nbc_req, error_code);
        return;
    }

    /* The new data lands behind the for_curr_iter bytes kept from the
       previous iteration. */
    ADIOI_Assert(size == (int)size);
    ADIO_IreadContig(fd, read_buf + vars->for_curr_iter, (int)size,
                     MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                     &vars->req2, error_code);

    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
}
/*@
    MPI_File_iread_at - Nonblocking read using explicit offset

Input Parameters:
. fh - file handle (handle)
. offset - file offset (nonnegative integer)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)
. request - request object (handle)

.N fortran
@*/
int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void *buf,
                      int count, MPI_Datatype datatype,
                      MPIO_Request *request)
{
    int error_code, nbytes, buftype_is_contig, filetype_is_contig;
#ifndef PRINT_ERR_MSG
    static char myname[] = "MPI_FILE_IREAD_AT";
#endif
    int datatype_size;
    ADIO_Offset byte_off;
    ADIO_Status status;
#ifdef MPI_hpux
    int fl_xmpi;

    HPMP_IO_START(fl_xmpi, BLKMPIFILEIREADAT, TRDTSYSTEM, fh, datatype, count);
#endif /* MPI_hpux */

    /* Argument checks.  With PRINT_ERR_MSG defined each failure prints a
       message and calls MPI_Abort; otherwise an error code is built and
       returned through ADIOI_Error. */

#ifndef PRINT_ERR_MSG
    ADIOI_TEST_FILE_HANDLE(fh, myname);
#else
    if ((fh <= (MPI_File) 0) || (fh->cookie != ADIOI_FILE_COOKIE)) {
        FPRINTF(stderr, "MPI_File_iread_at: Invalid file handle\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
#endif

    if (offset < 0) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_OFFSET_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Invalid offset argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    if (count < 0) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_ARG, MPIR_ERR_COUNT_ARG,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Invalid count argument\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    if (datatype == MPI_DATATYPE_NULL) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_TYPE, MPIR_ERR_TYPE_NULL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Invalid datatype\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    MPI_Type_size(datatype, &datatype_size);

    /* the access must cover a whole number of etypes */
    if ((count*datatype_size) % fh->etype_size != 0) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_ETYPE_FRACTIONAL,
                                     myname, (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Only an integral number of etypes can be accessed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    if (fh->access_mode & MPI_MODE_WRONLY) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_MODE_WRONLY, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Can't read from a file opened with MPI_MODE_WRONLY\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    if (fh->access_mode & MPI_MODE_SEQUENTIAL) {
#ifndef PRINT_ERR_MSG
        error_code = MPIR_Err_setmsg(MPI_ERR_UNSUPPORTED_OPERATION,
                                     MPIR_ERR_AMODE_SEQ, myname,
                                     (char *) 0, (char *) 0);
        return ADIOI_Error(fh, error_code, myname);
#else
        FPRINTF(stderr, "MPI_File_iread_at: Can't use this function because file was opened with MPI_MODE_SEQUENTIAL\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
#endif
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    /* contiguous or strided? */
    if (!(buftype_is_contig && filetype_is_contig)) {
        /* the strided routine takes the offset unconverted */
        ADIO_IreadStrided(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                          offset, request, &error_code);
    } else {
        /* convert count and offset to bytes */
        nbytes = datatype_size * count;
        byte_off = fh->disp + fh->etype_size * offset;

        if (fh->atomicity) {
            /* to maintain strict atomicity semantics with other concurrent
               operations, lock (exclusive) and call blocking routine */

            *request = ADIOI_Malloc_request();
            (*request)->fd = fh;
            (*request)->optype = ADIOI_READ;
            (*request)->datatype = datatype;
            (*request)->handle = 0;
            (*request)->queued = 0;

            /* no locking on PIOFS, NFS and PVFS */
            if (!((fh->file_system == ADIO_PIOFS) ||
                  (fh->file_system == ADIO_NFS) ||
                  (fh->file_system == ADIO_PVFS))) {
                ADIOI_WRITE_LOCK(fh, byte_off, SEEK_SET, nbytes);
            }

            ADIO_ReadContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                            byte_off, &status, &error_code);

            if (!((fh->file_system == ADIO_PIOFS) ||
                  (fh->file_system == ADIO_NFS) ||
                  (fh->file_system == ADIO_PVFS))) {
                ADIOI_UNLOCK(fh, byte_off, SEEK_SET, nbytes);
            }

            fh->async_count++;
            /* status info. must be linked to the request structure, so that
               it can be accessed later from a wait */
        } else {
            ADIO_IreadContig(fh, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                             byte_off, request, &error_code);
        }
    }

#ifdef MPI_hpux
    HPMP_IO_END(fl_xmpi, fh, datatype, count);
#endif /* MPI_hpux */

    return error_code;
}
/*
 * ADIOI_GEN_IreadStridedColl_indio - choose between independent I/O and
 * aggregation for a nonblocking collective strided read.
 *
 * If the cb_read hint is ADIOI_HINT_DISABLE, or it is ADIOI_HINT_AUTO and
 * vars->interleave_count is zero, the read is issued as an independent
 * nonblocking read (contiguous or strided) tracked in vars->req_ind_io, and
 * the request state becomes ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO.
 * Otherwise the file domains and this process's requests are computed and
 * control passes to ADIOI_Icalc_others_req(), with
 * ADIOI_GEN_IreadStridedColl_read registered as the next step.
 */
static void ADIOI_GEN_IreadStridedColl_indio(ADIOI_NBC_Request *nbc_req,
                                             int *error_code)
{
    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
    ADIOI_Icalc_others_req_vars *cor_vars = NULL;
    ADIO_File fd = vars->fd;
    MPI_Datatype datatype = vars->datatype;
    int cb_read;
    int nprocs;

    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);

    cb_read = fd->hints->cb_read;

    if (cb_read == ADIOI_HINT_DISABLE ||
        (cb_read == ADIOI_HINT_AUTO && !vars->interleave_count)) {
        void *buf = vars->buf;
        int count = vars->count;
        int file_ptr_type = vars->file_ptr_type;
        ADIO_Offset offset = vars->offset;
        ADIO_Offset off;
        int filetype_is_contig;

        /* don't do aggregation */
        if (cb_read != ADIOI_HINT_DISABLE) {
            /* these lists are released here only when the hint is not
               ADIOI_HINT_DISABLE */
            ADIOI_Free(vars->offset_list);
            ADIOI_Free(vars->len_list);
            ADIOI_Free(vars->st_offsets);
            ADIOI_Free(vars->end_offsets);
        }

        /* put the individual file pointer back to the saved value */
        fd->fp_ind = vars->orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (!(vars->buftype_is_contig && filetype_is_contig)) {
            ADIO_IreadStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, &vars->req_ind_io, error_code);
        } else if (file_ptr_type != ADIO_EXPLICIT_OFFSET) {
            ADIO_IreadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                             0, &vars->req_ind_io, error_code);
        } else {
            /* explicit offset: convert from etypes to a byte position */
            off = fd->disp + (fd->etype_size) * offset;
            ADIO_IreadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                             off, &vars->req_ind_io, error_code);
        }

        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO;
        return;
    }

    nprocs = vars->nprocs;

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     */
    ADIOI_Calc_file_domains(vars->st_offsets, vars->end_offsets, nprocs,
                            vars->nprocs_for_coll, &vars->min_st_offset,
                            &vars->fd_start, &vars->fd_end,
                            fd->hints->min_fdomain_size, &vars->fd_size,
                            fd->hints->striping_unit);

    /* Calculate where the portions of the access requests of this process
     * are located in terms of the file domains.  This could be on the same
     * process or on other processes.  This function fills in:
     * count_my_req_procs - number of processes (including this one) for
     *     which this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self), indexed by rank
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for the contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, vars->offset_list, vars->len_list,
                      vars->contig_access_count, vars->min_st_offset,
                      vars->fd_start, vars->fd_end, vars->fd_size,
                      nprocs, &vars->count_my_req_procs,
                      &vars->count_my_req_per_proc, &vars->my_req,
                      &vars->buf_idx);

    /* Perform a collective communication in order to distribute the
     * data calculated above.  Fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i that lie in this process's file domain
     */
    cor_vars = (ADIOI_Icalc_others_req_vars *)ADIOI_Calloc(1, sizeof(*cor_vars));
    nbc_req->cor_vars = cor_vars;

    /* inputs */
    cor_vars->fd = vars->fd;
    cor_vars->nprocs = vars->nprocs;
    cor_vars->myrank = vars->myrank;
    cor_vars->my_req = vars->my_req;
    cor_vars->count_my_req_procs = vars->count_my_req_procs;
    cor_vars->count_my_req_per_proc = vars->count_my_req_per_proc;

    /* where the results are to be stored */
    cor_vars->count_others_req_procs_ptr = &vars->count_others_req_procs;
    cor_vars->others_req_ptr = &vars->others_req;

    cor_vars->next_fn = ADIOI_GEN_IreadStridedColl_read;

    ADIOI_Icalc_others_req(nbc_req, error_code);
}