/* this used to be implemented in every file system as an fcntl, but the code
 * is identical for all file systems without a real "preallocate" system call.
 * This naive approach will get the job done, but not in a terribly efficient
 * manner.
 *
 * ADIOI_GEN_Prealloc - force the first `diskspace` bytes of the file to be
 * backed by storage by explicitly writing them.
 *
 *   fd          open file; fd->fp_ind is used as the current file size
 *   diskspace   number of bytes, counted from offset 0, that must be allocated
 *   error_code  out: MPI_SUCCESS on success.  If a read fails it is replaced by
 *               an MPI_ERR_IO code ("**iopreallocrdwr"); if a write fails the
 *               code produced by ADIO_WriteContig is left in place.
 *
 * The scratch buffer is released on every exit path, including the error
 * returns.
 */
void ADIOI_GEN_Prealloc(ADIO_File fd, ADIO_Offset diskspace, int *error_code)
{
    ADIO_Offset curr_fsize, alloc_size, size, len, done;
    ADIO_Status status;
    char *buf;
    static char myname[] = "ADIOI_GEN_PREALLOC";

    /* will be called by one process only */
    /* On file systems with no preallocation function, we have to
       explicitly write to allocate space. Since there could be holes in
       the file, we need to read up to the current file size, write it
       back, and then write beyond that depending on how much
       preallocation is needed.
       read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */

    curr_fsize = fd->fp_ind;
    alloc_size = diskspace;

    /* portion of the requested range that already exists in the file */
    size = ADIOI_MIN(curr_fsize, alloc_size);

    buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
    done = 0;

    /* Rewrite existing data in place so that any holes get real blocks.
     * Looping on the byte count (rather than an int iteration count) keeps
     * the arithmetic in ADIO_Offset, so very large files cannot truncate it. */
    while (done < size) {
        len = ADIOI_MIN(size - done, ADIOI_PREALLOC_BUFSZ);
        ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
                        &status, error_code);
        if (*error_code != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**iopreallocrdwr", 0);
            goto fn_exit;
        }
        ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                         done, &status, error_code);
        if (*error_code != MPI_SUCCESS)
            goto fn_exit;
        done += len;
    }

    /* Extend the file with zeros up to the requested size. */
    if (alloc_size > curr_fsize) {
        memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
        while (done < alloc_size) {
            len = ADIOI_MIN(alloc_size - done, ADIOI_PREALLOC_BUFSZ);
            ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             done, &status, error_code);
            if (*error_code != MPI_SUCCESS)
                goto fn_exit;
            done += len;
        }
    }

    *error_code = MPI_SUCCESS;

  fn_exit:
    ADIOI_Free(buf);
}
/* Returns MPI_SUCCESS on success, an MPI error code on failure.  Code above
 * needs to call MPIO_Err_return_xxx.
 *
 * MPIOI_Type_block - build the datatype describing this process's share of
 * one dimension of a block-distributed array.
 *
 *   array_of_gsizes  global array size in each dimension
 *   dim              dimension being distributed
 *   ndims            number of array dimensions
 *   nprocs           no. of processes in dimension dim of grid
 *   rank             coordinate of this process in dimension dim
 *   darg             block size, or MPI_DISTRIBUTE_DFLT_DARG for
 *                    ceil(global_size / nprocs)
 *   order            MPI_ORDER_FORTRAN; any other value is handled as C order
 *   orig_extent      extent used as the base of the stride computation
 *   type_old         element datatype for this dimension
 *   type_new         out: the constructed datatype
 *   st_offset        out: starting element index of this process's block in
 *                    this dimension (0 when the process owns no elements)
 *
 * Returns MPI_ERR_ARG when an explicit darg is <= 0 or too small to cover the
 * dimension with nprocs blocks.
 */
int MPIOI_Type_block(int *array_of_gsizes, int dim, int ndims, int nprocs,
                     int rank, int darg, int order, MPI_Aint orig_extent,
                     MPI_Datatype type_old, MPI_Datatype *type_new,
                     MPI_Aint *st_offset)
{
    int blksize, global_size, mysize, i;
    MPI_Aint stride, remaining;

    global_size = array_of_gsizes[dim];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG)
        blksize = (global_size + nprocs - 1) / nprocs;
    else {
        blksize = darg;

        /* --BEGIN ERROR HANDLING-- */
        if (blksize <= 0) {
            return MPI_ERR_ARG;
        }

        /* widen before multiplying so a large darg cannot wrap in int */
        if ((MPI_Aint) blksize * nprocs < global_size) {
            return MPI_ERR_ARG;
        }
        /* --END ERROR HANDLING-- */
    }

    /* Elements left for this process: a full block, a partial trailing block,
     * or nothing.  Computed in MPI_Aint so blksize*rank cannot overflow int;
     * the clamp to zero happens before narrowing back to int. */
    remaining = (MPI_Aint) global_size - (MPI_Aint) blksize * rank;
    remaining = ADIOI_MIN(remaining, (MPI_Aint) blksize);
    mysize = (remaining < 0) ? 0 : (int) remaining;

    stride = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        if (dim == 0)
            MPI_Type_contiguous(mysize, type_old, type_new);
        else {
            /* stride spans all faster-varying (lower) dimensions */
            for (i = 0; i < dim; i++)
                stride *= array_of_gsizes[i];
            MPI_Type_hvector(mysize, 1, stride, type_old, type_new);
        }
    } else {
        if (dim == ndims - 1)
            MPI_Type_contiguous(mysize, type_old, type_new);
        else {
            /* stride spans all faster-varying (higher) dimensions */
            for (i = ndims - 1; i > dim; i--)
                stride *= array_of_gsizes[i];
            MPI_Type_hvector(mysize, 1, stride, type_old, type_new);
        }
    }

    /* in terms of no. of elements of type oldtype in this dimension */
    *st_offset = (MPI_Aint) blksize * (MPI_Aint) rank;
    if (mysize == 0)
        *st_offset = 0;

    return MPI_SUCCESS;
}
/*
 * ADIOI_LUSTRE_Directio - transfer `len` bytes at `offset` on a file opened
 * for direct I/O.
 *
 *   fd      file; fd->d_miniosz and fd->d_mem are used as the required file
 *           offset and memory alignments, fd->fd_sys as the descriptor
 *   buf     user buffer (written to when rw == 0, hence the casts)
 *   len     number of bytes to transfer
 *   offset  starting file offset
 *   rw      nonzero = write, zero = read
 *
 * If `offset` is not a multiple of fd->d_miniosz, the unaligned leading part
 * is transferred with plain pwrite()/pread().  The remainder goes through the
 * aligned helpers, via a memalign()ed bounce buffer when `buf` itself is not
 * aligned to fd->d_mem; if that buffer cannot be allocated, plain
 * pwrite()/pread() is used instead.
 *
 * Returns the number of bytes transferred, or a negative value if any stage
 * reported a failure.
 *
 * NOTE(review): the bounce buffer comes from memalign() but is released with
 * ADIOI_Free(), as in the original code - confirm ADIOI_Free is compatible
 * with memalign() in every build configuration.
 */
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
                                 off_t offset, int rw)
{
    /* diff starts at 0 so nothing is read uninitialized when offset is
     * already aligned */
    int err = -1, diff = 0, size = len, nbytes = 0;
    void *newbuf;

    if (offset % fd->d_miniosz) {
        /* leading part up to the next aligned file offset */
        diff = fd->d_miniosz - (offset % fd->d_miniosz);
        diff = ADIOI_MIN(diff, len);
        if (rw)
            nbytes = pwrite(fd->fd_sys, (void *) buf, diff, offset);
        else
            nbytes = pread(fd->fd_sys, (void *) buf, diff, offset);
        if (nbytes < 0)
            return nbytes;      /* do not fold a failure into the byte count */
        buf = ((char *) buf) + diff;
        offset += diff;
        size = len - diff;
    }

    if (!size) {
        /* everything (possibly nothing) was handled above: report what was
         * actually transferred, not what was requested */
        return nbytes;
    }

    if (rw) {   /* direct I/O enabled */
        if (!(((long) buf) % fd->d_mem)) {
            ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, buf, size, offset, &err);
        } else {
            newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
            if (newbuf) {
                memcpy(newbuf, buf, size);
                ADIOI_LUSTRE_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
                ADIOI_Free(newbuf);
            } else {
                /* no bounce buffer available: fall back to a plain write */
                err = pwrite(fd->fd_sys, buf, size, offset);
            }
        }
    } else {
        if (!(((long) buf) % fd->d_mem)) {
            ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, buf, size, offset, &err);
        } else {
            newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
            if (newbuf) {
                ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
                if (err > 0)
                    memcpy((void *) buf, newbuf, err);
                ADIOI_Free(newbuf);
            } else {
                /* no bounce buffer available: fall back to a plain read */
                err = pread(fd->fd_sys, (void *) buf, size, offset);
            }
        }
    }

    /* a failed second stage must surface as an error instead of being
     * subtracted from the prefix byte count */
    if (err < 0)
        return err;

    return nbytes + err;
}
/*
 * ADIOI_Fill_send_buffer - walk this process's list of contiguous file
 * accesses (offset_list/len_list), determine for each piece which process p
 * is responsible for it (ADIOI_Calc_aggregator), and account for / stage the
 * corresponding user-buffer data for p.  When send_buf_idx[p] reaches
 * send_size[p], an MPI_Isend of send_buf[p] (send_size[p] bytes, tag
 * myrank+p+100*iter, communicator fd->comm) is posted into requests[jj].
 *
 * On entry send_buf_idx[] and curr_to_proc[] are reset to 0 and
 * done_to_proc[] is loaded from sent_to_proc[].
 *
 * ADIOI_BUF_INCR and ADIOI_BUF_COPY are macros defined outside this function;
 * presumably they advance through / copy out of the flattened user buffer
 * using buf_incr, size, user_buf_idx, flat_buf_idx, flat_buf_sz and
 * n_buftypes - TODO confirm against the macro definitions.
 */
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                           *flat_buf, char **send_buf, ADIO_Offset
                           *offset_list, ADIO_Offset *len_list, int *send_size,
                           MPI_Request *requests, int *sent_to_proc,
                           int nprocs, int myrank,
                           int contig_access_count,
                           ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                           ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                           int *send_buf_idx, int *curr_to_proc,
                           int *done_to_proc, int iter,
                           MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */

    int i, p, flat_buf_idx;
    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
    int jj, n_buftypes;
    ADIO_Offset off, len, rem_len, user_buf_idx;

/*  curr_to_proc[p] = amount of data sent to proc. p that has already
                      been accounted for so far
    done_to_proc[p] = amount of data already sent to proc. p in
                      previous iterations
    user_buf_idx = current location in user buffer
    send_buf_idx[p] = current location in send_buf of proc. p  */

    for (i=0; i < nprocs; i++) {
        send_buf_idx[i] = curr_to_proc[i] = 0;
        done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;     /* index of the next free slot in requests[] */

    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];

    /* flat_buf_idx = current index into flattened buftype
       flat_buf_sz = size of current contiguous component in
       flattened buf */

    for (i=0; i<contig_access_count; i++) {
        off = offset_list[i];
        rem_len = len_list[i];

        /*this request may span the file domains of more than one process*/
        while (rem_len != 0) {
            len = rem_len;
            /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
             * longer than the single region that processor "p" is responsible
             * for.
             */
            p = ADIOI_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);

            /* only do work while p's send buffer for this iteration is not
               yet full */
            if (send_buf_idx[p] < send_size[p]) {
                if (curr_to_proc[p]+len > done_to_proc[p]) {
                    /* at least part of this piece was not sent in earlier
                       iterations */
                    if (done_to_proc[p] > curr_to_proc[p]) {
                        /* the front of this piece was already sent: skip that
                           part, then copy what still fits */
                        size = ADIOI_MIN(curr_to_proc[p] + len - done_to_proc[p], send_size[p]-send_buf_idx[p]);
                        buf_incr = done_to_proc[p] - curr_to_proc[p];
                        ADIOI_BUF_INCR
                        ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) == (unsigned)(curr_to_proc[p] + len - done_to_proc[p]));
                        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
                        ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size));
                        /* ok to cast: bounded by cb buffer size */
                        curr_to_proc[p] = done_to_proc[p] + (int)size;
                        ADIOI_BUF_COPY
                    } else {
                        /* nothing of this piece was sent before */
                        size = ADIOI_MIN(len,send_size[p]-send_buf_idx[p]);
                        buf_incr = len;
                        ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size));
                        curr_to_proc[p] += size;
                        ADIOI_BUF_COPY
                    }
                    /* buffer for p is complete: post the send */
                    if (send_buf_idx[p] == send_size[p]) {
                        MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, myrank+p+100*iter, fd->comm, requests+jj);
                        jj++;
                    }
                } else {
                    /* whole piece was sent in an earlier iteration: only
                       account for it */
                    ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len));
                    curr_to_proc[p] += len;
                    buf_incr = len;
                    ADIOI_BUF_INCR
                }
            }
/*
 * ADIOI_Calc_my_off_len - for this process's request, build the list of
 * contiguous (file offset, length) pieces it touches and report the first
 * and last byte offsets accessed.
 *
 *   fd                       file; fd->filetype, fd->etype_size, fd->disp and
 *                            fd->fp_ind are read, fd->fp_ind may be updated
 *   bufcount, datatype       describe the amount of user data
 *                            (bufcount * size of datatype bytes)
 *   file_ptr_type            ADIO_INDIVIDUAL uses fd->fp_ind as the start;
 *                            otherwise `offset` (in etypes) is used
 *   offset                   explicit offset in units of etype
 *   offset_list_ptr          out: array of absolute byte offsets
 *   len_list_ptr             out: array of lengths in bytes
 *   start_offset_ptr         out: first byte offset accessed
 *   end_offset_ptr           out: last byte offset accessed
 *   contig_access_count_ptr  out: number of valid entries in the two lists
 *
 * Both lists are allocated here with ADIOI_Malloc (always at least one entry
 * more than needed); presumably the caller releases them - TODO confirm.
 * For ADIO_INDIVIDUAL accesses with a nonzero-size filetype, fd->fp_ind is
 * advanced past the data described.
 */
void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
			    datatype, int file_ptr_type, ADIO_Offset
			    offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
			    **len_list_ptr, ADIO_Offset *start_offset_ptr,
			    ADIO_Offset *end_offset_ptr, int
			   *contig_access_count_ptr)
{
    MPI_Count filetype_size, etype_size;
    MPI_Count buftype_size;
    int i, j, k;
    ADIO_Offset i_offset;
    ADIO_Offset frd_size=0, old_frd_size=0;
    int st_index=0;
    ADIO_Offset n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    ADIO_Offset bufsize;
    ADIO_Offset sum, n_etypes_in_filetype, size_in_filetype;
    int contig_access_count, filetype_is_contig;
    ADIO_Offset *len_list;
    MPI_Aint filetype_extent, filetype_lb;
    ADIOI_Flatlist_node *flat_file;
    ADIO_Offset *offset_list, off, end_offset=0, disp;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5028, 0, NULL);
#endif

/* For this process's request, calculate the list of offsets and
   lengths in the file and determine the start and end offsets. */

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_lb(fd->filetype, &filetype_lb);
    MPI_Type_size_x(datatype, &buftype_size);
    etype_size = fd->etype_size;

    /* Zero-size filetype: nothing can be accessed.  Report an empty request
       (count 0, length 0, end offset one before the start). */
    if ( ! filetype_size ) {
	*contig_access_count_ptr = 0;
	*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset;
	len_list[0] = 0;
	*start_offset_ptr = offset_list[0];
	*end_offset_ptr = offset_list[0] + len_list[0] - 1;

	return;
    }

    /* Contiguous filetype: the whole request is a single piece. */
    if (filetype_is_contig) {
	*contig_access_count_ptr = 1;
	*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset;
	len_list[0] = (ADIO_Offset)bufcount * (ADIO_Offset)buftype_size;
	*start_offset_ptr = offset_list[0];
	*end_offset_ptr = offset_list[0] + len_list[0] - 1;

	/* update file pointer */
	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1;
    }

    else {

       /* First calculate what size of offset_list and len_list to allocate */

       /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

#ifdef RDCOLL_DEBUG
        {
            int ii;
            DBG_FPRINTF(stderr, "flattened %3lld : ", flat_file->count );
            for (ii=0; ii<flat_file->count; ii++) {
                DBG_FPRINTF(stderr, "%16lld:%-16lld", flat_file->indices[ii], flat_file->blocklens[ii] );
            }
            DBG_FPRINTF(stderr, "\n" );
        }
#endif
	if (file_ptr_type == ADIO_INDIVIDUAL) {
           /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
                if (dist == 0) {
                    /* offset sits exactly at the end of block i: start at the
                       following entry instead.
                       NOTE(review): i+1 is used as an index without checking
                       it against flat_file->count - confirm the flattened
                       representation guarantees a valid entry there. */
                    i++;
                    offset = flat_file->indices[i];
                    frd_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    frd_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
	else {
	    /* explicit offset: convert the etype count into (whole filetypes,
	       bytes into the current filetype) */
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    frd_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + n_filetypes* (ADIO_Offset)filetype_extent + abs_off_in_filetype;
	}

         /* calculate how much space to allocate for offset_list, len_list */

	/* first pass: only count the nonzero-length pieces */
	old_frd_size = frd_size;
	contig_access_count = i_offset = 0;
	j = st_index;
	bufsize = (ADIO_Offset)buftype_size * (ADIO_Offset)bufcount;
	frd_size = ADIOI_MIN(frd_size, bufsize);
	while (i_offset < bufsize) {
	    if (frd_size) contig_access_count++;
	    i_offset += frd_size;
	    j = (j + 1) % flat_file->count;
	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	}

        /* allocate space for offset_list and len_list */

	*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
        /* +1 to avoid a 0-size malloc */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;

      /* find start offset, end offset, and fill in offset_list and len_list */

	*start_offset_ptr = offset; /* calculated above */

	/* second pass: same walk, now recording each piece */
	i_offset = k = 0;
	j = st_index;
	off = offset;
	frd_size = ADIOI_MIN(old_frd_size, bufsize);
	while (i_offset < bufsize) {
	    if (frd_size) {
		offset_list[k] = off;
		len_list[k] = frd_size;
		k++;
	    }
	    i_offset += frd_size;
	    end_offset = off + frd_size - 1;

     /* Note: end_offset points to the last byte-offset that will be accessed.
         e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	    if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes* (ADIO_Offset)filetype_extent)
	    {
		off += frd_size;
		/* did not reach end of contiguous block in filetype.
		 * no more I/O needed. off is incremented by frd_size.
		 */
	    }
	    else {
		j = (j+1) % flat_file->count;
		n_filetypes += (j == 0) ? 1 : 0;
		while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    /* hit end of flattened filetype; start at beginning
		     * again */
		}
		off = disp + flat_file->indices[j] + n_filetypes* (ADIO_Offset)filetype_extent;
		frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	    }
	}

	/* update file pointer */
	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	*contig_access_count_ptr = contig_access_count;
	*end_offset_ptr = end_offset;
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5029, 0, NULL);
#endif
}
/*
 * ADIOI_PVFS_WriteStridedListIO - write a (possibly) noncontiguous user
 * buffer to a (possibly) noncontiguous file region using pvfs_write_list(),
 * at most MAX_ARRAY_SIZE memory and file segments per call.
 *
 *   fd             open file (atomic mode is rejected with MPI_ERR_INTERN)
 *   buf            user buffer
 *   count          number of `datatype` items in buf
 *   datatype       memory datatype
 *   file_ptr_type  ADIO_INDIVIDUAL or ADIO_EXPLICIT_OFFSET
 *   offset         explicit offset, in units of etype relative to the filetype
 *   status         filled with the requested byte count when
 *                  HAVE_STATUS_SET_BYTES is defined
 *   error_code     out: MPI_SUCCESS unless atomic mode is set
 *
 * Three cases are handled:
 *   1. noncontiguous memory, contiguous file
 *   2. contiguous memory, noncontiguous file
 *   3. everything else (treated as noncontiguous in both)
 *
 * NOTE(review): the result of pvfs_write_list() is never examined and
 * err_flag is never set, so the MPI_ERR_IO paths below are currently
 * unreachable.
 * NOTE(review): for ADIO_INDIVIDUAL accesses fd->fp_ind is set to `off`,
 * which in every case holds the starting offset, so the individual file
 * pointer is not advanced past the data written - confirm whether callers
 * rely on this.
 */
void ADIOI_PVFS_WriteStridedListIO(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
/* Since PVFS does not support file locking, can't do buffered writes
   as on Unix */

/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
    int bufsize, size, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, disp, start_off;
    int flag, st_fwr_size, st_n_filetypes;
    int new_bwr_size, new_fwr_size, err_flag=0;

    int mem_list_count, file_list_count;
    char ** mem_offsets;
    int64_t *file_offsets;
    int *mem_lengths;
    int32_t *file_lengths;
    int total_blks_to_write;

    int max_mem_list, max_file_list;

    int b_blks_wrote;
    int f_data_wrote;
    int size_wrote=0, n_write_lists, extra_blks;

    int end_bwr_size, end_fwr_size;
    int start_k, start_j, new_file_write, new_buffer_write;
    int start_mem_offset;
#define MAX_ARRAY_SIZE 1024
    static char myname[] = "ADIOI_PVFS_WRITESTRIDED";

/* PFS file pointer modes are not relevant here, because PFS does
   not support strided accesses. */

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_INTERN,
                                           "Atomic mode set in PVFS I/O function", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        /* zero-size filetype: nothing to write */
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file.  */
        /* scalars here deliberately shadow the outer array pointers: the
           file side is a single segment */
        int64_t file_offsets;
        int32_t file_lengths;

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fd->disp + etype_size * offset;
            pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
        }
        else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);

        file_list_count = 1;
        file_offsets = off;
        file_lengths = 0;
        total_blks_to_write = count*flat_buf->count;
        b_blks_wrote = 0;

        /* allocate arrays according to max usage */
        if (total_blks_to_write > MAX_ARRAY_SIZE)
            mem_list_count = MAX_ARRAY_SIZE;
        else mem_list_count = total_blks_to_write;
        mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*));
        mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));

        j = 0;
        /* step through each block in memory, filling memory arrays */
        while (b_blks_wrote < total_blks_to_write) {
            for (i=0; i<flat_buf->count; i++) {
                mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] =
                    ((char*)buf + j*buftype_extent + flat_buf->indices[i]);
                mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] =
                    flat_buf->blocklens[i];
                file_lengths += flat_buf->blocklens[i];
                b_blks_wrote++;
                if (!(b_blks_wrote % MAX_ARRAY_SIZE) ||
                    (b_blks_wrote == total_blks_to_write)) {

                    /* in the case of the last write list call,
                       adjust mem_list_count */
                    if (b_blks_wrote == total_blks_to_write) {
                        mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
                        /* in case last write list call fills max arrays */
                        if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
                    }

                    pvfs_write_list(fd->fd_sys ,mem_list_count, mem_offsets,
                                    mem_lengths, file_list_count,
                                    &file_offsets, &file_lengths);

                    /* in the case of the last write list call, leave here */
                    if (b_blks_wrote == total_blks_to_write) break;

                    file_offsets += file_lengths;
                    file_lengths = 0;
                }
            } /* for (i=0; i<flat_buf->count; i++) */
            j++;
        } /* while (b_blks_wrote < total_blks_to_write) */
        ADIOI_Free(mem_offsets);
        ADIOI_Free(mem_lengths);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;

        fd->fp_sys_posn = -1;   /* clear this. */

#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

        ADIOI_Delete_flattened(datatype);
        return;
    } /* if (!buftype_is_contig && filetype_is_contig) */

    /* already know that file is noncontiguous from above */
    /* noncontiguous in file */

/* filetype already flattened in ADIO_Open */
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype) flat_file = flat_file->next;

    disp = fd->disp;

    /* for each case - ADIO_Individual pointer or explicit, find offset
       (file offset in bytes), n_filetypes (how many filetypes into file
       to start), fwr_size (remaining amount of data in present file
       block), and st_index (start point in terms of blocks in starting
       filetype) */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind; /* in bytes */
        n_filetypes = -1;
        flag = 0;
        while (!flag) {
            n_filetypes++;
            for (i=0; i<flat_file->count; i++) {
                if (disp + flat_file->indices[i] +
                    (ADIO_Offset) n_filetypes*filetype_extent +
                    flat_file->blocklens[i] >= offset) {
                    st_index = i;
                    fwr_size = disp + flat_file->indices[i] +
                        (ADIO_Offset) n_filetypes*filetype_extent
                        + flat_file->blocklens[i] - offset;
                    flag = 1;
                    break;
                }
            }
        } /* while (!flag) */
    } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
    else {
        n_etypes_in_filetype = filetype_size/etype_size;
        n_filetypes = (int) (offset / n_etypes_in_filetype);
        etype_in_filetype = (int) (offset % n_etypes_in_filetype);
        size_in_filetype = etype_in_filetype * etype_size;

        sum = 0;
        for (i=0; i<flat_file->count; i++) {
            sum += flat_file->blocklens[i];
            if (sum > size_in_filetype) {
                st_index = i;
                fwr_size = sum - size_in_filetype;
                abs_off_in_filetype = flat_file->indices[i] +
                    size_in_filetype - (sum - flat_file->blocklens[i]);
                break;
            }
        }

        /* abs. offset in bytes in the file */
        offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
            abs_off_in_filetype;
    } /* else [file_ptr_type != ADIO_INDIVIDUAL] */

    start_off = offset;
    st_fwr_size = fwr_size;
    st_n_filetypes = n_filetypes;

    if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

        /* scalars here deliberately shadow the outer array pointers: the
           memory side is a single segment */
        int mem_lengths;
        char *mem_offsets;

        i = 0;
        j = st_index;
        off = offset;
        n_filetypes = st_n_filetypes;

        mem_list_count = 1;

        /* determine how many blocks in file to write */
        f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize);
        total_blks_to_write = 1;
        /* step to the block after st_index, wrapping at the end of the
           flattened filetype so blocklens[] is never indexed at count */
        if (j<(flat_file->count-1)) j++;
        else j = 0;
        while (f_data_wrote < bufsize) {
            f_data_wrote += flat_file->blocklens[j];
            total_blks_to_write++;
            if (j<(flat_file->count-1)) j++;
            else j = 0;
        }

        j = st_index;
        n_filetypes = st_n_filetypes;
        n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE;
        extra_blks = total_blks_to_write%MAX_ARRAY_SIZE;

        mem_offsets = buf;
        mem_lengths = 0;

        /* if at least one full writelist, allocate file arrays
           at max array size and don't free until very end */
        if (n_write_lists) {
            file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int32_t));
        }
        /* if there's no full writelist allocate file arrays according
           to needed size (extra_blks) */
        else {
            file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int32_t));
        }

        /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
        for (i=0; i<n_write_lists; i++) {
            file_list_count = MAX_ARRAY_SIZE;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_fwr_size;
                mem_lengths = st_fwr_size;
            }
            for (k=0; k<MAX_ARRAY_SIZE; k++) {
                if (i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent
                        + flat_file->indices[j];
                    file_lengths[k] = flat_file->blocklens[j];
                    mem_lengths += file_lengths[k];
                }
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
            pvfs_write_list(fd->fd_sys, mem_list_count,
                            &mem_offsets, &mem_lengths,
                            file_list_count, file_offsets,
                            file_lengths);
            mem_offsets += mem_lengths;
            mem_lengths = 0;
        } /* for (i=0; i<n_write_lists; i++) */

        /* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */
        if (extra_blks) {
            file_list_count = extra_blks;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_fwr_size;
            }
            for (k=0; k<extra_blks; k++) {
                if(i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent +
                        flat_file->indices[j];
                    if (k == (extra_blks - 1)) {
                        /* last segment takes whatever is left of the buffer;
                           use a pointer difference instead of truncating the
                           pointers themselves to 32 bits */
                        file_lengths[k] = bufsize - (int32_t) mem_lengths
                            - (int32_t) (mem_offsets - (char *) buf);
                    }
                    else file_lengths[k] = flat_file->blocklens[j];
                } /* if(i || k) */
                mem_lengths += file_lengths[k];
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<extra_blks; k++) */
            pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets,
                            &mem_lengths, file_list_count, file_offsets,
                            file_lengths);
        }
    }
    else {
        /* noncontiguous in memory as well as in file */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        /* `off` is stored into fd->fp_ind at the end for ADIO_INDIVIDUAL
           accesses; give it the starting offset, as the contiguous-memory
           case does, so it is never read uninitialized */
        off = offset;

        size_wrote = 0;
        n_filetypes = st_n_filetypes;
        fwr_size = st_fwr_size;
        bwr_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_mem_offset = 0;
        start_k = k = 0;
        start_j = st_index;
        max_mem_list = 0;
        max_file_list = 0;

        /* run through and find max_file_list and max_mem_list so that you
           can allocate the file and memory arrays less than MAX_ARRAY_SIZE
           if possible */

        while (size_wrote < bufsize) {
            k = start_k;
            new_buffer_write = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_write < bufsize-size_wrote)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data written and data to be
                   written in the next immediate write list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_write + flat_buf->blocklens[k] +
                        size_wrote) > bufsize) {
                        end_bwr_size = new_buffer_write +
                            flat_buf->blocklens[k] - (bufsize - size_wrote);
                        new_buffer_write = bufsize - size_wrote;
                    }
                    else {
                        new_buffer_write += flat_buf->blocklens[k];
                        end_bwr_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (bwr_size > (bufsize - size_wrote)) {
                        new_buffer_write = bufsize - size_wrote;
                        bwr_size = new_buffer_write;
                    }
                    else new_buffer_write = bwr_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_write < bufsize-size_wrote)) */
            j = start_j;
            new_file_write = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_write < new_buffer_write)) {
                if(file_list_count) {
                    if((new_file_write + flat_file->blocklens[j]) >
                       new_buffer_write) {
                        end_fwr_size = new_buffer_write - new_file_write;
                        new_file_write = new_buffer_write;
                        j--;
                    }
                    else {
                        new_file_write += flat_file->blocklens[j];
                        end_fwr_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (fwr_size > new_buffer_write) {
                        new_file_write = new_buffer_write;
                        fwr_size = new_file_write;
                    }
                    else new_file_write = fwr_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_write < new_buffer_write) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    /* file list is full but covers less than the memory list:
                       shrink the memory list to match */
                    new_buffer_write = 0;
                    mem_list_count = 0;
                    while (new_buffer_write < new_file_write) {
                        if(mem_list_count) {
                            if((new_buffer_write + flat_buf->blocklens[k]) >
                               new_file_write) {
                                end_bwr_size = new_file_write -
                                    new_buffer_write;
                                new_buffer_write = new_file_write;
                                k--;
                            }
                            else {
                                new_buffer_write += flat_buf->blocklens[k];
                                end_bwr_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_write = bwr_size;
                            if (bwr_size > (bufsize - size_wrote)) {
                                new_buffer_write = bufsize - size_wrote;
                                bwr_size = new_buffer_write;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_write < new_file_write) */
                } /* if ((new_file_write < new_buffer_write) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_write < new_buffer_write)) */

            /*  fakes filling the writelist arrays of lengths found above  */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                if(i) {
                    if (i == (mem_list_count - 1)) {
                        if (flat_buf->blocklens[k] == end_bwr_size)
                            bwr_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            bwr_size = flat_buf->blocklens[k] - end_bwr_size;
                            k--;
                            buf_count--;
                        }
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                if (i) {
                    if (i == (file_list_count - 1)) {
                        if (flat_file->blocklens[j] == end_fwr_size)
                            fwr_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            fwr_size = flat_file->blocklens[j] - end_fwr_size;
                            j--;
                        }
                    }
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */
            size_wrote += new_buffer_write;
            start_k = k;
            start_j = j;
            if (max_mem_list < mem_list_count)
                max_mem_list = mem_list_count;
            if (max_file_list < file_list_count)
                max_file_list = file_list_count;
            /* both lists already need the maximum size: no point scanning
               further (the original chained comparison was always false) */
            if ((max_mem_list == MAX_ARRAY_SIZE) &&
                (max_file_list == MAX_ARRAY_SIZE))
                break;
        } /* while (size_wrote < bufsize) */

        mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *));
        mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
        file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
        file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));

        /* restart the walk from the beginning for the real pass */
        size_wrote = 0;
        n_filetypes = st_n_filetypes;
        fwr_size = st_fwr_size;
        bwr_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_mem_offset = 0;
        start_k = k = 0;
        start_j = st_index;

        /*  this section calculates mem_list_count and file_list_count
            and also finds the possibly odd sized last array elements
            in new_fwr_size and new_bwr_size  */

        while (size_wrote < bufsize) {
            k = start_k;
            new_buffer_write = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_write < bufsize-size_wrote)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data written and data to be
                   written in the next immediate write list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_write + flat_buf->blocklens[k] +
                        size_wrote) > bufsize) {
                        end_bwr_size = new_buffer_write +
                            flat_buf->blocklens[k] - (bufsize - size_wrote);
                        new_buffer_write = bufsize - size_wrote;
                    }
                    else {
                        new_buffer_write += flat_buf->blocklens[k];
                        end_bwr_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (bwr_size > (bufsize - size_wrote)) {
                        new_buffer_write = bufsize - size_wrote;
                        bwr_size = new_buffer_write;
                    }
                    else new_buffer_write = bwr_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_write < bufsize-size_wrote)) */
            j = start_j;
            new_file_write = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_write < new_buffer_write)) {
                if(file_list_count) {
                    if((new_file_write + flat_file->blocklens[j]) >
                       new_buffer_write) {
                        end_fwr_size = new_buffer_write - new_file_write;
                        new_file_write = new_buffer_write;
                        j--;
                    }
                    else {
                        new_file_write += flat_file->blocklens[j];
                        end_fwr_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (fwr_size > new_buffer_write) {
                        new_file_write = new_buffer_write;
                        fwr_size = new_file_write;
                    }
                    else new_file_write = fwr_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_write < new_buffer_write) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    /* file list is full but covers less than the memory list:
                       shrink the memory list to match */
                    new_buffer_write = 0;
                    mem_list_count = 0;
                    while (new_buffer_write < new_file_write) {
                        if(mem_list_count) {
                            if((new_buffer_write + flat_buf->blocklens[k]) >
                               new_file_write) {
                                end_bwr_size = new_file_write -
                                    new_buffer_write;
                                new_buffer_write = new_file_write;
                                k--;
                            }
                            else {
                                new_buffer_write += flat_buf->blocklens[k];
                                end_bwr_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_write = bwr_size;
                            if (bwr_size > (bufsize - size_wrote)) {
                                new_buffer_write = bufsize - size_wrote;
                                bwr_size = new_buffer_write;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_write < new_file_write) */
                } /* if ((new_file_write < new_buffer_write) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_write < new_buffer_write)) */

            /*  fills the allocated writelist arrays  */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                mem_offsets[i] = ((char*)buf + buftype_extent*
                                  (buf_count/flat_buf->count) +
                                  (int)flat_buf->indices[k]);

                if(!i) {
                    /* first segment may start part-way into its block */
                    mem_lengths[0] = bwr_size;
                    mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
                }
                else {
                    if (i == (mem_list_count - 1)) {
                        mem_lengths[i] = end_bwr_size;
                        if (flat_buf->blocklens[k] == end_bwr_size)
                            bwr_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            bwr_size = flat_buf->blocklens[k] - end_bwr_size;
                            k--;
                            buf_count--;
                        }
                    }
                    else {
                        mem_lengths[i] = flat_buf->blocklens[k];
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                file_offsets[i] = disp + flat_file->indices[j] + n_filetypes *
                    filetype_extent;
                if (!i) {
                    /* first segment may start part-way into its block */
                    file_lengths[0] = fwr_size;
                    file_offsets[0] += flat_file->blocklens[j] - fwr_size;
                }
                else {
                    if (i == (file_list_count - 1)) {
                        file_lengths[i] = end_fwr_size;
                        if (flat_file->blocklens[j] == end_fwr_size)
                            fwr_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            fwr_size = flat_file->blocklens[j] - end_fwr_size;
                            j--;
                        }
                    }
                    else file_lengths[i] = flat_file->blocklens[j];
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */
            pvfs_write_list(fd->fd_sys,mem_list_count, mem_offsets,
                            mem_lengths, file_list_count, file_offsets,
                            file_lengths);
            size_wrote += new_buffer_write;
            start_k = k;
            start_j = j;
        } /* while (size_wrote < bufsize) */
        ADIOI_Free(mem_offsets);
        ADIOI_Free(mem_lengths);
    }
    ADIOI_Free(file_offsets);
    ADIOI_Free(file_lengths);

    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    if (err_flag) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE, myname,
                                           __LINE__, MPI_ERR_IO, "**io",
                                           "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*
 * ADIOI_PIOFS_SetInfo - process user hints for a file on PIOFS.
 *
 * fd          file handle; fd->info, fd->comm, fd->perm and fd->filename
 *             are read here, and fd->info is created if it is still
 *             MPI_INFO_NULL.
 * users_info  hints supplied by the user, may be MPI_INFO_NULL.
 * error_code  set to MPI_SUCCESS on return.
 *
 * When fd->info is MPI_INFO_NULL (i.e. this is part of the open call) the
 * hints "striping_factor", "striping_unit" and "start_iodevice" are looked
 * up.  Each value that is present is compared against rank 0's value; a
 * mismatch aborts the job.  If at least one of them is usable, rank 0 of
 * fd->comm issues a PIOFS_CREATE ioctl with the requested layout and all
 * ranks then meet in a barrier.  Finally the generic hints are handled by
 * ADIOI_GEN_SetInfo().
 */
void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    piofs_create_t piofs_create;
    piofs_statfs_t piofs_statfs;
    char *value, *path, *slash;
    const char *dir;
    int flag, tmp_val, str_factor=-1, str_unit=-1, start_iodev=-1;
    int err, myrank, perm, old_mask, nioservers;

    if ((fd->info) == MPI_INFO_NULL) {
        /* This must be part of the open call. can set striping parameters
           if necessary. */
        MPI_Info_create(&(fd->info));

        /* has user specified striping parameters
           and do they have the same value on all processes? */
        /* NOTE(review): each MPI_Bcast below is only reached when the key is
           present locally; if a key is set on some ranks but not on others
           the broadcasts will not match up - confirm callers guarantee
           identical key sets. */
        if (users_info != MPI_INFO_NULL) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
                         value, &flag);
            if (flag) {
                str_factor=atoi(value);
                tmp_val = str_factor;
                /* after the broadcast tmp_val holds rank 0's value */
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != str_factor) {
                    FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n");
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
            }

            MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
                         value, &flag);
            if (flag) {
                str_unit=atoi(value);
                tmp_val = str_unit;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != str_unit) {
                    FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n");
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
            }

            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
                         value, &flag);
            if (flag) {
                start_iodev=atoi(value);
                tmp_val = start_iodev;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != start_iodev) {
                    FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n");
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
            }

            ADIOI_Free(value);

            /* if user has specified striping info, process 0 tries to set it */
            if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
                MPI_Comm_rank(fd->comm, &myrank);
                if (!myrank) {
                    if (fd->perm == ADIO_PERM_NULL) {
                        /* umask() can only be read by setting it, so set a
                           temporary value and restore the old one at once */
                        old_mask = umask(022);
                        umask(old_mask);
                        /* default mode rw-rw-rw- with the umask bits
                           cleared; XOR would turn on mask bits that lie
                           outside 0666 (e.g. umask 077 -> 0611) */
                        perm = 0666 & ~old_mask;
                    }
                    else perm = fd->perm;

                    /* to find out the number of I/O servers, I need the path to
                       the directory containing the file */
                    nioservers = -1;    /* unknown unless the query succeeds */
                    path = strdup(fd->filename);
                    if (path) {
                        slash = strrchr(path, '/');
                        if (!slash)
                            /* no directory part: use the current directory.
                               Point at a literal instead of copying into
                               path, which may be only one byte long. */
                            dir = ".";
                        else {
                            if (slash == path) *(path + 1) = '\0'; /* file in "/" */
                            else *slash = '\0';
                            dir = path;
                        }
                        strcpy(piofs_statfs.name, dir);
                        err = piofsioctl(0, PIOFS_STATFS, &piofs_statfs);
                        nioservers = (err) ? -1 : piofs_statfs.f_nodes;
                        free(path);
                    }

                    /* clamp the request to what the file system offers; with
                       nioservers == -1 both values fall back to -1, which is
                       what is passed below when no value was requested */
                    str_factor = ADIOI_MIN(nioservers, str_factor);
                    if (start_iodev >= nioservers) start_iodev = -1;

                    strcpy(piofs_create.name, fd->filename);
                    piofs_create.bsu = (str_unit > 0) ? str_unit : -1;
                    piofs_create.cells = (str_factor > 0) ? str_factor : -1;
                    piofs_create.permissions = perm;
                    piofs_create.base_node = (start_iodev >= 0) ?
                                             start_iodev : -1;
                    piofs_create.flags = 0;

                    /* best effort: the result of the create request is not
                       examined */
                    err = piofsioctl(0, PIOFS_CREATE, &piofs_create);
                }
                /* the other ranks wait here until rank 0 is done */
                MPI_Barrier(fd->comm);
            }
        }
    }

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    /* NOTE(review): this overwrites whatever ADIOI_GEN_SetInfo reported;
       kept as in the original so callers see unchanged behaviour. */
    *error_code = MPI_SUCCESS;
}
/* * ADIOI_Sync_thread_start - start the synchronisation routine */ void *ADIOI_Sync_thread_start(void *ptr) { ADIOI_Sync_thread_t t = (ADIOI_Sync_thread_t)ptr; ADIOI_Atomic_queue_t q = (ADIOI_Atomic_queue_t)t->sub_; ADIOI_Sync_req_t r; size_t wr_count; MPI_Count datatype_size; char *buf; ADIO_Offset bytes_xfered, len, buf_size, offset, off; int type, count, fflags, error_code; ADIO_Request *req; MPI_Datatype datatype; /* get sync buffer size */ t->fd_; buf_size = t->fd_->hints->ind_wr_buffer_size; buf = (char *)ADIOI_Malloc(buf_size); for(;;) { /* get a new sync request */ #ifndef _USE_PTHREAD_MUTEX_ if ((r = ADIOI_Atomic_queue_front(q)) == NULL) continue; #else r = ADIOI_Atomic_queue_front(q); #endif /* pop sync request */ ADIOI_Atomic_queue_pop(q); /* get request type */ ADIOI_Sync_req_get_key(r, ADIOI_SYNC_TYPE, &type); /* check for shutdown type */ if (type == ADIOI_THREAD_SHUTDOWN) { break; } /* if sync type get all the fields */ ADIOI_Sync_req_get_key(r, ADIOI_SYNC_ALL, &offset, &datatype, &count, &req, &error_code, &fflags); /* init I/O req */ MPI_Type_size_x(datatype, &datatype_size); len = (ADIO_Offset)datatype_size * (ADIO_Offset)count; bytes_xfered = 0; off = offset; /* satisfy sync req */ while (bytes_xfered < len) { wr_count = (size_t)ADIOI_MIN(buf_size, len - bytes_xfered); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_read_a, 0, NULL); #endif /* read data from cache file */ pread(t->fd_->cache_fd->fd_sys, buf, wr_count, offset); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_read_b, 0, NULL); MPE_Log_event(ADIOI_MPE_thread_write_a, 0, NULL); #endif /* write data to global file */ pwrite(t->fd_->fd_sys, buf, wr_count, offset); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_write_b, 0, NULL); #endif /* update offset */ bytes_xfered += (ADIO_Offset)wr_count; offset += (ADIO_Offset)wr_count; } /* unlock extent locked in ADIO_WriteContig() */ if (t->fd_->hints->e10_cache_coherent == ADIOI_HINT_ENABLE) ADIOI_UNLOCK(t->fd_, off, 
SEEK_SET, len); /* ---Begin Error Handling--- */ /* --- End Error Handling --- */ /* complete Grequest */ MPI_Grequest_complete(*req); } ADIOI_Free(buf); pthread_exit(NULL); }
void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, brd_size, st_index=0; int num, size, sum, n_etypes_in_filetype, size_in_filetype; MPI_Count bufsize; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int req_len, partial_read; MPI_Count filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int st_n_filetypes, readbuf_len; ADIO_Offset frd_size=0, new_frd_size, st_frd_size; int new_brd_size, err_flag=0, info_flag, max_bufsize; static char myname[] = "ADIOI_NFS_READSTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, readbuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); /* malloced in the buffered_read macro */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= 
(ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0 ) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. 
*/ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i = 0; j = st_index; off = offset; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i < bufsize) { i += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read into readbuf */ readbuf_off = offset; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. 
save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i; ADIOI_BUFFERED_READ } i += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else {
/*
 * ADIOI_XFS_WriteContig - write a contiguous buffer to an XFS file.
 *
 * fd             file handle.
 * buf            data to write.
 * count          number of elements of datatype in buf.
 * datatype       MPI datatype of the elements.
 * file_ptr_type  ADIO_INDIVIDUAL: write at fd->fp_ind and advance it;
 *                otherwise write at the given offset.
 * offset         byte offset used when file_ptr_type is not ADIO_INDIVIDUAL.
 * status         receives the number of bytes written (only when
 *                HAVE_STATUS_SET_BYTES is defined and the write succeeded).
 * error_code     MPI_SUCCESS, or an MPI_ERR_IO class code when the write
 *                failed.
 *
 * Without direct I/O (fd->direct_write == 0) this is a single pwrite().
 * With direct I/O the request is passed to
 * ADIOI_XFS_Aligned_Mem_File_Write() once both the memory address
 * (multiple of fd->d_mem) and the file offset (multiple of fd->d_miniosz)
 * are aligned; an unaligned file offset is first brought to alignment with
 * a buffered pwrite(), and an unaligned buffer is copied into memory from
 * memalign().
 *
 * The individual file pointer is advanced only when the write succeeded.
 */
void ADIOI_XFS_WriteContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status,
                           int *error_code)
{
    int err=-1, datatype_size, len, diff, size, nbytes;
    void *newbuf;
    static char myname[] = "ADIOI_XFS_WRITECONTIG";

    MPI_Type_size(datatype, &datatype_size);
    len = datatype_size * count;

    fd->fp_sys_posn = -1; /* set it to null, since we are using pwrite */

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;

    if (!(fd->direct_write))     /* direct I/O not enabled */
        err = pwrite(fd->fd_sys, buf, len, offset);
    else {       /* direct I/O enabled */

        /* (1) if mem_aligned && file_aligned
                    use direct I/O to write up to correct io_size
                    use buffered I/O for remaining  */

        if (!(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz))
            ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, len, offset, &err);

        /* (2) if !file_aligned
                    use buffered I/O to write up to file_aligned
                    At that point, if still mem_aligned, use (1)
                    else copy into aligned buf and then use (1) */
        else if (offset % fd->d_miniosz) {
            diff = fd->d_miniosz - (offset % fd->d_miniosz);
            diff = ADIOI_MIN(diff, len);
            nbytes = pwrite(fd->fd_sys, buf, diff, offset);

            /* Go on with the remainder only if the head went out in full.
               After a failed (-1) or short head write, nbytes is reported
               as is, so no gap is left in the file and no -1 is mixed into
               a byte count. */
            size = len - diff;
            if (nbytes == diff && size > 0) {
                buf = ((char *) buf) + diff;
                offset += diff;

                err = -1;   /* stays -1 unless the tail write reports a count */
                if (!(((long) buf) % fd->d_mem))
                    ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, size, offset, &err);
                else {
                    newbuf = (void *) memalign(XFS_MEMALIGN, size);
                    if (newbuf) {
                        memcpy(newbuf, buf, size);
                        ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err);
                        free(newbuf);
                    }
                    /* no aligned memory available: fall back to buffered I/O */
                    else err = pwrite(fd->fd_sys, buf, size, offset);
                }
                /* a failed tail makes the whole request fail
                   (assumes the helper reports failure as -1, as pwrite
                   does - TODO confirm) */
                nbytes = (err == -1) ? -1 : nbytes + err;
            }
            err = nbytes;
        }

        /* (3) if !mem_aligned && file_aligned
                    copy into aligned buf, then use (1)  */
        else {
            newbuf = (void *) memalign(XFS_MEMALIGN, len);
            if (newbuf) {
                memcpy(newbuf, buf, len);
                ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, len, offset, &err);
                free(newbuf);
            }
            else err = pwrite(fd->fd_sys, buf, len, offset);
        }
    }

    /* advance the individual file pointer only on success; adding -1 would
       move it one byte backwards */
    if ((file_ptr_type == ADIO_INDIVIDUAL) && (err != -1)) fd->fp_ind += err;

#ifdef HAVE_STATUS_SET_BYTES
    if (err != -1) MPIR_Status_set_bytes(status, datatype, err);
#endif

    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO, "**io",
                                           "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, new_brd_size, brd_size, size; int i, j, k, st_index=0; MPI_Count num, bufsize; int n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype; ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size; MPI_Count filetype_size, etype_size, buftype_size, partial_read; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off, req_len, sum; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int info_flag; unsigned max_bufsize, readbuf_len; ADIO_Status status1; if (fd->hints->ds_read == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. 
*/ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; for (j=0; j<count; j++) { for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } } if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; 
i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { /* a count of bytes can overflow. operate on original type instead */ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. 
*/ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { i_offset += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); readbuf_off = 0; readbuf_len = 0; readbuf = (char *) ADIOI_Malloc(max_bufsize); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i_offset; ADIOI_BUFFERED_READ } i_offset += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 
1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
/*
 * ADIOI_Calc_file_domains - split the accessed file range into file domains.
 *
 * The byte range touched by all processes is cut into nprocs_for_coll
 * consecutive pieces of equal size ("file domains"), one per aggregator.
 *
 * st_offsets, end_offsets  first and last byte accessed by each of the
 *                          nprocs processes.
 * nprocs                   number of entries in the two arrays above.
 * nprocs_for_coll          number of file domains to create.
 * min_st_offset_ptr        out: start of the partitioned range (the smallest
 *                          start offset, rounded down by ALIGNDOWN when an
 *                          alignment is given).
 * fd_start_ptr, fd_end_ptr out: arrays of nprocs_for_coll entries obtained
 *                          from ADIOI_Malloc, holding the first and last
 *                          byte of every domain; a domain lying entirely
 *                          past the accessed range has both set to -1.
 *                          Nothing in this routine frees them.
 * fd_size_ptr              in: alignment for the domain size, 0 for none;
 *                          out: the size used for every domain.
 */
void ADIOI_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset *end_offsets,
                             int nprocs, int nprocs_for_coll,
                             ADIO_Offset *min_st_offset_ptr,
                             ADIO_Offset **fd_start_ptr,
                             ADIO_Offset **fd_end_ptr,
                             ADIO_Offset *fd_size_ptr)
{
    ADIO_Offset range_lo, range_hi, domain_len;
    ADIO_Offset *starts, *ends;
    ADIO_Offset align = *fd_size_ptr;
    int p;

#ifdef AGG_DEBUG
    FPRINTF(stderr, "ADIOI_Calc_file_domains: %d aggregator(s)\n",
            nprocs_for_coll);
#endif

    /* overall range: smallest start offset .. largest end offset */
    range_lo = st_offsets[0];
    range_hi = end_offsets[0];
    for (p = 1; p < nprocs; p++) {
        if (st_offsets[p] < range_lo)
            range_lo = st_offsets[p];
        if (end_offsets[p] > range_hi)
            range_hi = end_offsets[p];
    }

    /* with an alignment the range starts on an aligned boundary */
    if (align)
        range_lo = ALIGNDOWN(range_lo, align);

    /* equal share per domain; ceiling division as in HPF block
       distribution */
    domain_len = ((range_hi - range_lo + 1) + nprocs_for_coll - 1) /
                 nprocs_for_coll;

    /* ... rounded up to a whole number of alignment units if requested */
    if (align)
        domain_len = (domain_len + align - 1) / align * align;

    starts = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    ends = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    *fd_start_ptr = starts;
    *fd_end_ptr = ends;

    /* Lay the domains out back to back.  When the range is not divisible by
       the number of domains the last one(s) carry less load, possibly none:
       e.g. a range of 97 over 16 domains of 7 leaves the 15th domain short
       and the 16th empty. */
    for (p = 0; p < nprocs_for_coll; p++) {
        ADIO_Offset first = range_lo + (ADIO_Offset) p * domain_len;

        if (first > range_hi) {
            /* nothing left for this domain */
            starts[p] = -1;
            ends[p] = -1;
        }
        else {
            ADIO_Offset last = first + domain_len - 1;

            starts[p] = first;
            ends[p] = (last > range_hi) ? range_hi : last;
        }
    }

    *fd_size_ptr = domain_len;
    *min_st_offset_ptr = range_lo;
}
/*
 * ADIOI_PVFS_ReadStridedListIO - strided read implemented with
 * pvfs_read_list().
 *
 *   fd             open ADIO file
 *   buf            user buffer to fill
 *   count          number of instances of "datatype" to read
 *   datatype       memory datatype
 *   file_ptr_type  ADIO_INDIVIDUAL (use and update fd->fp_ind) or an
 *                  explicit offset
 *   offset         for explicit offsets: in units of etype relative to the
 *                  filetype; ignored for ADIO_INDIVIDUAL
 *   status         filled with buftype_size*count bytes when
 *                  HAVE_STATUS_SET_BYTES is defined
 *   error_code     always set to MPI_SUCCESS here; the return value of
 *                  pvfs_read_list() is not examined
 *
 * Memory and file regions are handed to pvfs_read_list() in batches of at
 * most MAX_ARRAY_SIZE entries.  Three layouts are handled separately:
 *   - noncontiguous in memory, contiguous in file
 *   - contiguous in memory, noncontiguous in file
 *   - noncontiguous in both
 *
 * For ADIO_INDIVIDUAL the file pointer is left on the byte following the
 * last file region that was requested.  When that is the end of a filetype
 * block, the block search at the top of the next strided call (which
 * compares with ">=") starts with zero bytes left in that block and moves
 * on to the next one.
 */
void ADIOI_PVFS_ReadStridedListIO(ADIO_File fd, void *buf, int count,
                                  MPI_Datatype datatype, int file_ptr_type,
                                  ADIO_Offset offset, ADIO_Status *status,
                                  int *error_code)
{
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, brd_size, frd_size=0, st_index=0;
    int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    ADIO_Offset next_fp;    /* byte after the last file region requested */
    int flag, st_frd_size, st_n_filetypes;
    int mem_list_count, file_list_count;
    char **mem_offsets;
    int64_t *file_offsets;
    int *mem_lengths;
    int32_t *file_lengths;
    int total_blks_to_read;

    int max_mem_list, max_file_list;

    int b_blks_read;
    int f_data_read;
    int size_read=0, n_read_lists, extra_blks;

    int end_brd_size, end_frd_size;
    int start_k, start_j, new_file_read, new_buffer_read;

#define MAX_ARRAY_SIZE 1024

#ifndef PRINT_ERR_MESG
    static char myname[] = "ADIOI_PVFS_ReadStrided";
#endif

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        /* an empty filetype selects nothing: report a zero-byte read */
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {

        /* noncontiguous in memory, contiguous in file. */
        int64_t file_offsets;
        int32_t file_lengths;

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + etype_size * offset;

        /* the file side is always a single region; its length is the sum
           of the memory blocks batched so far */
        file_list_count = 1;
        file_offsets = off;
        file_lengths = 0;
        total_blks_to_read = count*flat_buf->count;
        b_blks_read = 0;

        /* allocate arrays according to max usage */
        if (total_blks_to_read > MAX_ARRAY_SIZE)
            mem_list_count = MAX_ARRAY_SIZE;
        else mem_list_count = total_blks_to_read;
        mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*));
        mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));

        j = 0;
        /* step through each block in memory, filling memory arrays */
        while (b_blks_read < total_blks_to_read) {
            for (i=0; i<flat_buf->count; i++) {
                mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
                    (char*)((char *)buf + j*buftype_extent + flat_buf->indices[i]);
                mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
                    flat_buf->blocklens[i];
                file_lengths += flat_buf->blocklens[i];
                b_blks_read++;
                if (!(b_blks_read % MAX_ARRAY_SIZE) ||
                    (b_blks_read == total_blks_to_read)) {

                    /* in the case of the last read list call,
                       adjust mem_list_count */
                    if (b_blks_read == total_blks_to_read) {
                        mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
                        /* in case last read list call fills max arrays */
                        if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
                    }

                    pvfs_read_list(fd->fd_sys ,mem_list_count, mem_offsets,
                                   mem_lengths, file_list_count,
                                   &file_offsets, &file_lengths);

                    /* in the case of the last read list call, leave here */
                    if (b_blks_read == total_blks_to_read) break;

                    file_offsets += file_lengths;
                    file_lengths = 0;
                }
            } /* for (i=0; i<flat_buf->count; i++) */
            j++;
        } /* while (b_blks_read < total_blks_to_read) */
        ADIOI_Free(mem_offsets);
        ADIOI_Free(mem_lengths);

        /* "off" itself is never advanced above; the request covered bufsize
           contiguous bytes of the file starting at off, so the individual
           file pointer moves past them */
        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off + bufsize;

        fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, bufsize);
        /* This is a temporary way of filling in status. The right way is to
           keep track of how much data was actually read and placed in buf. */
#endif

        ADIOI_Delete_flattened(datatype);
        return;
    } /* if (!buftype_is_contig && filetype_is_contig) */

    /* know file is noncontiguous from above */
    /* noncontiguous in file */

    /* filetype already flattened in ADIO_Open */
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype) flat_file = flat_file->next;

    disp = fd->disp;

    /* for each case - ADIO_Individual pointer or explicit, find the file
       offset in bytes (offset), n_filetypes (how many filetypes into
       file to start), frd_size (remaining amount of data in present
       file block), and st_index (start point in terms of blocks in
       starting filetype) */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind; /* in bytes */
        n_filetypes = -1;
        flag = 0;
        while (!flag) {
            n_filetypes++;
            for (i=0; i<flat_file->count; i++) {
                if (disp + flat_file->indices[i] +
                    (ADIO_Offset) n_filetypes*filetype_extent +
                    flat_file->blocklens[i] >= offset) {
                    st_index = i;
                    frd_size = (int) (disp + flat_file->indices[i] +
                                      (ADIO_Offset) n_filetypes*filetype_extent
                                      + flat_file->blocklens[i] - offset);
                    flag = 1;
                    break;
                }
            }
        } /* while (!flag) */
    } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
    else {
        n_etypes_in_filetype = filetype_size/etype_size;
        n_filetypes = (int) (offset / n_etypes_in_filetype);
        etype_in_filetype = (int) (offset % n_etypes_in_filetype);
        size_in_filetype = etype_in_filetype * etype_size;

        sum = 0;
        for (i=0; i<flat_file->count; i++) {
            sum += flat_file->blocklens[i];
            if (sum > size_in_filetype) {
                st_index = i;
                frd_size = sum - size_in_filetype;
                abs_off_in_filetype = flat_file->indices[i] +
                    size_in_filetype - (sum - flat_file->blocklens[i]);
                break;
            }
        }

        /* abs. offset in bytes in the file */
        offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
            abs_off_in_filetype;
    } /* else [file_ptr_type != ADIO_INDIVIDUAL] */

    st_frd_size = frd_size;
    st_n_filetypes = n_filetypes;

    /* nothing requested yet: the file pointer would stay at the start */
    next_fp = offset;

    if (buftype_is_contig && !filetype_is_contig) {

        /* contiguous in memory, noncontiguous in file. should be the most
           common case. */

        int mem_lengths;
        char *mem_offsets;
        int bytes_left;

        i = 0;
        j = st_index;
        n_filetypes = st_n_filetypes;

        mem_list_count = 1;

        /* determine how many blocks in file to read */
        f_data_read = ADIOI_MIN(st_frd_size, bufsize);
        total_blks_to_read = 1;
        /* step to the next block, wrapping to the first block of the next
           filetype instance so blocklens[] is never indexed at count */
        if (j < (flat_file->count - 1)) j++;
        else j = 0;
        while (f_data_read < bufsize) {
            f_data_read += flat_file->blocklens[j];
            total_blks_to_read++;
            if (j<(flat_file->count-1)) j++;
            else j = 0;
        }

        j = st_index;
        n_filetypes = st_n_filetypes;
        n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
        extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;

        mem_offsets = buf;
        mem_lengths = 0;

        /* if at least one full readlist, allocate file arrays
           at max array size and don't free until very end */
        if (n_read_lists) {
            file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int32_t));
        }
        /* if there's no full readlist allocate file arrays according
           to needed size (extra_blks) */
        else {
            file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int32_t));
        }

        /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
        for (i=0; i<n_read_lists; i++) {
            file_list_count = MAX_ARRAY_SIZE;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_frd_size;
                mem_lengths = st_frd_size;
            }
            for (k=0; k<MAX_ARRAY_SIZE; k++) {
                if (i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent
                        + flat_file->indices[j];
                    file_lengths[k] = flat_file->blocklens[j];
                    /* when the block count is an exact multiple of
                       MAX_ARRAY_SIZE the final block of the request lands
                       here; never ask for more than the buffer can hold */
                    bytes_left = bufsize
                        - (int) (mem_offsets - (char *) buf) - mem_lengths;
                    if (file_lengths[k] > bytes_left)
                        file_lengths[k] = bytes_left;
                    mem_lengths += file_lengths[k];
                }
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
            pvfs_read_list(fd->fd_sys, mem_list_count,
                           &mem_offsets, &mem_lengths,
                           file_list_count, file_offsets,
                           file_lengths);
            next_fp = file_offsets[file_list_count - 1] +
                file_lengths[file_list_count - 1];
            mem_offsets += mem_lengths;
            mem_lengths = 0;
        } /* for (i=0; i<n_read_lists; i++) */

        /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
        if (extra_blks) {
            file_list_count = extra_blks;
            if(!i) {
                file_offsets[0] = offset;
                /* a request that fits inside the first block must not
                   read the rest of that block too */
                file_lengths[0] = ADIOI_MIN(st_frd_size, bufsize);
            }
            for (k=0; k<extra_blks; k++) {
                if(i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent +
                        flat_file->indices[j];
                    if (k == (extra_blks - 1)) {
                        /* last block: whatever is still missing from the
                           buffer (pointer difference, not truncated
                           pointer values) */
                        file_lengths[k] = bufsize - (int32_t) mem_lengths
                            - (int32_t) (mem_offsets - (char *) buf);
                    }
                    else file_lengths[k] = flat_file->blocklens[j];
                } /* if(i || k) */
                mem_lengths += file_lengths[k];
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<extra_blks; k++) */
            pvfs_read_list(fd->fd_sys, mem_list_count, &mem_offsets,
                           &mem_lengths, file_list_count, file_offsets,
                           file_lengths);
            next_fp = file_offsets[file_list_count - 1] +
                file_lengths[file_list_count - 1];
        }
    }
    else {
        /* noncontiguous in memory as well as in file */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        size_read = 0;
        n_filetypes = st_n_filetypes;
        frd_size = st_frd_size;
        brd_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_k = k = 0;
        start_j = st_index;
        max_mem_list = 0;
        max_file_list = 0;

        /* run through and find max_file_list and max_mem_list so that you
           can allocate the file and memory arrays less than MAX_ARRAY_SIZE
           if possible */

        while (size_read < bufsize) {
            k = start_k;
            new_buffer_read = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_read < bufsize-size_read)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data read and data to be
                   read in the next immediate read list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_read + flat_buf->blocklens[k] +
                        size_read) > bufsize) {
                        end_brd_size = new_buffer_read +
                            flat_buf->blocklens[k] - (bufsize - size_read);
                        new_buffer_read = bufsize - size_read;
                    }
                    else {
                        new_buffer_read += flat_buf->blocklens[k];
                        end_brd_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (brd_size > (bufsize - size_read)) {
                        new_buffer_read = bufsize - size_read;
                        brd_size = new_buffer_read;
                    }
                    else new_buffer_read = brd_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_read < bufsize-size_read)) */
            j = start_j;
            new_file_read = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_read < new_buffer_read)) {
                if(file_list_count) {
                    if((new_file_read + flat_file->blocklens[j]) >
                       new_buffer_read) {
                        end_frd_size = new_buffer_read - new_file_read;
                        new_file_read = new_buffer_read;
                        j--;
                    }
                    else {
                        new_file_read += flat_file->blocklens[j];
                        end_frd_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (frd_size > new_buffer_read) {
                        new_file_read = new_buffer_read;
                        frd_size = new_file_read;
                    }
                    else new_file_read = frd_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_read < new_buffer_read) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    /* the file list filled up first: shrink the memory
                       list so both sides cover the same byte count */
                    new_buffer_read = 0;
                    mem_list_count = 0;
                    while (new_buffer_read < new_file_read) {
                        if(mem_list_count) {
                            if((new_buffer_read + flat_buf->blocklens[k]) >
                               new_file_read) {
                                end_brd_size = new_file_read - new_buffer_read;
                                new_buffer_read = new_file_read;
                                k--;
                            }
                            else {
                                new_buffer_read += flat_buf->blocklens[k];
                                end_brd_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_read = brd_size;
                            if (brd_size > (bufsize - size_read)) {
                                new_buffer_read = bufsize - size_read;
                                brd_size = new_buffer_read;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_read < new_file_read) */
                } /* if ((new_file_read < new_buffer_read) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_read < new_buffer_read)) */

            /* fakes filling the readlist arrays of lengths found above */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                if(i) {
                    if (i == (mem_list_count - 1)) {
                        if (flat_buf->blocklens[k] == end_brd_size)
                            brd_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            brd_size = flat_buf->blocklens[k] - end_brd_size;
                            k--;
                            buf_count--;
                        }
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                if (i) {
                    if (i == (file_list_count - 1)) {
                        if (flat_file->blocklens[j] == end_frd_size)
                            frd_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            frd_size = flat_file->blocklens[j] - end_frd_size;
                            j--;
                        }
                    }
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */
            size_read += new_buffer_read;
            start_k = k;
            start_j = j;
            if (max_mem_list < mem_list_count)
                max_mem_list = mem_list_count;
            if (max_file_list < file_list_count)
                max_file_list = file_list_count;
        } /* while (size_read < bufsize) */

        mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *));
        mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
        file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
        file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));

        size_read = 0;
        n_filetypes = st_n_filetypes;
        frd_size = st_frd_size;
        brd_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_k = k = 0;
        start_j = st_index;

        /* this section calculates mem_list_count and file_list_count
           and also finds the possibly odd sized last array elements
           in end_frd_size and end_brd_size */

        while (size_read < bufsize) {
            k = start_k;
            new_buffer_read = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_read < bufsize-size_read)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data read and data to be
                   read in the next immediate read list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_read + flat_buf->blocklens[k] +
                        size_read) > bufsize) {
                        end_brd_size = new_buffer_read +
                            flat_buf->blocklens[k] - (bufsize - size_read);
                        new_buffer_read = bufsize - size_read;
                    }
                    else {
                        new_buffer_read += flat_buf->blocklens[k];
                        end_brd_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (brd_size > (bufsize - size_read)) {
                        new_buffer_read = bufsize - size_read;
                        brd_size = new_buffer_read;
                    }
                    else new_buffer_read = brd_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_read < bufsize-size_read)) */
            j = start_j;
            new_file_read = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_read < new_buffer_read)) {
                if(file_list_count) {
                    if((new_file_read + flat_file->blocklens[j]) >
                       new_buffer_read) {
                        end_frd_size = new_buffer_read - new_file_read;
                        new_file_read = new_buffer_read;
                        j--;
                    }
                    else {
                        new_file_read += flat_file->blocklens[j];
                        end_frd_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (frd_size > new_buffer_read) {
                        new_file_read = new_buffer_read;
                        frd_size = new_file_read;
                    }
                    else new_file_read = frd_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_read < new_buffer_read) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    /* the file list filled up first: shrink the memory
                       list so both sides cover the same byte count */
                    new_buffer_read = 0;
                    mem_list_count = 0;
                    while (new_buffer_read < new_file_read) {
                        if(mem_list_count) {
                            if((new_buffer_read + flat_buf->blocklens[k]) >
                               new_file_read) {
                                end_brd_size = new_file_read - new_buffer_read;
                                new_buffer_read = new_file_read;
                                k--;
                            }
                            else {
                                new_buffer_read += flat_buf->blocklens[k];
                                end_brd_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_read = brd_size;
                            if (brd_size > (bufsize - size_read)) {
                                new_buffer_read = bufsize - size_read;
                                brd_size = new_buffer_read;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_read < new_file_read) */
                } /* if ((new_file_read < new_buffer_read) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_read < new_buffer_read)) */

            /* fills the allocated readlist arrays */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                mem_offsets[i] = (char*)((char *)buf + buftype_extent*
                                         (buf_count/flat_buf->count) +
                                         (int)flat_buf->indices[k]);

                if(!i) {
                    mem_lengths[0] = brd_size;
                    mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
                }
                else {
                    if (i == (mem_list_count - 1)) {
                        mem_lengths[i] = end_brd_size;
                        if (flat_buf->blocklens[k] == end_brd_size)
                            brd_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            brd_size = flat_buf->blocklens[k] - end_brd_size;
                            k--;
                            buf_count--;
                        }
                    }
                    else {
                        mem_lengths[i] = flat_buf->blocklens[k];
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                file_offsets[i] = disp + flat_file->indices[j] + n_filetypes *
                    filetype_extent;
                if (!i) {
                    file_lengths[0] = frd_size;
                    file_offsets[0] += flat_file->blocklens[j] - frd_size;
                }
                else {
                    if (i == (file_list_count - 1)) {
                        file_lengths[i] = end_frd_size;
                        if (flat_file->blocklens[j] == end_frd_size)
                            frd_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            frd_size = flat_file->blocklens[j] - end_frd_size;
                            j--;
                        }
                    }
                    else file_lengths[i] = flat_file->blocklens[j];
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */

            pvfs_read_list(fd->fd_sys,mem_list_count, mem_offsets, mem_lengths,
                           file_list_count, file_offsets,
                           file_lengths);
            if (file_list_count > 0)
                next_fp = file_offsets[file_list_count - 1] +
                    file_lengths[file_list_count - 1];
            size_read += new_buffer_read;
            start_k = k;
            start_j = j;
        } /* while (size_read < bufsize) */
        ADIOI_Free(mem_offsets);
        ADIOI_Free(mem_lengths);
    }
    ADIOI_Free(file_offsets);
    ADIOI_Free(file_lengths);

    /* "off" is only assigned on the contiguous-file path that returned
       earlier, so the new position is the one tracked in next_fp */
    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = next_fp;
    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*
 * ADIOI_BGL_GPFS_Calc_file_domains - partition the aggregate access range
 * into one file domain (FD) per I/O aggregator, with the boundaries
 * between domains falling on file system block boundaries, so that
 * aggregators do not share a block (no false sharing of GPFS blocks).
 *
 *   st_offsets, end_offsets  per-process first/last byte accessed
 *                            (nprocs entries each)
 *   nprocs_for_coll          number of aggregators / file domains
 *   min_st_offset_ptr  [out] smallest start offset over all processes
 *   fd_start_ptr       [out] newly ADIOI_Malloc'ed array of domain starts
 *   fd_end_ptr         [out] newly ADIOI_Malloc'ed array of domain ends
 *   fd_size_ptr        [out] size of the first domain
 *   fs_ptr                   optional ADIOI_BGL_fs; its blksize is used
 *                            when the pointer and the value are nonzero,
 *                            otherwise 1 MiB
 *
 * The common (non-BGL) version of this routine additionally takes a
 * min_fd_size and a striping_unit.  Neither is implemented here; the
 * block size plays the role of the striping unit.
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int nprocs,
                                      int nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset;
    ADIO_Offset *dom_start, *dom_end, *dom_len;
    int p, a;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
            myname,__LINE__,nprocs_for_coll);
#   endif

    /* Block size: default to 1M, overridden by a usable fs_ptr. */
    __blksize_t blksize = 1048576;
    ADIOI_BGL_fs *bgl_fs = (ADIOI_BGL_fs *) fs_ptr;
    if (bgl_fs != NULL && bgl_fs->blksize != 0) {
        blksize = bgl_fs->blksize;
    }
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif

    /* Overall access range: smallest start offset, largest end offset. */
    min_st_offset = st_offsets[0];
    max_end_offset = end_offsets[0];
    for (p = 1; p < nprocs; p++) {
        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[p]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[p]);
    }

    /* Widen the range outward to block boundaries, remembering by how
     * much each end was moved. */
    ADIO_Offset gpfs_ub = (max_end_offset + blksize - 1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = gpfs_ub - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - gpfs_lb;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int naggs = nprocs_for_coll;

    /* The common code bumps fd_size up to a min_fd_size threshold to
     * balance efficiency against parallelism.  That is deliberately not
     * done here: the hint processing has already chosen a number of
     * aggregators per pset, and enlarging the domains would leave some
     * psets with all of their aggregators busy and others with none. */

    dom_len = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    dom_start = *fd_start_ptr;
    dom_end = *fd_end_ptr;

    /* Deal whole blocks out to the aggregators: the first naggs_small of
     * them get nb_cn_small blocks, the remaining naggs_large get one
     * more. */
    ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small = n_gpfs_blk / naggs;
    ADIO_Offset naggs_large = n_gpfs_blk % naggs;
    ADIO_Offset naggs_small = naggs - naggs_large;

    for (a = 0; a < naggs; a++) {
        dom_len[a] = (a < naggs_small ? nb_cn_small : nb_cn_small + 1) * blksize;
    }

    /* Potential optimization: when there are fewer blocks than
     * aggregators, slip in some zero-sized file domains so the work is
     * spread across all psets. */

#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): "
                   "gpfs_ub       %llu, "
                   "gpfs_lb       %llu, "
                   "gpfs_ub_rdoff %llu, "
                   "gpfs_lb_rdoff %llu, "
                   "fd_gpfs_range %llu, "
                   "n_gpfs_blk    %llu, "
                   "nb_cn_small   %llu, "
                   "naggs_large   %llu, "
                   "naggs_small   %llu, "
                   "\n",
                   myname,__LINE__,
                   gpfs_ub      ,
                   gpfs_lb      ,
                   gpfs_ub_rdoff,
                   gpfs_lb_rdoff,
                   fd_gpfs_range,
                   n_gpfs_blk   ,
                   nb_cn_small  ,
                   naggs_large  ,
                   naggs_small
                   );
#   endif

    /* Give back the bytes added by the outward rounding: the first domain
     * begins at the real start offset and the last one stops at the real
     * end offset. */
    dom_len[0] -= gpfs_lb_rdoff;
    dom_len[naggs - 1] -= gpfs_ub_rdoff;

    /* Lay the domains out back to back from the real start offset. */
    ADIO_Offset cursor = min_st_offset;
    for (a = 0; a < naggs; a++) {
        dom_start[a] = cursor;
        dom_end[a] = cursor + dom_len[a] - 1;
        cursor += dom_len[a];
    }

    *fd_size_ptr = dom_len[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (dom_len);
}
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, sum, size_in_filetype; int i, j, k, err=-1, st_index=0; int n_etypes_in_filetype; ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf, *value; unsigned bufsize, writebuf_len, max_bufsize, write_sz; int err_flag=0, info_flag; ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len; static char myname[] = "ADIOI_BGL_WRITESTRIDED"; if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. 
*/ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); for (j=0; j<count; j++) { int i; for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } } /* write the buffer out finally */ lseek(fd->fd_sys, writebuf_off, SEEK_SET); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); err = write(fd->fd_sys, writebuf, writebuf_len); if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_Free(writebuf); /* malloced in the buffered_write macro */ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = 
flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { int i; n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. 
*/ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); lseek(fd->fd_sys, writebuf_off, SEEK_SET); err = read(fd->fd_sys, writebuf, writebuf_len); if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. 
Please give the file read permission and open it with MPI_MODE_RDWR.", 0); return; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
/* #define IO_DEBUG 1 */

/*
 * ADIOI_NOLOCK_WriteStrided - strided (noncontiguous) write that never takes
 * file locks.
 *
 * Borrowed from old-school PVFS (v1) code.  A driver for file systems that
 * cannot or do not support client-side buffering.
 *   - Does not do the data sieving optimization.
 *   - Does contain a write-combining optimization for the case
 *     "noncontiguous in memory, contiguous in file".
 *
 * Parameters:
 *   fd            - open ADIO file; fd->atomicity must be off
 *   buf           - user buffer holding `count` items of `datatype`
 *   count         - number of `datatype` items to write
 *   datatype      - memory datatype (may be noncontiguous)
 *   file_ptr_type - ADIO_EXPLICIT_OFFSET or ADIO_INDIVIDUAL
 *   offset        - for ADIO_EXPLICIT_OFFSET: offset in units of etype
 *                   relative to the filetype; ignored for ADIO_INDIVIDUAL,
 *                   where fd->fp_ind (in bytes) is used instead
 *   status        - filled with bufsize bytes when HAVE_STATUS_SET_BYTES
 *   error_code    - MPI_SUCCESS, or an MPI error code if atomic mode is set
 *                   or any lseek()/write() call returned -1
 *
 * Short writes (write() returning fewer bytes than requested) are not
 * detected; only a return value of -1 is treated as an error.
 */
void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, const void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code)
{
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int j, k, st_index=0;
    off_t err_lseek=-1;
    ssize_t err=-1;
    ADIO_Offset fwr_size=0, bwr_size, new_bwr_size, new_fwr_size, i_offset, num;
    ADIO_Offset bufsize, n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, size, sum;
    ADIO_Offset abs_off_in_filetype=0, size_in_filetype;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, indx;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    int flag, err_flag=0;
    static char myname[] = "ADIOI_NOLOCK_WRITESTRIDED";
#ifdef IO_DEBUG
    int rank,nprocs;
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_INTERN,
                                           "Atomic mode set in I/O function", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* an empty filetype means there is nothing that can be written */
    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

#ifdef IO_DEBUG
    MPI_Comm_rank(fd->comm, &rank);
    MPI_Comm_size(fd->comm, &nprocs);
#endif

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
        char *combine_buf, *combine_buf_ptr;
        ADIO_Offset combine_buf_remain;

        /* noncontiguous in memory, contiguous in file.  Small pieces are
         * packed into a "combine buffer" and written out together; pieces
         * at least as large as the buffer are written straight from the
         * user buffer. */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        /* allocate our "combine buffer" to pack data into before writing */
        combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
        combine_buf_ptr = combine_buf;
        combine_buf_remain = fd->hints->ind_wr_buffer_size;

        /* seek to the right spot in the file.  On success lseek(SEEK_SET)
         * returns the requested position, so off can be set up front and
         * stays meaningful even if the seek fails. */
        if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
            off = fd->disp + etype_size * offset;
        else
            off = fd->fp_ind;
        err_lseek = lseek(fd->fd_sys, off, SEEK_SET);
        if (err_lseek == -1) err_flag = 1;

        /* loop through all the flattened pieces.  combine into buffer until
         * no more will fit, then write.
         *
         * special case of a given piece being bigger than the combine buffer
         * is also handled.
         */
        for (j=0; j<count; j++) {
            int i;
            for (i=0; i<flat_buf->count; i++) {
                /* Flush whenever this piece will NOT be copied into the
                 * buffer (i.e. it takes the direct-write path below) and the
                 * buffer holds data.  The comparison must be >=, matching
                 * the direct-write test: with > a piece that exactly equals
                 * the remaining space was written directly while older data
                 * was still buffered, putting bytes in the file out of
                 * order. */
                if (flat_buf->blocklens[i] >= combine_buf_remain && combine_buf != combine_buf_ptr) {
                    /* there is data in the buffer; write out the buffer so far */
#ifdef IO_DEBUG
                    printf("[%d/%d] nc mem c file (0) writing loc = %Ld sz = %Ld\n",
                           rank, nprocs, off,
                           fd->hints->ind_wr_buffer_size-combine_buf_remain);
#endif
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
                    err = write(fd->fd_sys, combine_buf,
                                fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
                    if (err == -1) err_flag = 1;

                    /* reset our buffer info */
                    combine_buf_ptr = combine_buf;
                    combine_buf_remain = fd->hints->ind_wr_buffer_size;
                }

                /* TODO: heuristic for when to not bother to use combine buffer? */
                if (flat_buf->blocklens[i] >= combine_buf_remain) {
                    /* special case: blocklen is as big as or bigger than the
                     * (now empty) combine buf; write directly */
#ifdef IO_DEBUG
                    printf("[%d/%d] nc mem c file (1) writing loc = %Ld sz = %d\n",
                           rank, nprocs, off, flat_buf->blocklens[i]);
#endif
                    ADIOI_Assert(flat_buf->blocklens[i] == (unsigned)flat_buf->blocklens[i]);
                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]) == (ADIO_Offset)((MPIR_Upint)buf + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]));
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
                    err = write(fd->fd_sys,
                                ((char *) buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i],
                                (unsigned)flat_buf->blocklens[i]);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
                    if (err == -1) err_flag = 1;
                    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
                }
                else {
                    /* copy more data into combine buffer */
                    memcpy(combine_buf_ptr,
                           ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
                           flat_buf->blocklens[i]);
                    combine_buf_ptr += flat_buf->blocklens[i];
                    combine_buf_remain -= flat_buf->blocklens[i];
                    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
                }
            }
        }

        if (combine_buf_ptr != combine_buf) {
            /* data left in buffer to write */
#ifdef IO_DEBUG
            printf("[%d/%d] nc mem c file (2) writing loc = %Ld sz = %Ld\n",
                   rank, nprocs, off,
                   fd->hints->ind_wr_buffer_size-combine_buf_remain);
#endif
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
            err = write(fd->fd_sys, combine_buf,
                        fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
            if (err == -1) err_flag = 1;
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(combine_buf);

        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    } /* if (!buftype_is_contig && filetype_is_contig) ... */

    else {  /* noncontiguous in file */

        /* split up into several contiguous writes */

        /* find starting location in the file */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* walk filetype instances until the block whose end is at or
             * beyond the individual file pointer is found */
            offset = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            while (!flag) {
                int i;
                n_filetypes++;
                for (i=0; i<flat_file->count; i++) {
                    if (disp + flat_file->indices[i] +
                        n_filetypes*(ADIO_Offset)filetype_extent +
                        flat_file->blocklens[i] >= offset) {
                        st_index = i;
                        /* bytes left in this block starting at offset */
                        fwr_size = disp + flat_file->indices[i] +
                            n_filetypes*(ADIO_Offset)filetype_extent +
                            flat_file->blocklens[i] - offset;
                        flag = 1;
                        break;
                    }
                }
            }
        }
        else {
            int i;
            /* translate the etype-relative offset into (whole filetypes,
             * bytes into the next filetype) */
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + n_filetypes*(ADIO_Offset)filetype_extent +
                abs_off_in_filetype;
        }

        if (buftype_is_contig && !filetype_is_contig) {

            /* contiguous in memory, noncontiguous in file. should be the
             * most common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            fwr_size = ADIOI_MIN(fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
#endif
#ifdef IO_DEBUG
                    printf("[%d/%d] c mem nc file writing loc = %Ld sz = %d\n",
                           rank, nprocs, off, fwr_size);
#endif
                    err_lseek = lseek(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
#endif
                    if (err_lseek == -1) err_flag = 1;
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
                    err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
                    if (err == -1) err_flag = 1;
                }
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    n_filetypes*(ADIO_Offset)filetype_extent) {
                    /* did not reach end of contiguous block in filetype.
                     * no more I/O needed. off is incremented by fwr_size. */
                    off += fwr_size;
                }
                else {
                    /* advance to the next block, wrapping into the next
                     * filetype instance when the last block is passed */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }
                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            indx = flat_buf->indices[0];
            j = st_index;
            off = offset;
            bwr_size = flat_buf->blocklens[0];

            while (num < bufsize) {
                /* write the largest piece that is contiguous both in the
                 * file block and in the memory block */
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
#ifdef IO_DEBUG
                    printf("[%d/%d] nc mem nc file writing loc = %Ld sz = %d\n",
                           rank, nprocs, off, size);
#endif
                    ADIOI_Assert(size == (size_t) size);
                    ADIOI_Assert(off == (off_t) off);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
                    /* capture the seek result: testing the stale write
                     * status here flagged a bogus error on the very first
                     * piece because err starts out as -1 */
                    err_lseek = lseek(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
                    if (err_lseek == -1) err_flag = 1;
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
                    err = write(fd->fd_sys, ((char *) buf) + indx, size);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
                    if (err == -1) err_flag = 1;
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        indx += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */
                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    indx = buftype_extent*(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status: it reports the full
       request size.  The right way is to keep track of how much data was
       actually written by the write() calls above. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*
 * ADIOI_Read_and_exch - read this process's share of a collective request
 * in pieces and ship the data to the processes that asked for it.
 *
 * Read in sizes of no more than coll_bufsize, an info parameter.
 * Send data to appropriate processes.
 * Place recd. data in user buf.
 * The idea is to reduce the amount of extra memory required for
 * collective I/O. If all data were read all at once, which is much
 * easier, it would require temp space more than the size of user_buf,
 * which is often unacceptable. For example, to read a distributed
 * array from a file, where each local array is 8Mbytes, requiring
 * at least another 8Mbytes of temp space is unacceptable.
 *
 * others_req[] is modified in place (offsets/lens of partially satisfied
 * requests are advanced, mem_ptrs are filled in), and fd->io_buf may be
 * reallocated when data has to be carried over to the next iteration.
 *
 * error_code is MPI_SUCCESS unless ADIO_ReadContig fails; only I/O errors
 * are currently reported.  On such a failure the function returns at once
 * after releasing its temporary arrays.
 * NOTE(review): the early exit skips the remaining ADIOI_R_Exchange_data
 * phases; confirm how the other ranks are expected to cope with that.
 */
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
                                datatype, int nprocs,
                                int myrank, ADIOI_Access
                                *others_req, ADIO_Offset *offset_list,
                                ADIO_Offset *len_list, int contig_access_count,
                                ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                                int *buf_idx, int *error_code)
{
    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of reads of size coll_bufsize
       to be done by each process and the max among all processes.
       That gives the no. of communication phases as well.
       coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
        for (j=0; j<others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                                          + others_req[i].lens[j] - 1));
        }

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
        /* this process does no I/O. */
        ntimes = 0;
    }
    else {
        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
        ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
        /* read buf of size coll_bufsize (or less) */
        /* go through all others_req and check if any are satisfied
           by the current read */

        /* since MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order, I can maintain a pointer
           (curr_offlen_ptr) to
           current off-len pair for each process in others_req and scan
           further only from there. There is still a problem of filetypes
           such as:  (1, 2, 3 are not process nos. They are just numbers for
           three chunks of data, specified by a filetype.)

                   1  -------!--
                   2    -----!----
                   3       --!-----

           where ! indicates where the current read_size limitation cuts
           through the filetype.  I resolve this by reading up to !, but
           filling the communication buffer only for 1. I copy the portion
           left over for 2 into a tmp_buf for use in the next
           iteration. i.e., 2 and 3 will be satisfied in the next
           iteration. This simplifies filling in the user's buf at the
           other end, as only one off-len pair with incomplete data
           will be sent. I also don't need to send the individual
           offsets and lens along with the data, as the data is being
           sent in a particular order. */

        /* off = start offset in the file for the data actually read in
                 this iteration
           size = size of data read corresponding to off
           real_off = off minus whatever data was retained in memory from
                 previous iteration for cases like 2, 3 illustrated above
           real_size = size plus the extra corresponding to real_off
           req_off = off in file for a particular contiguous request
                     minus what was satisfied in previous iteration
           req_len = size corresponding to req_off */

        size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
        real_off = off - for_curr_iter;
        real_size = size + for_curr_iter;

        for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
        for_next_iter = 0;

        for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
            DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count);
#endif
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count;
                     j++) {
                    if (partial_send[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_send[i];
                        req_len = others_req[i].lens[j] -
                            partial_send[i];
                        partial_send[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < real_off + real_size) {
                        /* request starts inside the window covered by
                           this iteration */
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
                        MPI_Address(read_buf+req_off-real_off,
                                    &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                        send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                        (ADIO_Offset)(unsigned)req_len));

                        if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
                            /* only part of this request fits; remember
                               how much was sent */
                            partial_send[i] = (int) (real_off + real_size - req_off);
                            if ((j+1 < others_req[i].count) &&
                                (others_req[i].offsets[j+1] <
                                 real_off+real_size)) {
                                /* this is the case illustrated in the
                                   figure above. */
                                for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size - others_req[i].offsets[j+1]);
                                /* max because it must cover requests
                                   from different processes */
                            }
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        /* read only if at least one process gets data this iteration */
        flag = 0;
        for (i=0; i<nprocs; i++)
            if (count[i]) flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int)size);
            ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, error_code);
            /* release the bookkeeping arrays instead of leaking them */
            if (*error_code != MPI_SUCCESS) goto fn_exit;
        }

        for_curr_iter = for_next_iter;

        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count,
                              start_pos, partial_send, recd_from_proc, nprocs,
                              myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req,
                              m, buftype_extent, buf_idx);

        if (for_next_iter) {
            /* carry the unsent tail over to the front of a fresh, larger
               io_buf for the next iteration */
            tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
            ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
            memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
            ADIOI_Free(fd->io_buf);
            fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
            memcpy(fd->io_buf, tmp_buf, for_next_iter);
            read_buf = fd->io_buf;
            ADIOI_Free(tmp_buf);
        }

        off += size;
        done += size;
    }

    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++)
        /* nothing to send, but check for recv. */
        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count,
                              start_pos, partial_send, recd_from_proc, nprocs,
                              myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, m,
                              buftype_extent, buf_idx);

fn_exit:
    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, sum, size_in_filetype; int i, j, k, st_index=0; int n_etypes_in_filetype; ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf; unsigned bufsize, writebuf_len, write_sz; ADIO_Status status1; ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len; int stripe_size; static char myname[] = "ADIOI_LUSTRE_WriteStrided"; if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on writes, use naive * approach instead. 
*/ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if (!filetype_size) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get striping info */ stripe_size = fd->hints->striping_unit; /* Different buftype to different filetype */ if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = start_off + bufsize - 1; /* write stripe size buffer each time */ writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); for (j = 0; j < count; j++) { for (i = 0; i < flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } } /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); if (*error_code != MPI_SUCCESS) { ADIOI_Free(writebuf); return; } ADIOI_Free(writebuf); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ 
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i = 0; i < flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file * contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { req_off = start_off; req_len = bufsize; end_offset = start_off + bufsize - 1; writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; userbuf_off = 0; ADIOI_BUFFERED_WRITE_WITHOUT_READ /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif ADIOI_Free(writebuf); return; } /* Calculate end_offset, the last byte-offset that will be accessed. 
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); writebuf_off = 0; writebuf_len = 0; writebuf = (char *) ADIOI_Malloc(stripe_size); memset(writebuf, -1, stripe_size); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 
1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, char **recv_buf, ADIO_Offset *offset_list, ADIO_Offset *len_list, unsigned *recv_size, MPI_Request *requests, MPI_Status *statuses, int *recd_from_proc, int nprocs, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, MPI_Aint buftype_extent) { /* this function is only called if buftype is not contig */ int i, p, flat_buf_idx; ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size; int n_buftypes; ADIO_Offset off, len, rem_len, user_buf_idx; /* Not sure unsigned is necessary, but it makes the math safer */ unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx; ADIOI_UNREFERENCED_ARG(requests); ADIOI_UNREFERENCED_ARG(statuses); /* curr_from_proc[p] = amount of data recd from proc. p that has already been accounted for so far done_from_proc[p] = amount of data already recd from proc. p and filled into user buffer in previous iterations user_buf_idx = current location in user buffer recv_buf_idx[p] = current location in recv_buf of proc. p */ curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned)); done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned)); recv_buf_idx = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned)); for (i=0; i < nprocs; i++) { recv_buf_idx[i] = curr_from_proc[i] = 0; done_from_proc[i] = recd_from_proc[i]; } user_buf_idx = flat_buf->indices[0]; flat_buf_idx = 0; n_buftypes = 0; flat_buf_sz = flat_buf->blocklens[0]; /* flat_buf_idx = current index into flattened buftype flat_buf_sz = size of current contiguous component in flattened buf */ for (i=0; i<contig_access_count; i++) { off = offset_list[i]; rem_len = len_list[i]; /* this request may span the file domains of more than one process */ while (rem_len != 0) { len = rem_len; /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no * longer than the single region that processor "p" is responsible * for. 
*/ p = ADIOI_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end); if (recv_buf_idx[p] < recv_size[p]) { if (curr_from_proc[p]+len > done_from_proc[p]) { if (done_from_proc[p] > curr_from_proc[p]) { size = ADIOI_MIN(curr_from_proc[p] + len - done_from_proc[p], recv_size[p]-recv_buf_idx[p]); buf_incr = done_from_proc[p] - curr_from_proc[p]; ADIOI_BUF_INCR buf_incr = curr_from_proc[p]+len-done_from_proc[p]; ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size)); curr_from_proc[p] = done_from_proc[p] + size; ADIOI_BUF_COPY } else { size = ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]); buf_incr = len; ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + size)); curr_from_proc[p] += (unsigned) size; ADIOI_BUF_COPY } } else { ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len)); curr_from_proc[p] += (unsigned) len; buf_incr = len; ADIOI_BUF_INCR } }
/*
 * ADIOI_GEN_ReadStrided_naive - read a noncontiguous request one contiguous
 * piece at a time.
 *
 * Every contiguous piece of the request is read with its own
 * ADIO_ReadContig() call at an explicit offset; no buffering or data sieving
 * is attempted.
 *
 * Parameters:
 *   fd            - file to read from; supplies the filetype, etype size,
 *                   displacement, individual file pointer and atomicity flag
 *   buf           - user buffer receiving the data
 *   count         - number of buftype instances to read
 *   buftype       - memory datatype describing the layout of buf
 *   file_ptr_type - ADIO_INDIVIDUAL to start at fd->fp_ind; any other value
 *                   makes 'offset' the starting position
 *   offset        - starting position in units of etype relative to the
 *                   filetype (ignored for ADIO_INDIVIDUAL)
 *   status        - filled through MPIR_Status_set_bytes() when
 *                   HAVE_STATUS_SET_BYTES is defined
 *   error_code    - MPI_SUCCESS, or the error code left by the first failing
 *                   ADIO_ReadContig()
 *
 * When a read fails the function stops at once: the byte-range lock taken for
 * atomic mode is released and the flattened representation of buftype is
 * discarded before returning, so an I/O error neither leaves the file region
 * locked nor leaves a stale flattened datatype behind.  The individual file
 * pointer, fd->fp_sys_posn and status are left untouched in that case.
 */
void ADIOI_GEN_ReadStrided_naive(ADIO_File fd, void *buf, int count,
                                 MPI_Datatype buftype, int file_ptr_type,
                                 ADIO_Offset offset, ADIO_Status *status,
                                 int *error_code)
{
    /* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int brd_size, frd_size=0, b_index;
    int bufsize, size, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size, req_len;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    int need_lock;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, start_off=0;
    ADIO_Status status1;

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(buftype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* a zero-sized filetype means there is nothing to read */
    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(buftype, &buftype_size);
    MPI_Type_extent(buftype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    /* if atomicity is true, the accessed region is locked (exclusive) for
     * the duration of the reads; PIOFS and PVFS are excluded */
    need_lock = (fd->atomicity) && (fd->file_system != ADIO_PIOFS) &&
                (fd->file_system != ADIO_PVFS);

    /* contiguous in buftype and filetype is handled elsewhere */

    if (!buftype_is_contig && filetype_is_contig) {
        int b_count;
        /* noncontiguous in memory, contiguous in file. */

        ADIOI_Flatten_datatype(buftype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != buftype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;

        if (need_lock) {
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        /* for each region in the buffer, grab the data and put it in
         * place
         */
        for (b_count=0; b_count < count; b_count++) {
            for (b_index=0; b_index < flat_buf->count; b_index++) {
                userbuf_off = b_count*buftype_extent +
                              flat_buf->indices[b_index];
                req_off = off;
                req_len = flat_buf->blocklens[b_index];

                ADIO_ReadContig(fd,
                                (char *) buf + userbuf_off,
                                req_len,
                                MPI_BYTE,
                                ADIO_EXPLICIT_OFFSET,
                                req_off,
                                &status1,
                                error_code);
                /* the lock is held here, so leave through the cleanup path */
                if (*error_code != MPI_SUCCESS) goto fn_fail;

                /* off is (potentially) used to save the final offset later */
                off += flat_buf->blocklens[b_index];
            }
        }

        if (need_lock) {
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    }
    else {  /* noncontiguous in file */
        int f_index, st_frd_size, st_index = 0, st_n_filetypes;
        int flag;

        /* First we're going to calculate a set of values for use in all
         * the noncontiguous in file cases:
         * start_off - starting byte position of data in file
         * end_offset - last byte offset to be accessed in the file
         * st_n_filetypes - how far into the file we start in terms of
         *                  whole filetypes
         * st_index - index of block in first filetype that we will be
         *            starting in (?)
         * st_frd_size - size of the data in the first filetype block
         *               that we will read (accounts for being part-way
         *               into reading this block of the filetype)
         */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            start_off = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            /* scan filetype instances until a block ending at or after
             * start_off is found */
            while (!flag) {
                n_filetypes++;
                for (f_index=0; f_index < flat_file->count; f_index++) {
                    if (disp + flat_file->indices[f_index] +
                        (ADIO_Offset) n_filetypes*filetype_extent +
                        flat_file->blocklens[f_index] >= start_off)
                    {
                        /* this block contains our starting position */

                        st_index = f_index;
                        frd_size = (int) (disp + flat_file->indices[f_index] +
                                          (ADIO_Offset) n_filetypes*filetype_extent +
                                          flat_file->blocklens[f_index] - start_off);
                        flag = 1;
                        break;
                    }
                }
            }
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = (int) (offset / n_etypes_in_filetype);
            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (f_index=0; f_index < flat_file->count; f_index++) {
                sum += flat_file->blocklens[f_index];
                if (sum > size_in_filetype) {
                    st_index = f_index;
                    frd_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[f_index] +
                                          size_in_filetype -
                                          (sum - flat_file->blocklens[f_index]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            start_off = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                        abs_off_in_filetype;
        }

        st_frd_size = frd_size;
        st_n_filetypes = n_filetypes;

        /* start_off, st_n_filetypes, st_index, and st_frd_size are
         * all calculated at this point
         */

        /* Calculate end_offset, the last byte-offset that will be accessed.
         * e.g., if start_off=0 and 100 bytes to be read, end_offset=99
         */
        userbuf_off = 0;
        f_index = st_index;
        off = start_off;
        frd_size = ADIOI_MIN(st_frd_size, bufsize);
        while (userbuf_off < bufsize) {
            userbuf_off += frd_size;
            end_offset = off + frd_size - 1;

            if (f_index < (flat_file->count - 1)) f_index++;
            else {
                f_index = 0;
                n_filetypes++;
            }

            off = disp + flat_file->indices[f_index] +
                  (ADIO_Offset) n_filetypes*filetype_extent;
            frd_size = ADIOI_MIN(flat_file->blocklens[f_index],
                                 bufsize-(int)userbuf_off);
        }

        /* End of calculations.  At this point the following values have
         * been calculated and are ready for use:
         * - start_off
         * - end_offset
         * - st_n_filetypes
         * - st_index
         * - st_frd_size
         */

        if (need_lock) {
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the
             * most common case.
             */

            userbuf_off = 0;
            f_index = st_index;
            off = start_off;
            n_filetypes = st_n_filetypes;
            frd_size = ADIOI_MIN(st_frd_size, bufsize);

            /* while there is still space in the buffer, read more data */
            while (userbuf_off < bufsize) {
                if (frd_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       frd_size = 0. save system call in such cases */
                    req_off = off;
                    req_len = frd_size;

                    ADIO_ReadContig(fd,
                                    (char *) buf + userbuf_off,
                                    req_len,
                                    MPI_BYTE,
                                    ADIO_EXPLICIT_OFFSET,
                                    req_off,
                                    &status1,
                                    error_code);
                    /* the lock is held here, so leave through the cleanup path */
                    if (*error_code != MPI_SUCCESS) goto fn_fail;
                }
                userbuf_off += frd_size;

                if (off + frd_size < disp + flat_file->indices[f_index] +
                    flat_file->blocklens[f_index] +
                    (ADIO_Offset) n_filetypes*filetype_extent)
                {
                    /* did not reach end of contiguous block in filetype.
                     * no more I/O needed. off is incremented by frd_size.
                     *
                     * important that this value be correct, as it is
                     * used to set the offset in the fd near the end of
                     * this function.
                     */
                    off += frd_size;
                }
                else {
                    if (f_index < (flat_file->count - 1)) f_index++;
                    else {
                        f_index = 0;
                        n_filetypes++;
                    }
                    off = disp + flat_file->indices[f_index] +
                          (ADIO_Offset) n_filetypes*filetype_extent;
                    frd_size = ADIOI_MIN(flat_file->blocklens[f_index],
                                         bufsize-(int)userbuf_off);
                }
            }
        }
        else {
            int i, tmp_bufsize = 0;
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(buftype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != buftype) flat_buf = flat_buf->next;

            b_index = buf_count = 0;
            i = (int) (flat_buf->indices[0]);
            f_index = st_index;
            off = start_off;
            n_filetypes = st_n_filetypes;
            frd_size = st_frd_size;
            brd_size = flat_buf->blocklens[0];

            /* while we haven't read size * count bytes, keep going */
            while (tmp_bufsize < bufsize) {
                int new_brd_size = brd_size, new_frd_size = frd_size;

                /* each read is bounded by what is left of the current file
                 * block and of the current memory block */
                size = ADIOI_MIN(frd_size, brd_size);
                if (size) {
                    req_off = off;
                    req_len = size;
                    userbuf_off = i;

                    ADIO_ReadContig(fd,
                                    (char *) buf + userbuf_off,
                                    req_len,
                                    MPI_BYTE,
                                    ADIO_EXPLICIT_OFFSET,
                                    req_off,
                                    &status1,
                                    error_code);
                    /* the lock is held here, so leave through the cleanup path */
                    if (*error_code != MPI_SUCCESS) goto fn_fail;
                }

                if (size == frd_size) {
                    /* reached end of contiguous block in file */
                    if (f_index < (flat_file->count - 1)) f_index++;
                    else {
                        f_index = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[f_index] +
                          (ADIO_Offset) n_filetypes*filetype_extent;

                    new_frd_size = flat_file->blocklens[f_index];
                    if (size != brd_size) {
                        i += size;
                        new_brd_size -= size;
                    }
                }

                if (size == brd_size) {
                    /* reached end of contiguous block in memory */

                    b_index = (b_index + 1)%flat_buf->count;
                    buf_count++;
                    i = (int) (buftype_extent*(buf_count/flat_buf->count) +
                               flat_buf->indices[b_index]);
                    new_brd_size = flat_buf->blocklens[b_index];
                    if (size != frd_size) {
                        off += size;
                        new_frd_size -= size;
                    }
                }
                tmp_bufsize += size;
                frd_size = new_frd_size;
                brd_size = new_brd_size;
            }
        }

        /* unlock the file region if we locked it */
        if (need_lock) {
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    } /* end of (else noncontiguous in file) */

    fd->fp_sys_posn = -1;   /* mark it as invalid. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, buftype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
     * keep track of how much data was actually read and placed in buf
     */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(buftype);
    return;

fn_fail:
    /* A read failed while the region was (possibly) locked and buftype was
     * (possibly) flattened: undo both, leaving *error_code as set by
     * ADIO_ReadContig().  Every jump here happens after the lock was taken
     * and, for a noncontiguous buftype, after ADIOI_Flatten_datatype(). */
    if (need_lock) {
        ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
    }
    if (!buftype_is_contig) ADIOI_Delete_flattened(buftype);
}
/*
 * ADIOI_Iread_and_exch_l1_begin - start one iteration of the nonblocking
 * read-and-exchange loop.
 *
 * If all vars->ntimes iterations are done, control passes to
 * ADIOI_Iread_and_exch_reset().  Otherwise the function works out, for every
 * process, which of its outstanding off-len pairs fall inside the window read
 * in this iteration, fills count[], send_size[], partial_send[], start_pos[]
 * and curr_offlen_ptr[], prepares the argument block for
 * ADIOI_R_Iexchange_data(), and then either posts the contiguous read (when
 * some process gets data from this window) or goes straight to the exchange.
 */
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIOI_R_Iexchange_data_vars *red_vars = NULL;
    ADIO_File fd;
    ADIOI_Access *others_req;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_off, real_size, read_end;
    ADIO_Offset req_off, avail, for_next_iter;
    int nprocs, proc, idx, req_len, any_data;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;
    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* Read a buffer of at most coll_bufsize bytes, then look through all
     * others_req entries for requests that this read satisfies.
     *
     * MPI guarantees that displacements in filetypes are monotonically
     * nondecreasing, so one cursor per process (curr_offlen_ptr) into its
     * off-len list is enough: scanning always resumes from there.
     *
     * Filetypes like the following are still awkward (1, 2, 3 are not
     * process numbers, just three chunks of data named by a filetype):
     *
     *     1  -------!--
     *     2     -----!----
     *     3        --!-----
     *
     * where ! marks where the current read_size limit cuts through the
     * filetype.  The read goes up to !, but the communication buffer is
     * filled only for 1.  The part left over for 2 is copied into a tmp_buf
     * for the next iteration, i.e. 2 and 3 are satisfied next time round.
     * That keeps filling the user's buf on the other end simple, because
     * only one off-len pair with incomplete data is ever sent, and the
     * individual offsets and lens need not travel with the data since the
     * data is sent in a known order.
     */

    /* off       = start offset in the file of the data actually read in
     *             this iteration
     * size      = size of the data read, corresponding to off
     * real_off  = off minus whatever was retained in memory from the
     *             previous iteration (cases 2 and 3 above)
     * real_size = size plus the extra corresponding to real_off
     * req_off   = file offset of one contiguous request minus what was
     *             satisfied in the previous iteration
     * req_len   = length corresponding to req_off
     */
    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;
    read_end = real_off + real_size;    /* first byte past this window */

    vars->size = size;
    vars->real_size = real_size;

    for (proc = 0; proc < nprocs; proc++) {
        send_size[proc] = 0;
        count[proc] = 0;
    }
    for_next_iter = 0;

    for (proc = 0; proc < nprocs; proc++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, proc, others_req[proc].count);
#endif
        if (!others_req[proc].count)
            continue;

        start_pos[proc] = curr_offlen_ptr[proc];
        for (idx = curr_offlen_ptr[proc]; idx < others_req[proc].count; idx++) {
            if (partial_send[proc]) {
                /* this request may have been partially satisfied in the
                 * previous iteration: skip the part already sent and
                 * rewrite the off-len pair to match */
                req_off = others_req[proc].offsets[idx] + partial_send[proc];
                req_len = others_req[proc].lens[idx] - partial_send[proc];
                partial_send[proc] = 0;
                others_req[proc].offsets[idx] = req_off;
                others_req[proc].lens[idx] = req_len;
            }
            else {
                req_off = others_req[proc].offsets[idx];
                req_len = others_req[proc].lens[idx];
            }

            /* the first request starting beyond the window ends the scan */
            if (req_off >= read_end)
                break;

            count[proc]++;
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
            MPI_Address(read_buf + req_off - real_off,
                        &(others_req[proc].mem_ptrs[idx]));
            ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));

            /* bytes of this request that lie inside the window */
            avail = read_end - req_off;
            send_size[proc] += (int)(ADIOI_MIN(avail,
                                               (ADIO_Offset)(unsigned)req_len));

            if (avail < (ADIO_Offset)(unsigned)req_len) {
                /* the request sticks out of the window: remember how much
                 * of it was sent and stop scanning this process */
                partial_send[proc] = (int)avail;
                if ((idx + 1 < others_req[proc].count) &&
                    (others_req[proc].offsets[idx + 1] < read_end)) {
                    /* this is the case illustrated in the figure above;
                     * max because it must cover requests from different
                     * processes */
                    for_next_iter = ADIOI_MAX(for_next_iter,
                                              read_end - others_req[proc].offsets[idx + 1]);
                }
                break;
            }
        }
        curr_offlen_ptr[proc] = idx;
    }
    vars->for_next_iter = for_next_iter;

    /* is there anything at all to read in this iteration? */
    any_data = 0;
    for (proc = 0; proc < nprocs; proc++) {
        if (count[proc]) {
            any_data = 1;
            break;
        }
    }

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
        1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;

    red_vars->fd = fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = count;
    red_vars->start_pos = start_pos;
    red_vars->partial_send = partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (!any_data) {
        /* nothing to read: go directly to the exchange step */
        ADIOI_R_Iexchange_data(nbc_req, error_code);
        return;
    }

    /* post the read; the data goes after the bytes kept from last time */
    ADIOI_Assert(size == (int)size);
    ADIO_IreadContig(fd, read_buf + vars->for_curr_iter, (int)size,
                     MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                     &vars->req2, error_code);

    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
}
/*
 * ADIOI_PIOFS_WriteStrided - noncontiguous write for PIOFS.
 *
 * PIOFS does not support file locking, so buffered writes as on Unix cannot
 * be used.  Data is written either with writev() in batches of at most 16
 * iovecs (noncontiguous memory, contiguous file) or with one llseek()/write()
 * pair per contiguous piece (noncontiguous file).
 *
 * Parameters:
 *   fd            - file to write; the process is aborted if fd->atomicity
 *                   is set
 *   buf           - user buffer holding the data
 *   count         - number of datatype instances to write
 *   datatype      - memory datatype describing the layout of buf
 *   file_ptr_type - ADIO_EXPLICIT_OFFSET or ADIO_INDIVIDUAL
 *   offset        - starting position in units of etype relative to the
 *                   filetype (for an explicit offset)
 *   status        - filled through MPIR_Status_set_bytes() when
 *                   HAVE_STATUS_SET_BYTES is defined
 *   error_code    - MPI_SUCCESS, or an error code if any writev()/write()
 *                   call returned -1
 */
void ADIOI_PIOFS_WriteStrided(ADIO_File fd, void *buf, int count,
                              MPI_Datatype datatype, int file_ptr_type,
                              ADIO_Offset offset, ADIO_Status *status,
                              int *error_code)
{
    ADIOI_Flatlist_node *mem_flat, *file_flat;
    int rc = -1, io_failed = 0;
    int file_left = 0, first_blk = 0;
    int bufsize, n_filetypes;
    int filetype_size, etype_size, buftype_size;
    int buftype_is_contig, filetype_is_contig;
    MPI_Aint filetype_extent, buftype_extent;
    ADIO_Offset off, disp;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_PIOFS_WRITESTRIDED";
#endif

    if (fd->atomicity) {
        FPRINTF(stderr, "ROMIO cannot guarantee atomicity of noncontiguous accesses in atomic mode, as PIOFS doesn't support file locking. Use nonatomic mode and its associated semantics.\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* a zero-sized filetype means there is nothing to write */
    MPI_Type_size(fd->filetype, &filetype_size);
    if (filetype_size == 0) {
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file: gather with writev.
         * There is a limit of 16 on the number of iovecs for readv/writev! */
        enum { PIOFS_IOV_BATCH = 16 };
        struct iovec *iov;
        int rep, blk, n_iov;

        ADIOI_Flatten_datatype(datatype);
        for (mem_flat = ADIOI_Flatlist; mem_flat->type != datatype;
             mem_flat = mem_flat->next)
            ;

        iov = (struct iovec *) ADIOI_Malloc(PIOFS_IOV_BATCH*sizeof(struct iovec));

        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fd->disp + etype_size * offset;
            llseek(fd->fd_sys, off, SEEK_SET);
        }
        else {
            off = llseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        }

        n_iov = 0;
        for (rep = 0; rep < count; rep++) {
            for (blk = 0; blk < mem_flat->count; blk++) {
                iov[n_iov].iov_base = ((char *) buf) + rep*buftype_extent +
                                      mem_flat->indices[blk];
                iov[n_iov].iov_len = mem_flat->blocklens[blk];

                off += mem_flat->blocklens[blk];

                /* flush as soon as the batch is full */
                n_iov++;
                if (n_iov == PIOFS_IOV_BATCH) {
                    rc = writev(fd->fd_sys, iov, PIOFS_IOV_BATCH);
                    if (rc == -1) io_failed = 1;
                    n_iov = 0;
                }
            }
        }

        /* flush the last, partially filled batch */
        if (n_iov != 0) {
            rc = writev(fd->fd_sys, iov, n_iov);
            if (rc == -1) io_failed = 1;
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(iov);

        if (!io_failed) {
            *error_code = MPI_SUCCESS;
        }
        else {
#ifdef MPICH2
            *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
            *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
#endif
        }
    } /* if (!buftype_is_contig && filetype_is_contig) ... */
    else {
        /* noncontiguous in file: split up into several contiguous writes */
        ADIO_Offset start_off;
        int blk, found;

        /* find starting location in the file */

        /* filetype already flattened in ADIO_Open */
        for (file_flat = ADIOI_Flatlist; file_flat->type != fd->filetype;
             file_flat = file_flat->next)
            ;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            ADIO_Offset blk_end;

            start_off = fd->fp_ind; /* in bytes */

            /* walk filetype instances until a block ending at or after the
             * file pointer is found */
            found = 0;
            n_filetypes = 0;
            for (;;) {
                for (blk = 0; blk < file_flat->count; blk++) {
                    blk_end = disp + file_flat->indices[blk] +
                              (ADIO_Offset) n_filetypes*filetype_extent +
                              file_flat->blocklens[blk];
                    if (blk_end >= start_off) {
                        first_blk = blk;
                        file_left = blk_end - start_off;
                        found = 1;
                        break;
                    }
                }
                if (found) break;
                n_filetypes++;
            }
        }
        else {
            int n_etypes_in_filetype, etype_in_filetype;
            int size_in_filetype, sum;
            ADIO_Offset abs_off_in_filetype = 0;

            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = (int) (offset / n_etypes_in_filetype);
            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
            size_in_filetype = etype_in_filetype * etype_size;

            /* find the block holding byte size_in_filetype of the filetype */
            sum = 0;
            for (blk = 0; blk < file_flat->count; blk++) {
                sum += file_flat->blocklens[blk];
                if (sum > size_in_filetype) {
                    first_blk = blk;
                    file_left = sum - size_in_filetype;
                    abs_off_in_filetype = file_flat->indices[blk] +
                                          size_in_filetype -
                                          (sum - file_flat->blocklens[blk]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            start_off = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                        abs_off_in_filetype;
        }

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the
               most common case. */
            int done = 0;
            int fblk = first_blk;

            off = start_off;
            file_left = ADIOI_MIN(file_left, bufsize);
            while (done < bufsize) {
                if (file_left) {
                    /* TYPE_UB and TYPE_LB can result in
                       file_left = 0. save system call in such cases */
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    llseek(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    rc = write(fd->fd_sys, ((char *) buf) + done, file_left);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (rc == -1) io_failed = 1;
                }
                done += file_left;

                if (off + file_left < disp + file_flat->indices[fblk] +
                    file_flat->blocklens[fblk] +
                    (ADIO_Offset) n_filetypes*filetype_extent) {
                    /* did not reach end of contiguous block in filetype.
                       no more I/O needed. off is incremented by file_left. */
                    off += file_left;
                }
                else {
                    /* move on to the next block of the filetype */
                    if (fblk < (file_flat->count - 1)) {
                        fblk++;
                    }
                    else {
                        fblk = 0;
                        n_filetypes++;
                    }
                    off = disp + file_flat->indices[fblk] +
                          (ADIO_Offset) n_filetypes*filetype_extent;
                    file_left = ADIOI_MIN(file_flat->blocklens[fblk],
                                          bufsize-done);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */
            int mblk = 0, done = 0, buf_count = 0;
            int fblk = first_blk;
            int mem_left, piece, next_file_left, next_mem_left;
            int file_blk_done, mem_blk_done;
            MPI_Aint mem_off;

            ADIOI_Flatten_datatype(datatype);
            for (mem_flat = ADIOI_Flatlist; mem_flat->type != datatype;
                 mem_flat = mem_flat->next)
                ;

            mem_off = mem_flat->indices[0];
            off = start_off;
            mem_left = mem_flat->blocklens[0];

            while (done < bufsize) {
                /* each write is bounded by what is left of the current file
                 * block and of the current memory block */
                piece = ADIOI_MIN(file_left, mem_left);
                if (piece) {
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    llseek(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    rc = write(fd->fd_sys, ((char *) buf) + mem_off, piece);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (rc == -1) io_failed = 1;
                }

                next_file_left = file_left;
                next_mem_left = mem_left;
                file_blk_done = (piece == file_left);
                mem_blk_done = (piece == mem_left);

                if (file_blk_done) {
                    /* reached end of contiguous block in file */
                    if (fblk < (file_flat->count - 1)) {
                        fblk++;
                    }
                    else {
                        fblk = 0;
                        n_filetypes++;
                    }

                    off = disp + file_flat->indices[fblk] +
                          (ADIO_Offset) n_filetypes*filetype_extent;

                    next_file_left = file_flat->blocklens[fblk];
                    if (!mem_blk_done) {
                        mem_off += piece;
                        next_mem_left -= piece;
                    }
                }

                if (mem_blk_done) {
                    /* reached end of contiguous block in memory */
                    mblk = (mblk + 1)%mem_flat->count;
                    buf_count++;
                    mem_off = buftype_extent*(buf_count/mem_flat->count) +
                              mem_flat->indices[mblk];
                    next_mem_left = mem_flat->blocklens[mblk];
                    if (!file_blk_done) {
                        off += piece;
                        next_file_left -= piece;
                    }
                }

                done += piece;
                file_left = next_file_left;
                mem_left = next_mem_left;
            }
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        if (!io_failed) {
            *error_code = MPI_SUCCESS;
        }
        else {
#ifdef MPICH2
            *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
            *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
#endif
        }
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by
       ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/* Copied from ADIOI_PVFS2_OldWriteStrided. It would be good to have fewer * copies of this code... */ void ADIOI_ZOIDFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* as with all the other WriteStrided functions, offset is in units of * etype relative to the filetype */ /* Since zoidfs does not support file locking, can't do buffered writes as on Unix */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, bwr_size, fwr_size=0, st_index=0; int bufsize, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp, start_off, initial_off; int flag, st_fwr_size, st_n_filetypes; int err_flag=0; size_t mem_list_count, file_list_count; const void ** mem_offsets; uint64_t *file_offsets; size_t *mem_lengths; uint64_t *file_lengths; int total_blks_to_write; int max_mem_list, max_file_list; int b_blks_wrote; int f_data_wrote; int size_wrote=0, n_write_lists, extra_blks; int end_bwr_size, end_fwr_size; int start_k, start_j, new_file_write, new_buffer_write; int start_mem_offset; ADIOI_ZOIDFS_object *zoidfs_obj_ptr; MPI_Offset total_bytes_written=0; static char myname[] = "ADIOI_ZOIDFS_WRITESTRIDED"; /* note: I don't know what zoidfs will do if you pass it a super-long list, * so let's keep with the PVFS limit for now */ #define MAX_ARRAY_SIZE 64 /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "Atomic noncontiguous writes are not supported by ZOIDFS", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); /* the HDF5 tests showed a bug in this list 
processing code (see many many * lines down below). We added a workaround, but common HDF5 file types * are actually contiguous and do not need the expensive workarond */ if (!filetype_is_contig) { flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; if (flat_file->count == 1 && !buftype_is_contig) filetype_is_contig = 1; } MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; zoidfs_obj_ptr = (ADIOI_ZOIDFS_object*)fd->fs_ptr; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ uint64_t file_offsets; uint64_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; } else off = fd->fp_ind; file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_write = count*flat_buf->count; b_blks_wrote = 0; /* allocate arrays according to max usage */ if (total_blks_to_write > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_write; mem_offsets = (void*)ADIOI_Malloc(mem_list_count*sizeof(void*)); mem_lengths = (size_t*)ADIOI_Malloc(mem_list_count*sizeof(size_t)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_wrote < total_blks_to_write) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = buf + j*buftype_extent + flat_buf->indices[i]; mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_wrote++; if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) { /* in the case of the last write 
list call, adjust mem_list_count */ if (b_blks_wrote == total_blks_to_write) { mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE; /* in case last write list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, mem_list_count, mem_offsets, mem_lengths, 1, &file_offsets, &file_lengths, ZOIDFS_NO_OP_HINT)); /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); break; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif total_bytes_written += file_lengths; /* in the case of error or the last write list call, * leave here */ /* --BEGIN ERROR HANDLING-- */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); break; } /* --END ERROR HANDLING-- */ if (b_blks_wrote == total_blks_to_write) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_wrote < total_blks_to_write) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += total_bytes_written; if (!err_flag) *error_code = MPI_SUCCESS; fd->fp_sys_posn = -1; /* clear this. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. 
*/ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* already know that file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; initial_off = offset; /* for each case - ADIO_Individual pointer or explicit, find offset (file offset in bytes), n_filetypes (how many filetypes into file to start), fwr_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. 
should be the most common case. */ /* only one memory off-len pair, so no array */ size_t mem_lengths; size_t mem_offsets; i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to write */ f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize); total_blks_to_write = 1; if (j < (flat_file->count -1)) j++; else { j = 0; n_filetypes++; } while (f_data_wrote < bufsize) { f_data_wrote += flat_file->blocklens[j]; total_blks_to_write++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE; extra_blks = total_blks_to_write%MAX_ARRAY_SIZE; mem_offsets = (size_t)buf; mem_lengths = 0; /* if at least one full writelist, allocate file arrays at max array size and don't free until very end */ if (n_write_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(uint64_t)); } /* if there's no full writelist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(extra_blks* sizeof(uint64_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_write_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; mem_lengths = st_fwr_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, 1, buf, &mem_lengths, file_list_count, file_offsets, 
file_lengths, ZOIDFS_NO_OP_HINT)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += mem_lengths; mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_write_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = ADIOI_MIN(st_fwr_size, bufsize); } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; /* XXX: double-check these casts */ if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - mem_lengths - mem_offsets + (size_t)buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, 1, (const void **)&mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths, ZOIDFS_NO_OP_HINT)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += mem_lengths; } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_wrote = 
0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = 
start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fakes filling the writelist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_wrote += new_buffer_write; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; } /* while (size_wrote < bufsize) */ /* one last check before we actually carry out the operation: * this code has hard-to-fix bugs when 
a noncontiguous file type has * such large pieces that the sum of the lengths of the memory type is * not larger than one of those pieces (and vice versa for large memory * types and many pices of file types. In these cases, give up and * fall back to naive reads and writes. The testphdf5 test created a * type with two very large memory regions and 600 very small file * regions. The same test also created a type with one very large file * region and many (700) very small memory regions. both cases caused * problems for this code */ if ( ( (file_list_count == 1) && (new_file_write < flat_file->blocklens[0] ) ) || ((mem_list_count == 1) && (new_buffer_write < flat_buf->blocklens[0]) ) || ((file_list_count == MAX_ARRAY_SIZE) && (new_file_write < flat_buf->blocklens[0]) ) || ( (mem_list_count == MAX_ARRAY_SIZE) && (new_buffer_write < flat_file->blocklens[0])) ) { ADIOI_Delete_flattened(datatype); ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, initial_off, status, error_code); return; } mem_offsets = (void *)ADIOI_Malloc(max_mem_list*sizeof(void *)); mem_lengths = (size_t*)ADIOI_Malloc(max_mem_list*sizeof(size_t)); file_offsets = (uint64_t *)ADIOI_Malloc(max_file_list*sizeof(uint64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(max_file_list*sizeof(uint64_t)); size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_fwr_size and new_bwr_size */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next 
immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if 
((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fills the allocated writelist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = buf + buftype_extent* (buf_count/flat_buf->count) + flat_buf->indices[k]; if(!i) { mem_lengths[0] = bwr_size; mem_offsets[0] += flat_buf->blocklens[k] - bwr_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_bwr_size; if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + ((ADIO_Offset)n_filetypes) * filetype_extent; if (!i) { file_lengths[0] = fwr_size; file_offsets[0] += flat_file->blocklens[j] - fwr_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_fwr_size; if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths, ZOIDFS_NO_OP_HINT)); /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ #ifdef 
ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif size_wrote += new_buffer_write; total_bytes_written += new_buffer_write; /* XXX: is this right? */ start_k = k; start_j = j; } /* while (size_wrote < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } /* when incrementing fp_ind, need to also take into account the file type: * consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----| * if we wrote N elements, offset needs to point at beginning of type, not * at empty region at offset N+1). * * As we discussed on mpich-discuss in may/june 2009, the code below might * look wierd, but by putting fp_ind at the last byte written, the next * time we run through the strided code we'll update the fp_ind to the * right location. */ if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind = file_offsets[file_list_count-1]+ file_lengths[file_list_count-1]; } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); *error_code = MPI_SUCCESS; error_state: fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
/*
 * ADIOI_PVFS_Fcntl - file-control operations for the PVFS driver.
 *
 * fd           open ADIO file handle
 * flag         which operation to perform (one of the ADIO_FCNTL_* values
 *              handled in the switch below)
 * fcntl_struct in/out argument block; the fields read or written depend on
 *              the operation (info/etype/filetype/disp, fsize, diskspace,
 *              iomode)
 * error_code   out: MPI_SUCCESS or an error code
 *
 * Operations:
 *   ADIO_FCNTL_SET_VIEW      - replace the etype/filetype/disp of the file
 *                              and reset the individual file pointer.
 *   ADIO_FCNTL_GET_FSIZE     - report the file size via a seek to the end.
 *   ADIO_FCNTL_SET_DISKSPACE - preallocate by rewriting existing data and
 *                              appending zeros.
 *   ADIO_FCNTL_SET_IOMODE    - record the PFS I/O mode.
 *   ADIO_FCNTL_SET_ATOMICITY - always reports MPI_ERR_UNKNOWN.
 *   anything else            - prints a message and aborts.
 */
void ADIOI_PVFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                      int *error_code)
{
    MPI_Datatype copy_etype, copy_filetype;
    int combiner, i, j, k, filetype_is_contig, ntimes, err;
    ADIOI_Flatlist_node *flat_file;
    ADIO_Offset curr_fsize, alloc_size, size, len, done;
    ADIO_Status status;
    char *buf;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_PVFS_FCNTL";
#endif

    switch(flag) {
    case ADIO_FCNTL_SET_VIEW:
        /* free copies of old etypes and filetypes and delete flattened
           version of filetype if necessary */

        MPI_Type_get_envelope(fd->etype, &i, &j, &k, &combiner);
        if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->etype));

        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
        if (!filetype_is_contig) ADIOI_Delete_flattened(fd->filetype);

        MPI_Type_get_envelope(fd->filetype, &i, &j, &k, &combiner);
        if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->filetype));

        /* set new info */
        ADIO_SetInfo(fd, fcntl_struct->info, &err);

        /* set new etypes and filetypes; derived types are duplicated
           (contiguous count 1) so the file owns its own committed copy */

        MPI_Type_get_envelope(fcntl_struct->etype, &i, &j, &k, &combiner);
        if (combiner == MPI_COMBINER_NAMED) fd->etype = fcntl_struct->etype;
        else {
            MPI_Type_contiguous(1, fcntl_struct->etype, &copy_etype);
            MPI_Type_commit(&copy_etype);
            fd->etype = copy_etype;
        }
        MPI_Type_get_envelope(fcntl_struct->filetype, &i, &j, &k, &combiner);
        if (combiner == MPI_COMBINER_NAMED)
            fd->filetype = fcntl_struct->filetype;
        else {
            MPI_Type_contiguous(1, fcntl_struct->filetype, &copy_filetype);
            MPI_Type_commit(&copy_filetype);
            fd->filetype = copy_filetype;
            ADIOI_Flatten_datatype(fd->filetype);
            /* this function will not flatten the filetype if it turns out
               to be all contiguous. */
        }

        MPI_Type_size(fd->etype, &(fd->etype_size));
        fd->disp = fcntl_struct->disp;

        /* reset MPI-IO file pointer to point to the first byte that can
           be accessed in this view. */

        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
        if (filetype_is_contig) fd->fp_ind = fcntl_struct->disp;
        else {
            flat_file = ADIOI_Flatlist;
            while (flat_file->type != fd->filetype)
                flat_file = flat_file->next;
            /* first block with a nonzero length marks the first
               accessible byte */
            for (i=0; i<flat_file->count; i++) {
                if (flat_file->blocklens[i]) {
                    fd->fp_ind = fcntl_struct->disp + flat_file->indices[i];
                    break;
                }
            }
        }
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_GET_FSIZE:
        fcntl_struct->fsize = pvfs_lseek(fd->fd_sys, 0, SEEK_END);
        /* put the system file position back if one is being tracked */
        if (fd->fp_sys_posn != -1)
            pvfs_lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
#ifdef PRINT_ERR_MSG
        *error_code = (fcntl_struct->fsize == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS;
#else
        if (fcntl_struct->fsize == -1) {
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                              myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
        }
        else *error_code = MPI_SUCCESS;
#endif
        break;

    case ADIO_FCNTL_SET_DISKSPACE:
        /* will be called by one process only */

        /* On file systems with no preallocation function, I have to
           explicitly write to allocate space. Since there could be
           holes in the file, I need to read up to the current file
           size, write it back, and then write beyond that depending
           on how much preallocation is needed.
           read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */

        curr_fsize = pvfs_lseek(fd->fd_sys, 0, SEEK_END);
        alloc_size = fcntl_struct->diskspace;

        size = ADIOI_MIN(curr_fsize, alloc_size);

        ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
        buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
        done = 0;

        /* pass 1: read back and rewrite the existing part of the file */
        for (i=0; i<ntimes; i++) {
            len = ADIOI_MIN(size-done, ADIOI_PREALLOC_BUFSZ);
            ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                            done, &status, error_code);
            if (*error_code != MPI_SUCCESS) {
#ifdef PRINT_ERR_MSG
                FPRINTF(stderr, "ADIOI_PVFS_Fcntl: To preallocate disk space, ROMIO needs to read the file and write it back, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.\n");
                MPI_Abort(MPI_COMM_WORLD, 1);
#else
                *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_PREALLOC_PERM,
                                  myname, (char *) 0, (char *) 0);
                ADIOI_Error(fd, *error_code, myname);
                /* do not leak the staging buffer on the error path */
                ADIOI_Free(buf);
                return;
#endif
            }
            ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             done, &status, error_code);
            if (*error_code != MPI_SUCCESS) {
                ADIOI_Free(buf);
                return;
            }
            done += len;
        }

        /* pass 2: extend the file with zeros up to the requested size */
        if (alloc_size > curr_fsize) {
            memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
            size = alloc_size - curr_fsize;
            ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
            for (i=0; i<ntimes; i++) {
                len = ADIOI_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ);
                ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                 done, &status, error_code);
                if (*error_code != MPI_SUCCESS) {
                    ADIOI_Free(buf);
                    return;
                }
                done += len;
            }
        }
        ADIOI_Free(buf);
        if (fd->fp_sys_posn != -1)
            pvfs_lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_IOMODE:
        /* for implementing PFS I/O modes. will not occur in MPI-IO
           implementation.*/
        if (fd->iomode != fcntl_struct->iomode) {
            fd->iomode = fcntl_struct->iomode;
            MPI_Barrier(MPI_COMM_WORLD);
        }
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_ATOMICITY:
        *error_code = MPI_ERR_UNKNOWN;
        break;

    default:
        FPRINTF(stderr, "Unknown flag passed to ADIOI_PVFS_Fcntl\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}
/*
 * ADIOI_Calc_file_domains - split the aggregate access range into one
 * "file domain" per aggregator.
 *
 * st_offsets / end_offsets  per-process first and last byte accessed
 *                           (nprocs entries each)
 * nprocs                    number of entries in the two arrays above
 * nprocs_for_coll           number of file domains to produce
 * min_st_offset_ptr         out: smallest start offset found
 * fd_start_ptr, fd_end_ptr  out: newly allocated arrays (nprocs_for_coll
 *                           entries) with the first/last byte of each domain;
 *                           a domain lying wholly past the end of the access
 *                           range gets start == end == -1
 * min_fd_size               lower bound applied to the domain size
 * fd_size_ptr               out: the domain size that was used
 * striping_unit             if > 0, domain ends are moved to the nearest
 *                           multiple of this value
 */
void ADIOI_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset *end_offsets,
                             int nprocs, int nprocs_for_coll,
                             ADIO_Offset *min_st_offset_ptr,
                             ADIO_Offset **fd_start_ptr,
                             ADIO_Offset **fd_end_ptr,
                             int min_fd_size, ADIO_Offset *fd_size_ptr,
                             int striping_unit)
{
    ADIO_Offset lowest_start, highest_end, span, domain_size;
    ADIO_Offset *dom_start, *dom_end;
    int p;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#ifdef AGG_DEBUG
    FPRINTF(stderr, "ADIOI_Calc_file_domains: %d aggregator(s)\n",
            nprocs_for_coll);
#endif

    /* overall extent of the access: min of starts, max of ends */
    lowest_start = st_offsets[0];
    highest_end = end_offsets[0];
    for (p = 1; p < nprocs; p++) {
        lowest_start = ADIOI_MIN(lowest_start, st_offsets[p]);
        highest_end = ADIOI_MAX(highest_end, end_offsets[p]);
    }

    /* equal partition of that extent, rounded up, but never smaller than
     * the caller-supplied minimum domain size */
    span = highest_end - lowest_start + 1;
    domain_size = (span + nprocs_for_coll - 1) / nprocs_for_coll;
    if (domain_size < min_fd_size) {
        domain_size = min_fd_size;
    }

    *fd_start_ptr = (ADIO_Offset *)
        ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    *fd_end_ptr = (ADIO_Offset *)
        ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    dom_start = *fd_start_ptr;
    dom_end = *fd_end_ptr;

    dom_start[0] = lowest_start;

    if (striping_unit > 0) {
        /* each nominal boundary lowest_start + domain_size*(p+1) is moved
         * to whichever multiple of striping_unit is closer */
        for (p = 0; p < nprocs_for_coll; p++) {
            ADIO_Offset cut;
            int under, over;

            if (p > 0) {
                dom_start[p] = dom_end[p-1] + 1;
            }
            cut = lowest_start + domain_size * (p+1);
            under = cut % striping_unit;
            over = striping_unit - under;
            if (under < over) {
                cut -= under;
            } else {
                cut += over;
            }
            dom_end[p] = cut - 1;
        }
        /* the last domain always runs to the end of the access range */
        dom_end[nprocs_for_coll-1] = highest_end;
    }
    else {
        /* no striping hint: back-to-back domains of domain_size bytes */
        for (p = 0; p < nprocs_for_coll; p++) {
            if (p > 0) {
                dom_start[p] = dom_end[p-1] + 1;
            }
            dom_end[p] = dom_start[p] + domain_size - 1;
        }
    }

    /* With ceiling division the trailing domains may start beyond the data
     * (mark them empty) or overshoot it (clip them). */
    for (p = 0; p < nprocs_for_coll; p++) {
        if (dom_start[p] > highest_end) {
            dom_start[p] = dom_end[p] = -1;
        }
        if (dom_end[p] > highest_end) {
            dom_end[p] = highest_end;
        }
    }

    *fd_size_ptr = domain_size;
    *min_st_offset_ptr = lowest_start;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
}
/*
 * ADIOI_Iread_and_exch - set up the read-and-exchange phase of a
 * nonblocking collective read.
 *
 * Works out the byte range [st_loc, end_loc] covered by the requests in
 * vars->others_req, derives how many rounds of at most cb_buffer_size bytes
 * this process needs, posts a nonblocking MPI_MAX allreduce of that count
 * (request stored in vars->req1), allocates the per-process bookkeeping
 * arrays and finally sets the request state to
 * ADIOI_IRC_STATE_IREAD_AND_EXCH.
 *
 * nbc_req     nonblocking-collective request; its data.rd.rae_vars member
 *             is read and filled in
 * error_code  out: set to MPI_SUCCESS, then overwritten with the return
 *             value of MPI_Iallreduce
 */
static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *nbc_req, int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd = vars->fd;
    MPI_Datatype datatype = vars->datatype;
    int nprocs = vars->nprocs;
    ADIOI_Access *others_req = vars->others_req;

    int proc, req;
    ADIO_Offset st_loc = -1, end_loc = -1;
    ADIOI_Flatlist_node *node = NULL;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* replaced below by MPI_Iallreduce's result */

    /* size of one read round, taken from the hints object */
    coll_bufsize = fd->hints->cb_buffer_size;
    vars->coll_bufsize = coll_bufsize;

    /* seed both bounds from the first process that has any request ... */
    proc = 0;
    while (proc < nprocs) {
        if (others_req[proc].count) {
            st_loc = others_req[proc].offsets[0];
            end_loc = others_req[proc].offsets[0];
            break;
        }
        proc++;
    }

    /* ... then widen them over every request of every process */
    for (proc = 0; proc < nprocs; proc++) {
        for (req = 0; req < others_req[proc].count; req++) {
            ADIO_Offset first = others_req[proc].offsets[req];
            ADIO_Offset last = others_req[proc].offsets[req]
                               + others_req[proc].lens[req] - 1;

            st_loc = ADIOI_MIN(st_loc, first);
            end_loc = ADIOI_MAX(end_loc, last);
        }
    }

    vars->st_loc = st_loc;
    vars->end_loc = end_loc;

    /* Number of rounds needed locally: ceiling of
     * (end_loc - st_loc + 1) / coll_bufsize, or zero when both bounds still
     * hold their -1 initial value (nothing to read here). */
    if ((st_loc == -1) && (end_loc == -1)) {
        vars->ntimes = 0;
    } else {
        vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
    }

    /* nonblocking MPI_MAX reduction of the round count over fd->comm */
    *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT,
                                 MPI_MAX, fd->comm, &vars->req1);

    vars->read_buf = fd->io_buf;

    /* per-process bookkeeping arrays, nprocs ints each; the Calloc'ed ones
     * start out zeroed */

    /* position in each process's off-len list */
    vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* number of off-len pairs satisfied per process in one iteration */
    vars->count = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* length already sent when only part of the last off-len pair went out */
    vars->partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* bytes to send to each process in one iteration */
    vars->send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* bytes to receive from each process in one iteration */
    vars->recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* bytes received so far from each process */
    vars->recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* value of curr_offlen_ptr[i] at the start of the iteration */
    vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int));

    /* for a noncontiguous buffer type, look up its flattened form */
    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
    if (!vars->buftype_is_contig) {
        ADIOI_Flatten_datatype(datatype);
        for (node = ADIOI_Flatlist; node->type != datatype; node = node->next)
            ;
        vars->flat_buf = node;
    }
    MPI_Type_extent(datatype, &vars->buftype_extent);

    vars->done = 0;
    vars->off = st_loc;
    vars->for_curr_iter = vars->for_next_iter = 0;

    /* hand over to the next step of the request's state machine */
    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH;
}
/*
 * ADIOI_PVFS_WriteStrided - write `count` items of `datatype` from `buf`
 * when the memory layout and/or the file view is noncontiguous.
 *
 * fd             open ADIO file handle
 * buf            user buffer
 * count          number of `datatype` items in buf
 * datatype       memory datatype
 * file_ptr_type  ADIO_EXPLICIT_OFFSET or ADIO_INDIVIDUAL
 * offset         for explicit offsets: position in units of etype relative
 *                to the filetype; ignored (fd->fp_ind is used) for the
 *                individual file pointer
 * status         filled in with the requested byte count when
 *                HAVE_STATUS_SET_BYTES is defined
 * error_code     out: MPI_SUCCESS or an error code
 *
 * Since PVFS does not support file locking, buffered writes as on Unix
 * cannot be done; the data is written with explicit seek + write calls.
 * If the listio hint is enabled (and HAVE_PVFS_LISTIO is defined) the work
 * is delegated to ADIOI_PVFS_WriteStridedListIO.
 */
void ADIOI_PVFS_WriteStrided(ADIO_File fd, void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Status *status,
                             int *error_code)
{
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, indx;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    int flag, new_bwr_size, new_fwr_size, err_flag=0;
    static char myname[] = "ADIOI_PVFS_WRITESTRIDED";

#ifdef HAVE_PVFS_LISTIO
    if ( fd->hints->fs_hints.pvfs.listio_write == ADIOI_HINT_ENABLE ) {
        ADIOI_PVFS_WriteStridedListIO(fd, buf, count, datatype,
                                      file_ptr_type, offset, status,
                                      error_code);
        return;
    }
#endif
    /* if hint set to DISABLE or AUTOMATIC, don't use listio */

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_INTERN,
                                           "Atomic mode set in PVFS I/O function", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        /* nothing can be written through an empty filetype; report zero
           bytes so the caller does not see an unset status */
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
        char *combine_buf, *combine_buf_ptr;
        ADIO_Offset combine_buf_remain;

        /* noncontiguous in memory, contiguous in file. use writev */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        /* allocate our "combine buffer" to pack data into before writing */
        combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
        combine_buf_ptr = combine_buf;
        combine_buf_remain = fd->hints->ind_wr_buffer_size;

        /* seek to the right spot in the file */
        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fd->disp + etype_size * offset;
            pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
        }
        else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);

        /* loop through all the flattened pieces.  combine into buffer until
         * no more will fit, then write.
         *
         * special case of a given piece being bigger than the combine buffer
         * is also handled.
         */
        for (j=0; j<count; j++) {
            for (i=0; i<flat_buf->count; i++) {
                /* Flush pending data whenever the next piece will not be
                 * packed behind it.  The test must be ">=" so that it
                 * covers every piece the direct-write test below accepts:
                 * with ">" a piece exactly as large as the remaining space
                 * was written directly while earlier data was still
                 * sitting in the combine buffer, putting bytes in the file
                 * out of order. */
                if (flat_buf->blocklens[i] >= combine_buf_remain &&
                    combine_buf != combine_buf_ptr) {
                    /* there is data in the buffer; write out the buffer
                       so far */
                    err = pvfs_write(fd->fd_sys, combine_buf,
                                     fd->hints->ind_wr_buffer_size -
                                     combine_buf_remain);
                    if (err == -1) err_flag = 1;

                    /* reset our buffer info */
                    combine_buf_ptr = combine_buf;
                    combine_buf_remain = fd->hints->ind_wr_buffer_size;
                }

                /* TODO: heuristic for when to not bother to use combine
                   buffer? */
                if (flat_buf->blocklens[i] >= combine_buf_remain) {
                    /* special case: blocklen is as big as or bigger than
                     * the combine buf; write directly (the buffer is empty
                     * at this point) */
                    err = pvfs_write(fd->fd_sys,
                                     ((char *) buf) + j*buftype_extent +
                                     flat_buf->indices[i],
                                     flat_buf->blocklens[i]);
                    if (err == -1) err_flag = 1;
                    off += flat_buf->blocklens[i]; /* keep up with the final
                                                      file offset too */
                }
                else {
                    /* copy more data into combine buffer */
                    memcpy(combine_buf_ptr,
                           ((char *) buf) + j*buftype_extent +
                           flat_buf->indices[i],
                           flat_buf->blocklens[i]);
                    combine_buf_ptr += flat_buf->blocklens[i];
                    combine_buf_remain -= flat_buf->blocklens[i];
                    off += flat_buf->blocklens[i]; /* keep up with the final
                                                      file offset too */
                }
            }
        }

        if (combine_buf_ptr != combine_buf) {
            /* data left in buffer to write */
            err = pvfs_write(fd->fd_sys, combine_buf,
                             fd->hints->ind_wr_buffer_size -
                             combine_buf_remain);
            if (err == -1) err_flag = 1;
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(combine_buf);

        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    } /* if (!buftype_is_contig && filetype_is_contig)  ... */

    else {  /* noncontiguous in file */

        /* split up into several contiguous writes */

        /* find starting location in the file */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            offset = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            /* walk filetype instances until a block ending at or after
               the file pointer is found */
            while (!flag) {
                n_filetypes++;
                for (i=0; i<flat_file->count; i++) {
                    if (disp + flat_file->indices[i] +
                        (ADIO_Offset) n_filetypes*filetype_extent +
                        flat_file->blocklens[i] >= offset) {
                        st_index = i;
                        fwr_size = disp + flat_file->indices[i] +
                            (ADIO_Offset) n_filetypes*filetype_extent +
                            flat_file->blocklens[i] - offset;
                        flag = 1;
                        break;
                    }
                }
            }
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = (int) (offset / n_etypes_in_filetype);
            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                abs_off_in_filetype;
        }

        if (buftype_is_contig && !filetype_is_contig) {

            /* contiguous in memory, noncontiguous in file. should be the
               most common case. */

            i = 0;
            j = st_index;
            off = offset;
            fwr_size = ADIOI_MIN(fwr_size, bufsize);
            while (i < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    err = pvfs_write(fd->fd_sys, ((char *) buf) + i, fwr_size);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (err == -1) err_flag = 1;
                }
                i += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    (ADIO_Offset) n_filetypes*filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }
                    off = disp + flat_file->indices[j] +
                        (ADIO_Offset) n_filetypes*filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            indx = flat_buf->indices[0];
            j = st_index;
            off = offset;
            bwr_size = flat_buf->blocklens[0];

            while (num < bufsize) {
                /* write as much as fits in both the current file block
                   and the current memory block */
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    err = pvfs_write(fd->fd_sys, ((char *) buf) + indx, size);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (err == -1) err_flag = 1;
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[j] +
                        (ADIO_Offset) n_filetypes*filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        indx += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    indx = buftype_extent*(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*
 * ADIOI_PVFS2_OldReadStrided - strided (noncontiguous) read through the
 * PVFS2 list-I/O style interface.
 *
 * The routine handles three layouts:
 *   1. noncontiguous in memory, contiguous in file;
 *   2. contiguous in memory, noncontiguous in file;
 *   3. noncontiguous in both memory and file.
 * In every case the access is cut into pieces of at most MAX_ARRAY_SIZE
 * (offset, length) pairs, each piece is described with PVFS_Request_*
 * datatypes and read with PVFS_sys_read.
 *
 * Parameters:
 *   fd            - open ADIO file; fd->filetype, fd->disp, fd->etype_size,
 *                   fd->fp_ind and fd->fs_ptr are read, fd->fp_ind and
 *                   fd->fp_sys_posn are updated.
 *   buf           - user buffer receiving the data.
 *   count         - number of `datatype` elements to read.
 *   datatype      - memory datatype describing the layout of buf.
 *   file_ptr_type - ADIO_INDIVIDUAL to start at fd->fp_ind; otherwise
 *                   `offset` is used as an explicit offset.
 *   offset        - explicit offset, in units of etype relative to the
 *                   filetype (ignored for ADIO_INDIVIDUAL).
 *   status        - filled with bufsize bytes when HAVE_STATUS_SET_BYTES is
 *                   defined (not the number of bytes actually read).
 *   error_code    - MPI_SUCCESS on success, otherwise an error code created
 *                   with MPIO_Err_create_code.
 *
 * On any PVFS failure the routine stops immediately, releases the request
 * objects and scratch arrays it created, and reports the error; fd->fp_ind
 * is not advanced on that path.
 */
void ADIOI_PVFS2_OldReadStrided(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status,
                                int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, brd_size, frd_size=0, st_index=0;
    int sum, n_etypes_in_filetype, size_in_filetype;
    MPI_Count bufsize;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp, start_off, initial_off;
    int flag, st_frd_size, st_n_filetypes;
    int mem_list_count, file_list_count;
    /* scratch arrays; NULL means "not allocated", which the common cleanup
     * at error_state relies on */
    PVFS_size *mem_offsets = NULL;
    int64_t *file_offsets = NULL;
    int *mem_lengths = NULL;
    int32_t *file_lengths = NULL;
    int total_blks_to_read;
    int max_mem_list, max_file_list;
    int b_blks_read;
    int f_data_read;
    int size_read=0, n_read_lists, extra_blks;
    int end_brd_size, end_frd_size;
    int start_k, start_j, new_file_read, new_buffer_read;
    int start_mem_offset;
    PVFS_Request mem_req, file_req;
    ADIOI_PVFS2_fs * pvfs_fs;
    PVFS_sysresp_io resp_io;
    int err_flag=0;
    MPI_Offset total_bytes_read = 0;
    static char myname[] = "ADIOI_PVFS2_ReadStrided";

#define MAX_ARRAY_SIZE 64

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* the HDF5 tests showed a bug in this list processing code (see many many
     * lines down below).  We added a workaround, but common HDF5 file types
     * are actually contiguous and do not need the expensive workaround */
    if (!filetype_is_contig) {
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        if (flat_file->count == 1 && !buftype_is_contig)
            filetype_is_contig = 1;
    }

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    if (!buftype_is_contig && filetype_is_contig) {

        /* noncontiguous in memory, contiguous in file. */
        int64_t file_offset;
        int32_t file_length;

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + etype_size * offset;

        file_list_count = 1;
        file_offset = off;
        file_length = 0;
        total_blks_to_read = count*flat_buf->count;
        b_blks_read = 0;

        /* allocate arrays according to max usage */
        if (total_blks_to_read > MAX_ARRAY_SIZE)
            mem_list_count = MAX_ARRAY_SIZE;
        else mem_list_count = total_blks_to_read;
        mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size));
        mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));

        /* TODO: CHECK RESULTS OF MEMORY ALLOCATION */

        j = 0;
        /* step through each block in memory, filling memory arrays */
        while (b_blks_read < total_blks_to_read) {
            for (i=0; i<flat_buf->count; i++) {
                mem_offsets[b_blks_read % MAX_ARRAY_SIZE] =
                    /* TODO: fix this compiler warning */
                    ((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]);
                mem_lengths[b_blks_read % MAX_ARRAY_SIZE] =
                    flat_buf->blocklens[i];
                file_length += flat_buf->blocklens[i];
                b_blks_read++;
                if (!(b_blks_read % MAX_ARRAY_SIZE) ||
                    (b_blks_read == total_blks_to_read)) {

                    /* in the case of the last read list call,
                       adjust mem_list_count */
                    if (b_blks_read == total_blks_to_read) {
                        mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE;
                        /* in case last read list call fills max arrays */
                        if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
                    }
                    err_flag = PVFS_Request_hindexed(mem_list_count,
                                                     mem_lengths, mem_offsets,
                                                     PVFS_BYTE, &mem_req);
                    /* --BEGIN ERROR HANDLING-- */
                    /* a failure here used to only leave the inner loop, so
                     * the outer loop kept going and the caller saw success */
                    if (err_flag != 0) {
                        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                          MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__,
                                          ADIOI_PVFS2_error_convert(err_flag),
                                          "Error in PVFS_Request_hindexed (memory)", 0);
                        goto error_state;
                    }
                    /* --END ERROR HANDLING-- */
                    err_flag = PVFS_Request_contiguous(file_length,
                                                       PVFS_BYTE, &file_req);
                    /* --BEGIN ERROR HANDLING-- */
                    if (err_flag != 0) {
                        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                          MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__,
                                          ADIOI_PVFS2_error_convert(err_flag),
                                          "Error in PVFS_Request_contiguous (file)", 0);
                        PVFS_Request_free(&mem_req);
                        goto error_state;
                    }
                    /* --END ERROR HANDLING-- */
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
                    err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req,
                                             file_offset, PVFS_BOTTOM, mem_req,
                                             &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
                    /* --BEGIN ERROR HANDLING-- */
                    if (err_flag != 0) {
                        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                          MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__,
                                          ADIOI_PVFS2_error_convert(err_flag),
                                          "Error in PVFS_sys_read", 0);
                        PVFS_Request_free(&mem_req);
                        PVFS_Request_free(&file_req);
                        goto error_state;
                    }
                    /* --END ERROR HANDLING-- */
                    PVFS_Request_free(&mem_req);
                    PVFS_Request_free(&file_req);
                    total_bytes_read += resp_io.total_completed;

                    /* in the case of error or the last read list call,
                     * leave here */
                    if (err_flag || b_blks_read == total_blks_to_read) break;

                    file_offset += file_length;
                    file_length = 0;
                }
            } /* for (i=0; i<flat_buf->count; i++) */
            j++;
        } /* while (b_blks_read < total_blks_to_read) */
        ADIOI_Free(mem_offsets);
        ADIOI_Free(mem_lengths);

        if (file_ptr_type == ADIO_INDIVIDUAL)
            fd->fp_ind += total_bytes_read;

        fd->fp_sys_posn = -1;  /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, bufsize);
        /* This is a temporary way of filling in status.  The right way is to
           keep track of how much data was actually read and placed in buf
           by ADIOI_BUFFERED_READ. */
#endif
        ADIOI_Delete_flattened(datatype);

        return;
    } /* if (!buftype_is_contig && filetype_is_contig) */

    /* know file is noncontiguous from above */
    /* noncontiguous in file */

    /* filetype already flattened in ADIO_Open */
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype) flat_file = flat_file->next;

    disp = fd->disp;
    initial_off = offset;  /* kept for the naive fallback further down */

    /* for each case - ADIO_Individual pointer or explicit, find the file
       offset in bytes (offset), n_filetypes (how many filetypes into
       file to start), frd_size (remaining amount of data in present
       file block), and st_index (start point in terms of blocks in
       starting filetype) */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind; /* in bytes */
        n_filetypes = -1;
        flag = 0;
        while (!flag) {
            n_filetypes++;
            for (i=0; i<flat_file->count; i++) {
                if (disp + flat_file->indices[i] +
                    ((ADIO_Offset) n_filetypes)*filetype_extent +
                    flat_file->blocklens[i] >= offset) {
                    st_index = i;
                    frd_size = (int) (disp + flat_file->indices[i] +
                                      ((ADIO_Offset) n_filetypes)*filetype_extent
                                      + flat_file->blocklens[i] - offset);
                    flag = 1;
                    break;
                }
            }
        } /* while (!flag) */
    } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
    else {
        n_etypes_in_filetype = filetype_size/etype_size;
        n_filetypes = (int) (offset / n_etypes_in_filetype);
        etype_in_filetype = (int) (offset % n_etypes_in_filetype);
        size_in_filetype = etype_in_filetype * etype_size;

        sum = 0;
        for (i=0; i<flat_file->count; i++) {
            sum += flat_file->blocklens[i];
            if (sum > size_in_filetype) {
                st_index = i;
                frd_size = sum - size_in_filetype;
                abs_off_in_filetype = flat_file->indices[i] +
                    size_in_filetype - (sum - flat_file->blocklens[i]);
                break;
            }
        }

        /* abs. offset in bytes in the file */
        offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
            abs_off_in_filetype;
    } /* else [file_ptr_type != ADIO_INDIVIDUAL] */

    start_off = offset;
    st_frd_size = frd_size;
    st_n_filetypes = n_filetypes;

    if (buftype_is_contig && !filetype_is_contig) {

        /* contiguous in memory, noncontiguous in file. should be the most
           common case. */

        int mem_length=0;     /* bytes covered by the read list being built */
        intptr_t mem_offset;  /* address in buf where that list's data goes */

        i = 0;
        j = st_index;
        n_filetypes = st_n_filetypes;

        mem_list_count = 1;

        /* determine how many blocks in file to read */
        f_data_read = ADIOI_MIN(st_frd_size, bufsize);
        total_blks_to_read = 1;
        if (j < (flat_file->count-1)) j++;
        else {
            j = 0;
            n_filetypes++;
        }
        while (f_data_read < bufsize) {
            f_data_read += flat_file->blocklens[j];
            total_blks_to_read++;
            if (j<(flat_file->count-1)) j++;
            else j = 0;
        }

        j = st_index;
        n_filetypes = st_n_filetypes;
        n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE;
        extra_blks = total_blks_to_read%MAX_ARRAY_SIZE;

        mem_offset = (intptr_t)buf;
        /* the accumulator is the local mem_length; the function-level
         * mem_lengths array is not used in this branch and stays NULL */
        mem_length = 0;

        /* if at least one full readlist, allocate file arrays
           at max array size and don't free until very end */
        if (n_read_lists) {
            file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
                                                  sizeof(int32_t));
        }
        /* if there's no full readlist allocate file arrays according
           to needed size (extra_blks) */
        else {
            file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int32_t));
        }

        /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
        for (i=0; i<n_read_lists; i++) {
            file_list_count = MAX_ARRAY_SIZE;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_frd_size;
                mem_length = st_frd_size;
            }
            for (k=0; k<MAX_ARRAY_SIZE; k++) {
                if (i || k) {
                    file_offsets[k] = disp +
                        ((ADIO_Offset)n_filetypes)*filetype_extent
                        + flat_file->indices[j];
                    file_lengths[k] = flat_file->blocklens[j];
                    mem_length += file_lengths[k];
                }
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */

            /* When the block count is an exact multiple of MAX_ARRAY_SIZE
             * there is no trailing partial list, so the very last block has
             * to be trimmed here or the read would run past bufsize. */
            if (!extra_blks && i == n_read_lists - 1) {
                MPI_Count excess = (MPI_Count) (mem_offset - (intptr_t) buf)
                    + mem_length - bufsize;
                if (excess > 0) {
                    file_lengths[MAX_ARRAY_SIZE - 1] -= (int32_t) excess;
                    mem_length -= (int) excess;
                }
            }

            err_flag = PVFS_Request_contiguous(mem_length,
                                               PVFS_BYTE, &mem_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_contiguous (memory)", 0);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */

            err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
                                             file_offsets, PVFS_BYTE,
                                             &file_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_hindexed (file)", 0);
                PVFS_Request_free(&mem_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */

            /* PVFS_Request_hindexed already expresses the offsets into the
             * file, so we should not pass in an offset if we are using
             * hindexed for the file type */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
            err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
                                     (void *)mem_offset, mem_req,
                                     &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_sys_read", 0);
                PVFS_Request_free(&mem_req);
                PVFS_Request_free(&file_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */
            PVFS_Request_free(&mem_req);
            PVFS_Request_free(&file_req);

            total_bytes_read += resp_io.total_completed;

            mem_offset += mem_length;
            mem_length = 0;
        } /* for (i=0; i<n_read_lists; i++) */

        /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
        if (extra_blks) {
            file_list_count = extra_blks;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = ADIOI_MIN(st_frd_size, bufsize);
            }
            for (k=0; k<extra_blks; k++) {
                if(i || k) {
                    file_offsets[k] = disp +
                        ((ADIO_Offset)n_filetypes)*filetype_extent +
                        flat_file->indices[j];
                    if (k == (extra_blks - 1)) {
                        /* last block: whatever is left of the request, i.e.
                         * bufsize minus the bytes already in this list and
                         * minus the bytes consumed by earlier lists.  The
                         * pointer difference is taken in intptr_t so it is
                         * not truncated on 64-bit platforms. */
                        file_lengths[k] = (int32_t) (bufsize - mem_length
                                          - (mem_offset - (intptr_t) buf));
                    }
                    else file_lengths[k] = flat_file->blocklens[j];
                } /* if(i || k) */
                mem_length += file_lengths[k];
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<extra_blks; k++) */

            err_flag = PVFS_Request_contiguous(mem_length,
                                               PVFS_BYTE, &mem_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_contiguous (memory)", 0);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */

            err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
                                             file_offsets, PVFS_BYTE, &file_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_hindexed (file)", 0);
                PVFS_Request_free(&mem_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */

            /* as above, use 0 for 'offset' when using hindexed file type */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
            err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
                                     (void *)mem_offset, mem_req,
                                     &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_sys_read", 0);
                PVFS_Request_free(&mem_req);
                PVFS_Request_free(&file_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */
            PVFS_Request_free(&mem_req);
            PVFS_Request_free(&file_req);
            total_bytes_read += resp_io.total_completed;
        }
    }
    else {
        /* noncontiguous in memory as well as in file */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        size_read = 0;
        n_filetypes = st_n_filetypes;
        frd_size = st_frd_size;
        brd_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_mem_offset = 0;
        start_k = k = 0;
        start_j = st_index;
        max_mem_list = 0;
        max_file_list = 0;

        /* run through and find max_file_list and max_mem_list so that you
           can allocate the file and memory arrays less than MAX_ARRAY_SIZE
           if possible */

        while (size_read < bufsize) {
            k = start_k;
            new_buffer_read = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_read < bufsize-size_read)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data read and data to be
                   read in the next immediate read list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_read + flat_buf->blocklens[k] +
                        size_read) > bufsize) {
                        end_brd_size = new_buffer_read +
                            flat_buf->blocklens[k] - (bufsize - size_read);
                        new_buffer_read = bufsize - size_read;
                    }
                    else {
                        new_buffer_read += flat_buf->blocklens[k];
                        end_brd_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (brd_size > (bufsize - size_read)) {
                        new_buffer_read = bufsize - size_read;
                        brd_size = new_buffer_read;
                    }
                    else new_buffer_read = brd_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_read < bufsize-size_read)) */
            j = start_j;
            new_file_read = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_read < new_buffer_read)) {
                if(file_list_count) {
                    if((new_file_read + flat_file->blocklens[j]) >
                       new_buffer_read) {
                        end_frd_size = new_buffer_read - new_file_read;
                        new_file_read = new_buffer_read;
                        j--;
                    }
                    else {
                        new_file_read += flat_file->blocklens[j];
                        end_frd_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (frd_size > new_buffer_read) {
                        new_file_read = new_buffer_read;
                        frd_size = new_file_read;
                    }
                    else new_file_read = frd_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_read < new_buffer_read) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    /* the file list filled up first: redo the memory list
                       so that it covers exactly new_file_read bytes */
                    new_buffer_read = 0;
                    mem_list_count = 0;
                    while (new_buffer_read < new_file_read) {
                        if(mem_list_count) {
                            if((new_buffer_read + flat_buf->blocklens[k]) >
                               new_file_read) {
                                end_brd_size = new_file_read - new_buffer_read;
                                new_buffer_read = new_file_read;
                                k--;
                            }
                            else {
                                new_buffer_read += flat_buf->blocklens[k];
                                end_brd_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_read = brd_size;
                            if (brd_size > (bufsize - size_read)) {
                                new_buffer_read = bufsize - size_read;
                                brd_size = new_buffer_read;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_read < new_file_read) */
                } /* if ((new_file_read < new_buffer_read) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_read < new_buffer_read)) */

            /* fakes filling the readlist arrays of lengths found above */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                if(i) {
                    if (i == (mem_list_count - 1)) {
                        if (flat_buf->blocklens[k] == end_brd_size)
                            brd_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            brd_size = flat_buf->blocklens[k] - end_brd_size;
                            k--;
                            buf_count--;
                        }
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                if (i) {
                    if (i == (file_list_count - 1)) {
                        if (flat_file->blocklens[j] == end_frd_size)
                            frd_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            frd_size = flat_file->blocklens[j] - end_frd_size;
                            j--;
                        }
                    }
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */
            size_read += new_buffer_read;
            start_k = k;
            start_j = j;
            if (max_mem_list < mem_list_count)
                max_mem_list = mem_list_count;
            if (max_file_list < file_list_count)
                max_file_list = file_list_count;
        } /* while (size_read < bufsize) */

        /* one last check before we actually carry out the operation:
         * this code has hard-to-fix bugs when a noncontiguous file type has
         * such large pieces that the sum of the lengths of the memory type is
         * not larger than one of those pieces (and vice versa for large memory
         * types and many pieces of file types).  In these cases, give up and
         * fall back to naive reads and writes.  The testphdf5 test created a
         * type with two very large memory regions and 600 very small file
         * regions.  The same test also created a type with one very large file
         * region and many (700) very small memory regions.  Both cases caused
         * problems for this code */

        if ( ( (file_list_count == 1) &&
               (new_file_read < flat_file->blocklens[0] ) ) ||
             ((mem_list_count == 1) &&
              (new_buffer_read < flat_buf->blocklens[0]) ) ||
             ((file_list_count == MAX_ARRAY_SIZE) &&
              (new_file_read < flat_buf->blocklens[0]) ) ||
             ( (mem_list_count == MAX_ARRAY_SIZE) &&
               (new_buffer_read < flat_file->blocklens[0])) )
        {
            ADIOI_Delete_flattened(datatype);
            ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype,
                                        file_ptr_type, initial_off,
                                        status, error_code);
            return;
        }

        mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size));
        mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
        file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
        file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));

        size_read = 0;
        n_filetypes = st_n_filetypes;
        frd_size = st_frd_size;
        brd_size = flat_buf->blocklens[0];
        buf_count = 0;
        start_mem_offset = 0;
        start_k = k = 0;
        start_j = st_index;

        /* this section calculates mem_list_count and file_list_count
           and also finds the possibly odd sized last array elements
           in new_frd_size and new_brd_size */

        while (size_read < bufsize) {
            k = start_k;
            new_buffer_read = 0;
            mem_list_count = 0;
            while ((mem_list_count < MAX_ARRAY_SIZE) &&
                   (new_buffer_read < bufsize-size_read)) {
                /* find mem_list_count and file_list_count such that both are
                   less than MAX_ARRAY_SIZE, the sum of their lengths are
                   equal, and the sum of all the data read and data to be
                   read in the next immediate read list is less than
                   bufsize */
                if(mem_list_count) {
                    if((new_buffer_read + flat_buf->blocklens[k] +
                        size_read) > bufsize) {
                        end_brd_size = new_buffer_read +
                            flat_buf->blocklens[k] - (bufsize - size_read);
                        new_buffer_read = bufsize - size_read;
                    }
                    else {
                        new_buffer_read += flat_buf->blocklens[k];
                        end_brd_size = flat_buf->blocklens[k];
                    }
                }
                else {
                    if (brd_size > (bufsize - size_read)) {
                        new_buffer_read = bufsize - size_read;
                        brd_size = new_buffer_read;
                    }
                    else new_buffer_read = brd_size;
                }
                mem_list_count++;
                k = (k + 1)%flat_buf->count;
            } /* while ((mem_list_count < MAX_ARRAY_SIZE) &&
                 (new_buffer_read < bufsize-size_read)) */
            j = start_j;
            new_file_read = 0;
            file_list_count = 0;
            while ((file_list_count < MAX_ARRAY_SIZE) &&
                   (new_file_read < new_buffer_read)) {
                if(file_list_count) {
                    if((new_file_read + flat_file->blocklens[j]) >
                       new_buffer_read) {
                        end_frd_size = new_buffer_read - new_file_read;
                        new_file_read = new_buffer_read;
                        j--;
                    }
                    else {
                        new_file_read += flat_file->blocklens[j];
                        end_frd_size = flat_file->blocklens[j];
                    }
                }
                else {
                    if (frd_size > new_buffer_read) {
                        new_file_read = new_buffer_read;
                        frd_size = new_file_read;
                    }
                    else new_file_read = frd_size;
                }
                file_list_count++;
                if (j < (flat_file->count - 1)) j++;
                else j = 0;

                k = start_k;
                if ((new_file_read < new_buffer_read) &&
                    (file_list_count == MAX_ARRAY_SIZE)) {
                    new_buffer_read = 0;
                    mem_list_count = 0;
                    while (new_buffer_read < new_file_read) {
                        if(mem_list_count) {
                            if((new_buffer_read + flat_buf->blocklens[k]) >
                               new_file_read) {
                                end_brd_size = new_file_read - new_buffer_read;
                                new_buffer_read = new_file_read;
                                k--;
                            }
                            else {
                                new_buffer_read += flat_buf->blocklens[k];
                                end_brd_size = flat_buf->blocklens[k];
                            }
                        }
                        else {
                            new_buffer_read = brd_size;
                            if (brd_size > (bufsize - size_read)) {
                                new_buffer_read = bufsize - size_read;
                                brd_size = new_buffer_read;
                            }
                        }
                        mem_list_count++;
                        k = (k + 1)%flat_buf->count;
                    } /* while (new_buffer_read < new_file_read) */
                } /* if ((new_file_read < new_buffer_read) &&
                     (file_list_count == MAX_ARRAY_SIZE)) */
            } /* while ((file_list_count < MAX_ARRAY_SIZE) &&
                 (new_file_read < new_buffer_read)) */

            /* fills the allocated readlist arrays */
            k = start_k;
            j = start_j;
            for (i=0; i<mem_list_count; i++) {
                mem_offsets[i] = ((PVFS_size)buf + buftype_extent*
                                  (buf_count/flat_buf->count) +
                                  (int)flat_buf->indices[k]);

                if(!i) {
                    mem_lengths[0] = brd_size;
                    mem_offsets[0] += flat_buf->blocklens[k] - brd_size;
                }
                else {
                    if (i == (mem_list_count - 1)) {
                        mem_lengths[i] = end_brd_size;
                        if (flat_buf->blocklens[k] == end_brd_size)
                            brd_size = flat_buf->blocklens[(k+1)%
                                                           flat_buf->count];
                        else {
                            brd_size = flat_buf->blocklens[k] - end_brd_size;
                            k--;
                            buf_count--;
                        }
                    }
                    else {
                        mem_lengths[i] = flat_buf->blocklens[k];
                    }
                }
                buf_count++;
                k = (k + 1)%flat_buf->count;
            } /* for (i=0; i<mem_list_count; i++) */
            for (i=0; i<file_list_count; i++) {
                file_offsets[i] = disp + flat_file->indices[j] +
                    ((ADIO_Offset)n_filetypes) * filetype_extent;
                if (!i) {
                    file_lengths[0] = frd_size;
                    file_offsets[0] += flat_file->blocklens[j] - frd_size;
                }
                else {
                    if (i == (file_list_count - 1)) {
                        file_lengths[i] = end_frd_size;
                        if (flat_file->blocklens[j] == end_frd_size)
                            frd_size = flat_file->blocklens[(j+1)%
                                                            flat_file->count];
                        else {
                            frd_size = flat_file->blocklens[j] - end_frd_size;
                            j--;
                        }
                    }
                    else file_lengths[i] = flat_file->blocklens[j];
                }
                if (j < flat_file->count - 1) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (i=0; i<file_list_count; i++) */

            err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths,
                                             mem_offsets, PVFS_BYTE, &mem_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0 ) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_hindexed (memory)", 0);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */
            err_flag = PVFS_Request_hindexed(file_list_count, file_lengths,
                                             file_offsets, PVFS_BYTE,
                                             &file_req);
            /* --BEGIN ERROR HANDLING-- */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_Request_hindexed (file)", 0);
                PVFS_Request_free(&mem_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */

            /* offset will be expressed in memory and file datatypes */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
            err_flag = PVFS_sys_read(pvfs_fs->object_ref, file_req, 0,
                                     PVFS_BOTTOM, mem_req,
                                     &(pvfs_fs->credentials), &resp_io);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
            /* --BEGIN ERROR HANDLING-- */
            /* stop at the first failed read: continuing would let a later
             * successful iteration overwrite err_flag and report success */
            if (err_flag != 0) {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   ADIOI_PVFS2_error_convert(err_flag),
                                                   "Error in PVFS_sys_read", 0);
                PVFS_Request_free(&mem_req);
                PVFS_Request_free(&file_req);
                goto error_state;
            }
            /* --END ERROR HANDLING-- */
            PVFS_Request_free(&mem_req);
            PVFS_Request_free(&file_req);
            total_bytes_read += resp_io.total_completed;
            size_read += new_buffer_read;
            start_k = k;
            start_j = j;
        } /* while (size_read < bufsize) */
        /* mem_offsets / mem_lengths are released by the common cleanup */
    }

    /* Other ADIO routines will convert absolute bytes into counts of
     * datatypes */
    /* when incrementing fp_ind, need to also take into account the file type:
     * consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
     * if we wrote N elements, offset needs to point at beginning of type, not
     * at empty region at offset N+1)
     *
     * As we discussed on mpich-discuss in may/june 2009, the code below might
     * look weird, but by putting fp_ind at the last byte written, the next
     * time we run through the strided code we'll update the fp_ind to the
     * right location. */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        fd->fp_ind = file_offsets[file_list_count-1]+
            file_lengths[file_list_count-1];
    }

    if (err_flag == 0) *error_code = MPI_SUCCESS;

error_state:
    /* Common cleanup for the success path of the noncontiguous-file branches
     * and for every error path.  The frees are guarded because how
     * ADIOI_Free treats a NULL pointer is not visible from this function. */
    if (mem_offsets) {
        ADIOI_Free(mem_offsets);
    }
    if (mem_lengths) {
        ADIOI_Free(mem_lengths);
    }
    if (file_offsets) {
        ADIOI_Free(file_offsets);
    }
    if (file_lengths) {
        ADIOI_Free(file_lengths);
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is to
       keep track of how much data was actually read and placed in buf
       by ADIOI_BUFFERED_READ. */
#endif

    if (!buftype_is_contig)
        ADIOI_Delete_flattened(datatype);
}
void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size, req_len; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf, *value; int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize; static char myname[] = "ADIOI_NFS_WRITESTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } /* write the buffer out finally */ lseek(fd->fd_sys, writebuf_off, SEEK_SET); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); err = write(fd->fd_sys, writebuf, writebuf_len); if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_Free(writebuf); /* malloced in the buffered_write macro */ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = (int) (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset); flag = 1; break; } } } } else { n_etypes_in_filetype = 
filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { i += fwr_size; end_offset = off + fwr_size - 1; if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); lseek(fd->fd_sys, writebuf_off, SEEK_SET); err = read(fd->fd_sys, writebuf, writebuf_len); if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. 
Please give the file read permission and open it with MPI_MODE_RDWR.", 0); return; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i; ADIOI_BUFFERED_WRITE } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else {
/*
 * ADIOI_Exch_and_write - exchange data between processes and write it to the
 * file in pieces of at most cb_buffer_size bytes.
 *
 * The idea is to reduce the amount of extra memory required for collective
 * I/O.  If all data were written at once, which is much easier, it would
 * require temp space larger than the size of the user buffer, which is often
 * unacceptable.  For example, to write a distributed array to a file where
 * each local array is 8 Mbytes, requiring at least another 8 Mbytes of temp
 * space is unacceptable.
 *
 * Parameters read or modified directly by this routine:
 *   fd          - file handle; fd->info supplies the "cb_buffer_size" hint,
 *                 fd->comm is the communicator for the MPI_Allreduce, and
 *                 fd->io_buf is used as the staging buffer for each write.
 *   datatype    - memory datatype; checked for contiguity and flattened if
 *                 it is noncontiguous.
 *   nprocs      - number of entries in others_req and in every per-process
 *                 work array allocated here.
 *   others_req  - per-process lists of offset/length pairs.  MODIFIED IN
 *                 PLACE: offsets[]/lens[] are adjusted when a pair is only
 *                 partially satisfied in one iteration, and mem_ptrs[] is
 *                 filled with addresses inside the staging buffer.
 *   error_code  - set to MPI_SUCCESS on entry; on failure it holds the error
 *                 code produced by the failing step.
 *
 * All remaining parameters (buf, myrank, offset_list, len_list,
 * contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buf_idx)
 * are only forwarded to ADIOI_W_Exchange_data.
 *
 * Every exit path, including the error paths, goes through fn_exit so that
 * the per-process work arrays are always released.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype datatype,
                                 int nprocs, int myrank,
                                 ADIOI_Access *others_req,
                                 ADIO_Offset *offset_list,
                                 ADIO_Offset *len_list,
                                 int contig_access_count,
                                 ADIO_Offset min_st_offset,
                                 ADIO_Offset fd_size,
                                 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                                 int *buf_idx, int *error_code)
{
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
    ADIO_Offset size=0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
    char *write_buf=NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* Calculate the number of writes of size coll_bufsize to be done by
       each process and the max among all processes.  That gives the number
       of communication phases as well. */

    /* NOTE(review): info_flag is never checked, so this assumes the
       "cb_buffer_size" hint is always present and positive (coll_bufsize is
       used as a divisor below) -- confirm against the code that sets the
       hints. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);

    /* Seed st_loc/end_loc from the first process that has any request, so
       the min/max scan below starts from a real offset instead of -1. */
    for (i=0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* Lowest start offset and highest end offset over all requests. */
    for (i=0; i < nprocs; i++)
        for (j=0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                                          + others_req[i].lens[j] - 1));
        }

    /* ntimes = ceiling_div(end_loc - st_loc + 1, coll_bufsize) */
    ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);

    if ((st_loc==-1) && (end_loc==-1)) {
        ntimes = 0;  /* this process does no writing. */
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is recd. from a process
       in a particular iteration, the length recd. is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        /* Locate the flattened representation of the memory datatype. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    /* I need to check if there are any outstanding nonblocking writes to
       the file, which could potentially interfere with the writes taking
       place in this collective write call.  Since this is not likely to be
       common, let me do the simplest thing possible here: Each process
       completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    done = 0;
    off = st_loc;

    for (m=0; m < ntimes; m++) {
        /* go through all others_req and check which will be satisfied
           by the current write */

        /* Note that MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order and that, for writes, the
           filetypes cannot specify overlapping regions in the file.  This
           simplifies implementation a bit compared to reads. */

        /* off     = start offset in the file for the data to be written in
                     this iteration
           size    = size of data written (bytes) corresponding to off
           req_off = off in file for a particular contiguous request
                     minus what was satisfied in previous iteration
           req_len = size corresponding to req_off */

        /* first calculate what should be communicated */

        for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;

        size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);

        for (i=0; i < nprocs; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_recv[i];
                        req_len = others_req[i].lens[j] -
                            partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        /* Check that adding the file-offset delta to the
                           buffer address gives the same result in offset
                           arithmetic as in pointer arithmetic. */
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
                        MPI_Address(write_buf+req_off-off,
                                    &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
                        recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
                                                        (unsigned)req_len));

                        if (off+size-req_off < (unsigned)req_len) {
                            /* Only part of this pair fits in the current
                               window; remember how much was taken. */
                            partial_recv[i] = (int) (off + size - req_off);

                            /* --BEGIN ERROR HANDLING-- */
                            if ((j+1 < others_req[i].count) &&
                                (others_req[i].offsets[j+1] < off+size)) {
                                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                                   MPIR_ERR_RECOVERABLE,
                                                                   myname,
                                                                   __LINE__,
                                                                   MPI_ERR_ARG,
                                                                   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)", 0);
                                /* allow to continue since additional
                                 * communication might have to occur
                                 */
                            }
                            /* --END ERROR HANDLING-- */
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx, curr_to_proc,
                              done_to_proc, &hole, m, buftype_extent, buf_idx,
                              error_code);
        if (*error_code != MPI_SUCCESS) goto fn_exit;

        /* Write only if at least one request landed in this window. */
        flag = 0;
        for (i=0; i<nprocs; i++)
            if (count[i]) flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int)size);
            ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, off, &status,
                             error_code);
            if (*error_code != MPI_SUCCESS) goto fn_exit;
        }

        off += size;
        done += size;
    }

    /* This process has finished its own writes, but it still takes part in
       the remaining exchange rounds up to the global maximum. */
    for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) {
        ADIOI_Assert(size == (int)size);
        /* nothing to recv, but check for send. */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, (int)size,
                              count, start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx,
                              curr_to_proc, done_to_proc, &hole, m,
                              buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS) goto fn_exit;
    }

fn_exit:
    /* Single cleanup point: reached on success and on every error exit, so
       the work arrays are never leaked. */
    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_recv);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
}