void ADIOI_SCI_WriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err=-1, datatype_size, len; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_SCI_WRITECONTIG"; #endif MPI_Type_size(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (fd->fp_sys_posn != offset) lseek(fd->fd_sys, offset, SEEK_SET); err = write(fd->fd_sys, buf, len); fd->fp_sys_posn = offset + err; /* individual file pointer not updated */ } else { /* write from curr. location of ind. file pointer */ if (fd->fp_sys_posn != fd->fp_ind) lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); err = write(fd->fd_sys, buf, len); fd->fp_ind += err; fd->fp_sys_posn = fd->fp_ind; } #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif #ifdef PRINT_ERR_MSG *error_code = (err == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err == -1) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif }
int ADIOI_PFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int *error_code) { int done=0; static char myname[] = "ADIOI_PFS_READDONE"; if (*request == ADIO_REQUEST_NULL) { *error_code = MPI_SUCCESS; return 1; } if ((*request)->queued) done = _iodone(*((long *) (*request)->handle)); else done = 1; /* ADIOI_Complete_Async completed this request, but request object was not freed. */ #ifdef HAVE_STATUS_SET_BYTES if ((done >= 0) && ((*request)->nbytes != -1)) MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); #endif if (done >= 0) { /* if request is still queued in the system, it is also there on ADIOI_Async_list. Delete it from there. */ if ((*request)->queued) ADIOI_Del_req_from_list(request); (*request)->fd->async_count--; if ((*request)->handle) ADIOI_Free((*request)->handle); ADIOI_Free_request((ADIOI_Req_node *) (*request)); *request = ADIO_REQUEST_NULL; } if (done == -1 && errno != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; return done; }
void ADIOI_PFS_WriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { MPI_Count err=-1, datatype_size, len; static char myname[] = "ADIOI_PFS_WRITECONTIG"; MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (fd->fp_sys_posn != offset) { lseek(fd->fd_sys, offset, SEEK_SET); } err = _cwrite(fd->fd_sys, buf, len); fd->fp_sys_posn = offset + err; /* individual file pointer not updated */ } else { /* write from curr. location of ind. file pointer */ if (fd->fp_sys_posn != fd->fp_ind) { lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); } err = _cwrite(fd->fd_sys, buf, len); fd->fp_ind += err; fd->fp_sys_posn = fd->fp_ind; } #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; }
int ADIOI_NTFS_ReadDone(ADIO_Request *request, ADIO_Status *status, int *error_code) { DWORD ret_val; int done = 0; static char myname[] = "ADIOI_NTFS_ReadDone"; if (*request == ADIO_REQUEST_NULL) { *error_code = MPI_SUCCESS; return 1; } if ((*request)->queued) { (*request)->nbytes = 0; ret_val = GetOverlappedResult((*request)->fd, (*request)->handle, &(*request)->nbytes, FALSE); if (!ret_val) { /* --BEGIN ERROR HANDLING-- */ ret_val = GetLastError(); if (ret_val == ERROR_IO_INCOMPLETE) { done = 0; *error_code = MPI_SUCCESS; } else { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", ADIOI_NTFS_Strerror(ret_val)); } /* --END ERROR HANDLING-- */ } else { done = 1; *error_code = MPI_SUCCESS; } } else { done = 1; *error_code = MPI_SUCCESS; } #ifdef HAVE_STATUS_SET_BYTES if (done && ((*request)->nbytes != -1)) MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); #endif if (done) { /* if request is still queued in the system, it is also there on ADIOI_Async_list. Delete it from there. */ if ((*request)->queued) ADIOI_Del_req_from_list(request); (*request)->fd->async_count--; if ((*request)->handle) { if (!CloseHandle(((OVERLAPPED*)((*request)->handle))->hEvent)) { ret_val = GetLastError(); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", ADIOI_NTFS_Strerror(ret_val)); } ADIOI_Free((*request)->handle); } ADIOI_Free_request((ADIOI_Req_node *) (*request)); *request = ADIO_REQUEST_NULL; } return done; }
void ADIOI_PVFS_WriteStridedListIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Since PVFS does not support file locking, can't do buffered writes as on Unix */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, disp, start_off; int flag, st_fwr_size, st_n_filetypes; int new_bwr_size, new_fwr_size, err_flag=0; int mem_list_count, file_list_count; char ** mem_offsets; int64_t *file_offsets; int *mem_lengths; int32_t *file_lengths; int total_blks_to_write; int max_mem_list, max_file_list; int b_blks_wrote; int f_data_wrote; int size_wrote=0, n_write_lists, extra_blks; int end_bwr_size, end_fwr_size; int start_k, start_j, new_file_write, new_buffer_write; int start_mem_offset; #define MAX_ARRAY_SIZE 1024 static char myname[] = "ADIOI_PVFS_WRITESTRIDED"; /* PFS file pointer modes are not relevant here, because PFS does not support strided accesses. */ /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_INTERN, "Atomic mode set in PVFS I/O function", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ int64_t file_offsets; int32_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); } else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_write = count*flat_buf->count; b_blks_wrote = 0; /* allocate arrays according to max usage */ if (total_blks_to_write > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_write; mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*)); mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_wrote < total_blks_to_write) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = ((char*)buf + j*buftype_extent + flat_buf->indices[i]); mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_wrote++; if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) { /* in the case of the last read list call, adjust mem_list_count */ if (b_blks_wrote == total_blks_to_write) { mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE; /* in case last read list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } pvfs_write_list(fd->fd_sys ,mem_list_count, mem_offsets, mem_lengths, file_list_count, &file_offsets, &file_lengths); /* in the case of the last read list call, leave here */ if (b_blks_wrote == total_blks_to_write) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_wrote < total_blks_to_write) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; fd->fp_sys_posn = -1; /* clear this. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* already know that file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; /* for each case - ADIO_Individual pointer or explicit, find offset (file offset in bytes), n_filetypes (how many filetypes into file to start), fwr_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ int mem_lengths; char *mem_offsets; i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to read */ f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize); total_blks_to_write = 1; j++; while (f_data_wrote < bufsize) { f_data_wrote += flat_file->blocklens[j]; total_blks_to_write++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE; extra_blks = total_blks_to_write%MAX_ARRAY_SIZE; mem_offsets = buf; mem_lengths = 0; /* if at least one full readlist, allocate file arrays at max array size and don't free until very end */ if (n_write_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int32_t)); } /* if there's no full readlist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(extra_blks* sizeof(int32_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_write_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; mem_lengths = st_fwr_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_write_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - (int32_t) mem_lengths - (int32_t) mem_offsets + (int32_t) buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fakes filling the writelist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_wrote += new_buffer_write; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; if (max_mem_list == max_mem_list == MAX_ARRAY_SIZE) break; } /* while (size_wrote < bufsize) */ mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *)); mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int)); file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t)); file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t)); size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_fwr_size and new_bwr_size */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fills the allocated readlist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = ((char*)buf + buftype_extent* (buf_count/flat_buf->count) + (int)flat_buf->indices[k]); if(!i) { mem_lengths[0] = bwr_size; mem_offsets[0] += flat_buf->blocklens[k] - bwr_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_bwr_size; if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + n_filetypes * filetype_extent; if (!i) { file_lengths[0] = fwr_size; file_offsets[0] += flat_file->blocklens[j] - fwr_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_fwr_size; if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ pvfs_write_list(fd->fd_sys,mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths); size_wrote += new_buffer_write; start_k = k; start_j = j; } /* while (size_wrote < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Uses a generalized version of the extended two-phase method described * in "An Extended Two-Phase Method for Accessing Sections of * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, * Scientific Programming, (5)4:301--317, Winter 1996. * http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_Access *my_req; /* array of nprocs access structures, one for each other process has this process's request */ ADIOI_Access *others_req; /* array of nprocs access structures, one for each other process whose request is written by this process. */ int i, filetype_is_contig, nprocs, myrank, do_collect = 0; int contig_access_count = 0, buftype_is_contig, interleave_count = 0; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ADIO_Offset orig_fp, start_offset, end_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; ADIO_Offset *len_list = NULL; int **buf_idx = NULL, *striping_info = NULL; int old_error, tmp_error; MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); orig_fp = fd->fp_ind; /* IO patten identification if cb_write isn't disabled */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count); /* each process communicates its start and end offsets to other * processes. The result is an array each of start and end offsets * stored in order of process rank. */ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); /* are the accesses of different processes interleaved? */ for (i = 1; i < nprocs; i++) if ((st_offsets[i] < end_offsets[i-1]) && (st_offsets[i] <= end_offsets[i])) interleave_count++; /* This is a rudimentary check for interleaving, but should suffice for the moment. */ /* Two typical access patterns can benefit from collective write. * 1) the processes are interleaved, and * 2) the req size is small. */ if (interleave_count > 0) { do_collect = 1; } else { do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, len_list, nprocs); } } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); /* Decide if collective I/O should be done */ if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) || fd->hints->cb_write == ADIOI_HINT_DISABLE) { /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else { ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } return; } /* Get Lustre hints information */ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); /* calculate what portions of the access requests of this process are * located in which process */ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, striping_info, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* based on everyone's my_req, calculate what requests of other processes * will be accessed by this process. * count_others_req_procs = number of processes whose requests (including * this process itself) will be accessed by this process * count_others_req_per_proc[i] indicates how many separate contiguous * requests of proc. i will be accessed by this process. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); ADIOI_Free(count_my_req_per_proc); /* exchange data and write in sizes of no more than stripe_size. */ ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank, others_req, my_req, offset_list, len_list, contig_access_count, striping_info, buf_idx, error_code); /* If this collective write is followed by an independent write, * it's possible to have those subsequent writes on other processes * race ahead and sneak in before the read-modify-write completes. * We carry out a collective communication at the end here so no one * can start independent i/o before collective I/O completes. * * need to do some gymnastics with the error codes so that if something * went wrong, all processes report error, but if a process has a more * specific error code, we can still have that process report the * additional information */ old_error = *error_code; if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; /* optimization: if only one process performing i/o, we can perform * a less-expensive Bcast */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); #endif if (fd->hints->cb_nodes == 1) MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); else { tmp_error = *error_code; MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm); } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); #endif if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) *error_code = old_error; if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); /* free all memory allocated for collective I/O */ /* free others_req */ for (i = 0; i < nprocs; i++) { if (others_req[i].count) { ADIOI_Free(others_req[i].offsets); ADIOI_Free(others_req[i].lens); ADIOI_Free(others_req[i].mem_ptrs); } } ADIOI_Free(others_req); /* free my_req here */ for (i = 0; i < nprocs; i++) { if (my_req[i].count) { ADIOI_Free(my_req[i].offsets); ADIOI_Free(my_req[i].lens); } } ADIOI_Free(my_req); for (i = 0; i < nprocs; i++) { ADIOI_Free(buf_idx[i]); } ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); ADIOI_Free(striping_info); #ifdef HAVE_STATUS_SET_BYTES if (status) { MPI_Count bufsize, size; /* Don't set status if it isn't needed */ MPI_Type_size_x(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); } /* This is a temporary way of filling in status. The right way is to * keep track of how much data was actually written during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ }
void ADIOI_XFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) { int err; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_XFS_READCOMPLETE"; #endif if (*request == ADIO_REQUEST_NULL) { *error_code = MPI_SUCCESS; return; } if ((*request)->queued) { do { err = aio_suspend64((const aiocb64_t **) &((*request)->handle), 1, 0); } while ((err == -1) && (errno == EINTR)); if (err != -1) { err = aio_return64((aiocb64_t *) (*request)->handle); (*request)->nbytes = err; errno = aio_error64((aiocb64_t *) (*request)->handle); } else (*request)->nbytes = -1; #ifdef PRINT_ERR_MSG *error_code = (err == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err == -1) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error((*request)->fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif } else *error_code = MPI_SUCCESS; #ifdef HAVE_STATUS_SET_BYTES if ((*request)->nbytes != -1) MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); #endif if ((*request)->queued != -1) { /* queued = -1 is an internal hack used when the request must be completed, but the request object should not be freed. This is used in ADIOI_Complete_async, because the user will call MPI_Wait later, which would require status to be filled. Ugly but works. queued = -1 should be used only in ADIOI_Complete_async. This should not affect the user in any way. */ /* if request is still queued in the system, it is also there on ADIOI_Async_list. Delete it from there. */ if ((*request)->queued) ADIOI_Del_req_from_list(request); (*request)->fd->async_count--; if ((*request)->handle) ADIOI_Free((*request)->handle); ADIOI_Free_request((ADIOI_Req_node *) (*request)); *request = ADIO_REQUEST_NULL; } }
void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { ssize_t err=-1; MPI_Count datatype_size, len; ADIO_Offset bytes_xfered=0; size_t rd_count; static char myname[] = "ADIOI_NFS_READCONTIG"; char *p; MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; } p = buf; while (bytes_xfered < len ) { rd_count = len - bytes_xfered; /* FreeBSD and Darwin workaround: bigger than INT_MAX is an error */ if (rd_count > INT_MAX) rd_count = INT_MAX; if (fd->atomicity) ADIOI_WRITE_LOCK(fd, offset+bytes_xfered, SEEK_SET, rd_count); else ADIOI_READ_LOCK(fd, offset+bytes_xfered, SEEK_SET, rd_count); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered); /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } /* --END ERROR HANDLING-- */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif ADIOI_UNLOCK(fd, offset+bytes_xfered, SEEK_SET, rd_count); if (err == 0) { /* end of file */ break; } bytes_xfered += err; p += err; } fd->fp_sys_posn = offset + bytes_xfered; if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind += bytes_xfered; } /* --END ERROR HANDLING-- */ #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, bytes_xfered); #endif *error_code = MPI_SUCCESS; }
void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { off_t err_lseek = -1; ssize_t err = -1; MPI_Count datatype_size; ADIO_Offset len, bytes_xfered=0; size_t rd_count; static char myname[] = "ADIOI_GEN_READCONTIG"; char *p; #ifdef AGGREGATION_PROFILE MPE_Log_event (5034, 0, NULL); #endif MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * (ADIO_Offset)count; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; } if (fd->fp_sys_posn != offset) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif err_lseek = lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_lseek == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ } p=buf; while (bytes_xfered < len) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif rd_count = len - bytes_xfered; err = read(fd->fd_sys, p, rd_count); /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ if (err == 0) { /* end of file */ break; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif bytes_xfered += err; p += err; } fd->fp_sys_posn = offset + bytes_xfered; if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind += bytes_xfered; } #ifdef HAVE_STATUS_SET_BYTES /* what if we only read half a datatype? */ /* bytes_xfered could be larger than int */ if (err != -1) MPIR_Status_set_bytes(status, datatype, bytes_xfered); #endif *error_code = MPI_SUCCESS; #ifdef AGGREGATION_PROFILE MPE_Log_event (5035, 0, NULL); #endif }
int ADIOI_PVFS2_StridedDtypeIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code, int rw_type) { int ret = -1, filetype_is_contig = -1; MPI_Count filetype_size = -1; int num_filetypes = 0, cur_flat_file_reg_off = 0; PVFS_Request tmp_mem_req, mem_req, tmp_file_req, file_req; PVFS_sysresp_io resp_io; ADIO_Offset off = -1, bytes_into_filetype = 0; MPI_Aint filetype_extent = -1; int i = -1; MPI_Count etype_size; PVFS_size pvfs_disp = -1; ADIOI_Flatlist_node *flat_file_p = ADIOI_Flatlist; /* Use for offseting the PVFS2 filetype */ int pvfs_blk = 1; ADIOI_PVFS2_fs *pvfs_fs; static char myname[] = "ADIOI_PVFS2_STRIDED_DTYPE"; memset(&tmp_mem_req, 0, sizeof(PVFS_Request)); memset(&mem_req, 0, sizeof(PVFS_Request)); memset(&tmp_file_req, 0, sizeof(PVFS_Request)); memset(&file_req, 0, sizeof(PVFS_Request)); pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); /* changed below if error */ *error_code = MPI_SUCCESS; /* datatype is the memory type * fd->filetype is the file type */ MPI_Type_size_x(fd->filetype, &filetype_size); if (filetype_size == 0) { *error_code = MPI_SUCCESS; return -1; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(fd->etype, &etype_size); if (filetype_size == 0) { *error_code = MPI_SUCCESS; return -1; } /* offset is in units of etype relative to the filetype. We * convert this to off in terms of actual data bytes (the offset * minus the number of bytes that are not used). We are allowed * to do this since PVFS2 handles offsets with respect to a * file_req in bytes, otherwise we would have to convert into a * pure byte offset as is done in other methods. Explicit offset * case is handled by using fd->disp and byte-converted off. */ pvfs_disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { if (filetype_is_contig) { off = fd->fp_ind - fd->disp; } else { int flag = 0; /* Should have already been flattened in ADIO_Open*/ while (flat_file_p->type != fd->filetype) { flat_file_p = flat_file_p->next; } num_filetypes = -1; while (!flag) { num_filetypes++; for (i = 0; i < flat_file_p->count; i++) { /* Start on a non zero-length region */ if (flat_file_p->blocklens[i]) { if (fd->disp + flat_file_p->indices[i] + (num_filetypes * filetype_extent) + flat_file_p->blocklens[i] > fd->fp_ind && fd->disp + flat_file_p->indices[i] <= fd->fp_ind) { cur_flat_file_reg_off = fd->fp_ind - (fd->disp + flat_file_p->indices[i] + (num_filetypes * filetype_extent)); flag = 1; break; } else bytes_into_filetype += flat_file_p->blocklens[i]; } } } /* Impossible that we don't find it in this datatype */ assert(i != flat_file_p->count); off = bytes_into_filetype + cur_flat_file_reg_off; } } else /* ADIO_EXPLICIT */ { off = etype_size * offset; } #ifdef DEBUG_DTYPE fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: (fd->fp_ind=%Ld,fd->disp=%Ld," " offset=%Ld),(pvfs_disp=%Ld,off=%Ld)\n", fd->fp_ind, fd->disp, offset, pvfs_disp, off); #endif /* Convert the MPI memory and file datatypes into * PVFS2 datatypes */ ret = convert_mpi_pvfs2_dtype(&datatype, &tmp_mem_req); if (ret < 0) { goto error_state; } ret = convert_mpi_pvfs2_dtype(&(fd->filetype), &tmp_file_req); if (ret < 0) { goto error_state; } ret = PVFS_Request_contiguous(count, tmp_mem_req, &mem_req); if (ret != 0) /* TODO: convert this to MPIO error handling */ fprintf(stderr, "ADIOI_PVFS2_stridedDtypeIO: error in final" " CONTIG memory type\n"); PVFS_Request_free(&tmp_mem_req); /* pvfs_disp is used to offset the filetype */ ret = PVFS_Request_hindexed(1, &pvfs_blk, &pvfs_disp, tmp_file_req, &file_req); if (ret != 0) fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: error in final" " HINDEXED file type\n"); PVFS_Request_free(&tmp_file_req); if (rw_type == READ) ret = PVFS_sys_read(pvfs_fs->object_ref, file_req, off, buf, mem_req, &(pvfs_fs->credentials), &resp_io); else ret = PVFS_sys_write(pvfs_fs->object_ref, file_req, off, buf, mem_req, &(pvfs_fs->credentials), &resp_io); if (ret != 0) { fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: Warning - PVFS_sys_" "read/write returned %d and completed %Ld bytes.\n", ret, (long long)resp_io.total_completed); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(ret), "Error in PVFS_sys_io \n", 0); goto error_state; } if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind = off += resp_io.total_completed; } error_state: fd->fp_sys_posn = -1; /* set it to null. */ PVFS_Request_free(&mem_req); PVFS_Request_free(&file_req); #ifdef DEBUG_DTYPE fprintf(stderr, "ADIOI_PVFS2_StridedDtypeIO: " "resp_io.total_completed=%Ld,ret=%d\n", resp_io.total_completed, ret); #endif #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, resp_io.total_completed); /* This is a temporary way of filling in status. The right way is to * keep track of how much data was actually acccessed by * ADIOI_BUFFERED operations */ #endif return ret; }
void ADIOI_XFS_WriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err=-1, datatype_size, len, diff, size, nbytes; void *newbuf; static char myname[] = "ADIOI_XFS_WRITECONTIG"; MPI_Type_size(datatype, &datatype_size); len = datatype_size * count; fd->fp_sys_posn = -1; /* set it to null, since we are using pwrite */ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; if (!(fd->direct_write)) /* direct I/O not enabled */ err = pwrite(fd->fd_sys, buf, len, offset); else { /* direct I/O enabled */ /* (1) if mem_aligned && file_aligned use direct I/O to write up to correct io_size use buffered I/O for remaining */ if (!(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz)) ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, len, offset, &err); /* (2) if !file_aligned use buffered I/O to write up to file_aligned At that point, if still mem_aligned, use (1) else copy into aligned buf and then use (1) */ else if (offset % fd->d_miniosz) { diff = fd->d_miniosz - (offset % fd->d_miniosz); diff = ADIOI_MIN(diff, len); nbytes = pwrite(fd->fd_sys, buf, diff, offset); buf = ((char *) buf) + diff; offset += diff; size = len - diff; if (!(((long) buf) % fd->d_mem)) { ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, size, offset, &err); nbytes += err; } else { newbuf = (void *) memalign(XFS_MEMALIGN, size); if (newbuf) { memcpy(newbuf, buf, size); ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, size, offset, &err); nbytes += err; free(newbuf); } else nbytes += pwrite(fd->fd_sys, buf, size, offset); } err = nbytes; } /* (3) if !mem_aligned && file_aligned copy into aligned buf, then use (1) */ else { newbuf = (void *) memalign(XFS_MEMALIGN, len); if (newbuf) { memcpy(newbuf, buf, len); ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, len, offset, &err); free(newbuf); } else err = pwrite(fd->fd_sys, buf, len, offset); } } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += err; #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; }
void ADIOI_PVFS_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err=-1, datatype_size, len; static char myname[] = "ADIOI_PVFS_READCONTIG"; MPI_Type_size(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (fd->fp_sys_posn != offset) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif pvfs_lseek64(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = pvfs_read(fd->fd_sys, buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (err>0) fd->fp_sys_posn = offset + err; /* individual file pointer not updated */ } else { /* read from curr. location of ind. file pointer */ if (fd->fp_sys_posn != fd->fp_ind) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = pvfs_read(fd->fd_sys, buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (err > 0) fd->fp_ind += err; fd->fp_sys_posn = fd->fp_ind; } #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; }
void ADIOI_PVFS_ReadStridedListIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, l, brd_size, frd_size=0, st_index=0; int bufsize, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, disp, start_off; int flag, st_frd_size, st_n_filetypes; int new_brd_size, new_frd_size; int mem_list_count, file_list_count; char **mem_offsets; int64_t *file_offsets; int *mem_lengths; int32_t *file_lengths; int total_blks_to_read; int max_mem_list, max_file_list; int b_blks_read; int f_data_read; int size_read=0, n_read_lists, extra_blks; int end_brd_size, end_frd_size; int start_k, start_j, new_file_read, new_buffer_read; int start_mem_offset; #define MAX_ARRAY_SIZE 1024 #ifndef PRINT_ERR_MESG static char myname[] = "ADIOI_PVFS_ReadStrided"; #endif *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ int64_t file_offsets; int32_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_read = count*flat_buf->count; b_blks_read = 0; /* allocate arrays according to max usage */ if (total_blks_to_read > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_read; mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*)); mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_read < total_blks_to_read) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_read % MAX_ARRAY_SIZE] = (char*)((char *)buf + j*buftype_extent + flat_buf->indices[i]); mem_lengths[b_blks_read % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_read++; if (!(b_blks_read % MAX_ARRAY_SIZE) || (b_blks_read == total_blks_to_read)) { /* in the case of the last read list call, adjust mem_list_count */ if (b_blks_read == total_blks_to_read) { mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE; /* in case last read list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } pvfs_read_list(fd->fd_sys ,mem_list_count, mem_offsets, mem_lengths, file_list_count, &file_offsets, &file_lengths); /* in the case of the last read list call, leave here */ if (b_blks_read == total_blks_to_read) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_read < total_blks_to_read) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This isa temporary way of filling in status. The right way is to keep tracke of how much data was actually read adn placed in buf by ADIOI_BUFFERED_READ. */ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* know file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; /* for each case - ADIO_Individual pointer or explicit, find the file offset in bytes (offset), n_filetypes (how many filetypes into file to start), frd_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; frd_size = (int) (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset); flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_frd_size = frd_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ int mem_lengths; char *mem_offsets; i = 0; j = st_index; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to read */ f_data_read = ADIOI_MIN(st_frd_size, bufsize); total_blks_to_read = 1; j++; while (f_data_read < bufsize) { f_data_read += flat_file->blocklens[j]; total_blks_to_read++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE; extra_blks = total_blks_to_read%MAX_ARRAY_SIZE; mem_offsets = buf; mem_lengths = 0; /* if at least one full readlist, allocate file arrays at max array size and don't free until very end */ if (n_read_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int32_t)); } /* if there's no full readlist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(extra_blks* sizeof(int32_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_read_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_frd_size; mem_lengths = st_frd_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ pvfs_read_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_read_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_frd_size; } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - (int32_t) mem_lengths - (int32_t) mem_offsets + (int32_t) buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ pvfs_read_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_read = 0; n_filetypes = st_n_filetypes; frd_size = st_frd_size; brd_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_read < bufsize) { k = start_k; new_buffer_read = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) { end_brd_size = new_buffer_read + flat_buf->blocklens[k] - (bufsize - size_read); new_buffer_read = bufsize - size_read; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } else new_buffer_read = brd_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ j = start_j; new_file_read = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) { if(file_list_count) { if((new_file_read + flat_file->blocklens[j]) > new_buffer_read) { end_frd_size = new_buffer_read - new_file_read; new_file_read = new_buffer_read; j--; } else { new_file_read += flat_file->blocklens[j]; end_frd_size = flat_file->blocklens[j]; } } else { if (frd_size > new_buffer_read) { new_file_read = new_buffer_read; frd_size = new_file_read; } else new_file_read = frd_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_read = 0; mem_list_count = 0; while (new_buffer_read < new_file_read) { if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) { end_brd_size = new_file_read - new_buffer_read; new_buffer_read = new_file_read; k--; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { new_buffer_read = brd_size; if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_read < new_file_read) */ } /* if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ /* fakes filling the readlist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_brd_size) brd_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { brd_size = flat_buf->blocklens[k] - end_brd_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_frd_size) frd_size = flat_file->blocklens[(j+1)% flat_file->count]; else { frd_size = flat_file->blocklens[j] - end_frd_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_read += new_buffer_read; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; } /* while (size_read < bufsize) */ mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *)); mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int)); file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t)); file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t)); size_read = 0; n_filetypes = st_n_filetypes; frd_size = st_frd_size; brd_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_frd_size and new_brd_size */ while (size_read < bufsize) { k = start_k; new_buffer_read = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) { end_brd_size = new_buffer_read + flat_buf->blocklens[k] - (bufsize - size_read); new_buffer_read = bufsize - size_read; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } else new_buffer_read = brd_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ j = start_j; new_file_read = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) { if(file_list_count) { if((new_file_read + flat_file->blocklens[j]) > new_buffer_read) { end_frd_size = new_buffer_read - new_file_read; new_file_read = new_buffer_read; j--; } else { new_file_read += flat_file->blocklens[j]; end_frd_size = flat_file->blocklens[j]; } } else { if (frd_size > new_buffer_read) { new_file_read = new_buffer_read; frd_size = new_file_read; } else new_file_read = frd_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_read = 0; mem_list_count = 0; while (new_buffer_read < new_file_read) { if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) { end_brd_size = new_file_read - new_buffer_read; new_buffer_read = new_file_read; k--; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { new_buffer_read = brd_size; if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_read < new_file_read) */ } /* if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ /* fills the allocated readlist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = (char*)((char *)buf + buftype_extent* (buf_count/flat_buf->count) + (int)flat_buf->indices[k]); if(!i) { mem_lengths[0] = brd_size; mem_offsets[0] += flat_buf->blocklens[k] - brd_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_brd_size; if (flat_buf->blocklens[k] == end_brd_size) brd_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { brd_size = flat_buf->blocklens[k] - end_brd_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + n_filetypes * filetype_extent; if (!i) { file_lengths[0] = frd_size; file_offsets[0] += flat_file->blocklens[j] - frd_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_frd_size; if (flat_file->blocklens[j] == end_frd_size) frd_size = flat_file->blocklens[(j+1)% flat_file->count]; else { frd_size = flat_file->blocklens[j] - end_frd_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ /* printf("about to call read_list in noncontig/noncontig\n"); printf("offsets and lengths in terms of integers\n"); printf("\nmem_list_count = %d\n", mem_list_count); for (i=0; i<mem_list_count; i++) { printf("mem_offsets[%2d] = %2d ", i, (int)(mem_offsets[i] - (int)buf)/4); printf("mem_lengths[%2d] = %2d\n", i, mem_lengths[i]/4); } printf("\nfile_list_count = %d\n", file_list_count); for (i=0; i<file_list_count; i++) { printf("file_offsets[%2d] = %2d ", i, (int)file_offsets[i]/4); printf("file_lengths[%2d] = %2d\n", i, file_lengths[i]/4); } printf("\n\n"); */ pvfs_read_list(fd->fd_sys,mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths); size_read += new_buffer_read; start_k = k; start_j = j; } /* while (size_read < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually read and placed in buf by ADIOI_BUFFERED_READ. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { char myname[]="ADIOI_GRIDFTP_WriteDiscontig"; int myrank,nprocs; MPI_Aint btype_size,btype_extent; MPI_Aint ftype_size,ftype_extent; MPI_Aint etype_size; MPI_Aint extent; ADIOI_Flatlist_node *flat_file; int buf_contig,boff,i,nblks; globus_off_t start,end,goff; globus_size_t bytes_written; globus_result_t result; MPI_Comm_rank(fd->comm,&myrank); MPI_Comm_size(fd->comm,&nprocs); etype_size=fd->etype_size; MPI_Type_size(fd->filetype,&ftype_size); MPI_Type_extent(fd->filetype,&ftype_extent); /* This is arguably unnecessary, as this routine assumes that the buffer in memory is contiguous */ MPI_Type_size(datatype,&btype_size); MPI_Type_extent(datatype,&btype_extent); ADIOI_Datatype_iscontig(datatype,&buf_contig); if ( ( btype_extent!=btype_size ) || ( ! buf_contig ) ) { FPRINTF(stderr,"[%d/%d] %s called with discontigous memory buffer\n", myrank,nprocs,myname); fflush(stderr); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", globus_object_printable_to_string(globus_error_get(result))); return; } /* from here we can assume btype_extent==btype_size */ /* Flatten out fd->filetype so we know which blocks to skip */ ADIOI_Flatten_datatype(fd->filetype); flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype && flat_file->next!=NULL) flat_file = flat_file->next; /* Figure out how big the area to write is */ /* ASSUMPTION: ftype_size is an integer multiple of btype_size or vice versa. */ start=(globus_off_t)(offset*etype_size); goff=start; boff=0; extent=0; nblks=0; while ( boff < (count*btype_size) ) { int blklen; for (i=0;i<flat_file->count;i++) { if ( (boff+flat_file->blocklens[i]) < (count*btype_size) ) blklen=flat_file->blocklens[i]; else blklen=(count*btype_size)-boff; boff+=blklen; extent=MAX(extent,nblks*ftype_extent+flat_file->indices[i]+blklen); if ( boff>=(count*btype_size) ) break; } nblks++; } if ( extent < count*btype_size ) { FPRINTF(stderr,"[%d/%d] %s error in computing extent -- extent %d is smaller than total bytes requested %d!\n", myrank,nprocs,myname,extent,count*btype_size); fflush(stderr); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", globus_object_printable_to_string(globus_error_get(result))); return; } end=start+(globus_off_t)extent; FPRINTF(stderr,"[%d/%d] %s writing %d bytes into extent of %d bytes starting at offset %Ld\n", myrank,nprocs,myname,count*btype_size,extent,(long long)start); fflush(stderr); /* start up the globus partial write */ globus_mutex_init(&writediscontig_ctl_lock, GLOBUS_NULL); globus_cond_init(&writediscontig_ctl_cond, GLOBUS_NULL); writediscontig_ctl_done=GLOBUS_FALSE; if ( (result=globus_ftp_client_partial_put(&(gridftp_fh[fd->fd_sys]), fd->filename, &(oattr[fd->fd_sys]), GLOBUS_NULL, start, end, writediscontig_ctl_cb, GLOBUS_NULL))!=GLOBUS_SUCCESS ) { globus_err_handler("globus_ftp_client_partial_get",myname,result); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", globus_object_printable_to_string(globus_error_get(result))); return; } /* Do all the actual I/Os */ boff=0; nblks=0; while ( boff < (count*btype_size) ) { int i,blklen; for (i=0;i<flat_file->count;i++) { if ( (boff+flat_file->blocklens[i]) < (count*btype_size) ) blklen=flat_file->blocklens[i]; else blklen=(count*btype_size)-boff; if ( blklen > 0 ) { goff=start+nblks*ftype_extent+((globus_off_t)flat_file->indices[i]); /* FPRINTF(stderr,"[%d/%d] %s writing %d bytes from boff=%d at goff=%Ld\n",myrank,nprocs,myname,blklen,boff,goff); */ if ( (result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]), ((globus_byte_t *)buf)+boff, (globus_size_t)blklen, goff, GLOBUS_TRUE, writediscontig_data_cb, (void *)(&bytes_written)))!=GLOBUS_SUCCESS ) { globus_err_handler("globus_ftp_client_register_write",myname,result); *error_code=MPI_ERR_IO; ADIOI_Error(fd,*error_code,myname); return; } boff+=blklen; if ( boff>=(count*btype_size) ) break; } } nblks++; } /* The ctl callback won't start till the data callbacks complete, so it's safe to wait on just the ctl callback */ globus_mutex_lock(&writediscontig_ctl_lock); while ( writediscontig_ctl_done!=GLOBUS_TRUE ) globus_cond_wait(&writediscontig_ctl_cond,&writediscontig_ctl_lock); globus_mutex_unlock(&writediscontig_ctl_lock); globus_mutex_destroy(&writediscontig_ctl_lock); globus_cond_destroy(&writediscontig_ctl_cond); #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bytes_written); #endif if (file_ptr_type != ADIO_EXPLICIT_OFFSET) { fd->fp_ind += extent; fd->fp_sys_posn = fd->fp_ind; } else { fd->fp_sys_posn = offset + extent; } }
void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { char myname[]="ADIOI_GRIDFTP_WriteContig"; int myrank, nprocs, datatype_size; globus_size_t len,bytes_written=0; globus_off_t goff; globus_result_t result; if ( fd->access_mode&ADIO_RDONLY ) { *error_code=MPI_ERR_AMODE; return; } *error_code = MPI_SUCCESS; MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); MPI_Type_size(datatype, &datatype_size); if (file_ptr_type != ADIO_EXPLICIT_OFFSET) { offset = fd->fp_ind; } /* Do the gridftp I/O transfer */ goff = (globus_off_t)offset; len = ((globus_size_t)datatype_size)*((globus_size_t)count); globus_mutex_init(&writecontig_ctl_lock, GLOBUS_NULL); globus_cond_init(&writecontig_ctl_cond, GLOBUS_NULL); writecontig_ctl_done=GLOBUS_FALSE; if ( (result=globus_ftp_client_partial_put(&(gridftp_fh[fd->fd_sys]), fd->filename, &(oattr[fd->fd_sys]), GLOBUS_NULL, goff, goff+(globus_off_t)len, writecontig_ctl_cb, GLOBUS_NULL))!=GLOBUS_SUCCESS ) { globus_err_handler("globus_ftp_client_partial_put",myname,result); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", globus_object_printable_to_string(globus_error_get(result))); return; } if ( (result=globus_ftp_client_register_write(&(gridftp_fh[fd->fd_sys]), (globus_byte_t *)buf, len, goff, GLOBUS_TRUE, writecontig_data_cb, (void *)(&bytes_written)))!=GLOBUS_SUCCESS ) { globus_err_handler("globus_ftp_client_register_write",myname,result); *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", globus_object_printable_to_string(globus_error_get(result))); return; } /* The ctl callback won't start till the data callbacks complete, so it's safe to wait on just the ctl callback */ globus_mutex_lock(&writecontig_ctl_lock); while ( writecontig_ctl_done!=GLOBUS_TRUE ) globus_cond_wait(&writecontig_ctl_cond,&writecontig_ctl_lock); globus_mutex_unlock(&writecontig_ctl_lock); globus_mutex_destroy(&writecontig_ctl_lock); globus_cond_destroy(&writecontig_ctl_cond); #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bytes_written); #endif if (file_ptr_type != ADIO_EXPLICIT_OFFSET) { offset = fd->fp_ind; fd->fp_ind += bytes_written; fd->fp_sys_posn = fd->fp_ind; } else { fd->fp_sys_posn = offset + bytes_written; } }
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Uses a generalized version of the extended two-phase method described in "An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, (5)4:301--317, Winter 1996. http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_Access *my_req; /* array of nprocs structures, one for each other process in whose file domain this process's request lies */ ADIOI_Access *others_req; /* array of nprocs structures, one for each other process whose request lies in this process's file domain. */ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; int contig_access_count=0, interleave_count = 0, buftype_is_contig; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, *fd_end = NULL, *end_offsets = NULL; ADIO_Offset *len_list = NULL; int *buf_idx = NULL; #ifdef HAVE_STATUS_SET_BYTES MPI_Count bufsize, size; #endif if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype, file_ptr_type, offset, status, error_code); return; } MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); /* number of aggregators, cb_nodes, is stored in the hints */ nprocs_for_coll = fd->hints->cb_nodes; orig_fp = fd->fp_ind; /* only check for interleaving if cb_read isn't disabled */ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count); #ifdef RDCOLL_DEBUG for (i=0; i<contig_access_count; i++) { DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n", myrank, offset_list[i], len_list[i]); } #endif /* each process communicates its start and end offsets to other processes. The result is an array each of start and end offsets stored in order of process rank. */ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); /* are the accesses of different processes interleaved? */ for (i=1; i<nprocs; i++) if ((st_offsets[i] < end_offsets[i-1]) && (st_offsets[i] <= end_offsets[i])) interleave_count++; /* This is a rudimentary check for interleaving, but should suffice for the moment. */ } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); if (fd->hints->cb_read == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) { /* don't do aggregation */ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } /* We're going to perform aggregation of I/O. Here we call * ADIOI_Calc_file_domains() to determine what processes will handle I/O * to what regions. We pass nprocs_for_coll into this function; it is * used to determine how many processes will perform I/O, which is also * the number of regions into which the range of bytes must be divided. * These regions are called "file domains", or FDs. * * When this function returns, fd_start, fd_end, fd_size, and * min_st_offset will be filled in. fd_start holds the starting byte * location for each file domain. fd_end holds the ending byte location. * min_st_offset holds the minimum byte location that will be accessed. * * Both fd_start[] and fd_end[] are indexed by an aggregator number; this * needs to be mapped to an actual rank in the communicator later. * */ ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); /* calculate where the portions of the access requests of this process * are located in terms of the file domains. this could be on the same * process or on other processes. this function fills in: * count_my_req_procs - number of processes (including this one) for which * this process has requests in their file domain * count_my_req_per_proc - count of requests for each process, indexed * by rank of the process * my_req[] - array of data structures describing the requests to be * performed by each process (including self). indexed by rank. * buf_idx[] - array of locations into which data can be directly moved; * this is only valid for contiguous buffer case */ ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, fd_end, fd_size, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* perform a collective communication in order to distribute the * data calculated above. fills in the following: * count_others_req_procs - number of processes (including this * one) which have requests in this process's file domain. * count_others_req_per_proc[] - number of separate contiguous * requests from proc i lie in this process's file domain. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); /* my_req[] and count_my_req_per_proc aren't needed at this point, so * let's free the memory */ ADIOI_Free(count_my_req_per_proc); for (i=0; i<nprocs; i++) { if (my_req[i].count) { ADIOI_Free(my_req[i].offsets); ADIOI_Free(my_req[i].lens); } } ADIOI_Free(my_req); /* read data in sizes of no more than ADIOI_Coll_bufsize, * communicate, and fill user buf. */ ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank, others_req, offset_list, len_list, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buf_idx, error_code); if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); /* free all memory allocated for collective I/O */ for (i=0; i<nprocs; i++) { if (others_req[i].count) { ADIOI_Free(others_req[i].offsets); ADIOI_Free(others_req[i].lens); ADIOI_Free(others_req[i].mem_ptrs); } } ADIOI_Free(others_req); ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); ADIOI_Free(fd_start); ADIOI_Free(fd_end); #ifdef HAVE_STATUS_SET_BYTES MPI_Type_size_x(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually read and placed in buf during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ }
void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err=-1; MPI_Count datatype_size, len; static char myname[] = "ADIOI_NFS_WRITECONTIG"; MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (fd->fp_sys_posn != offset) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif } ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif ADIOI_UNLOCK(fd, offset, SEEK_SET, len); fd->fp_sys_posn = offset + err; /* individual file pointer not updated */ } else { /* write from curr. location of ind. file pointer */ offset = fd->fp_ind; if (fd->fp_sys_posn != fd->fp_ind) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif } ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif ADIOI_UNLOCK(fd, offset, SEEK_SET, len); fd->fp_ind += err; fd->fp_sys_posn = fd->fp_ind; } /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); return; } /* --END ERROR HANDLING-- */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, err); #endif *error_code = MPI_SUCCESS; }
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, sum, size_in_filetype; int i, j, k, st_index=0; int n_etypes_in_filetype; ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf; unsigned bufsize, writebuf_len, write_sz; ADIO_Status status1; ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len; int stripe_size; static char myname[] = "ADIOI_LUSTRE_WriteStrided"; if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on writes, use naive * approach instead. */ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if (!filetype_size) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get striping info */ stripe_size = fd->hints->striping_unit; /* Different buftype to different filetype */ if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = start_off + bufsize - 1; /* write stripe size buffer each time */ writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); for (j = 0; j < count; j++) { for (i = 0; i < flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } } /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); if (*error_code != MPI_SUCCESS) { ADIOI_Free(writebuf); return; } ADIOI_Free(writebuf); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i = 0; i < flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file * contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { req_off = start_off; req_len = bufsize; end_offset = start_off + bufsize - 1; writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; userbuf_off = 0; ADIOI_BUFFERED_WRITE_WITHOUT_READ /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif ADIOI_Free(writebuf); return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); writebuf_off = 0; writebuf_len = 0; writebuf = (char *) ADIOI_Malloc(stripe_size); memset(writebuf, -1, stripe_size); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
/* Copied from ADIOI_PVFS2_OldWriteStrided. It would be good to have fewer * copies of this code... */ void ADIOI_ZOIDFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* as with all the other WriteStrided functions, offset is in units of * etype relative to the filetype */ /* Since zoidfs does not support file locking, can't do buffered writes as on Unix */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, bwr_size, fwr_size=0, st_index=0; int bufsize, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp, start_off, initial_off; int flag, st_fwr_size, st_n_filetypes; int err_flag=0; size_t mem_list_count, file_list_count; const void ** mem_offsets; uint64_t *file_offsets; size_t *mem_lengths; uint64_t *file_lengths; int total_blks_to_write; int max_mem_list, max_file_list; int b_blks_wrote; int f_data_wrote; int size_wrote=0, n_write_lists, extra_blks; int end_bwr_size, end_fwr_size; int start_k, start_j, new_file_write, new_buffer_write; int start_mem_offset; ADIOI_ZOIDFS_object *zoidfs_obj_ptr; MPI_Offset total_bytes_written=0; static char myname[] = "ADIOI_ZOIDFS_WRITESTRIDED"; /* note: I don't know what zoidfs will do if you pass it a super-long list, * so let's keep with the PVFS limit for now */ #define MAX_ARRAY_SIZE 64 /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "Atomic noncontiguous writes are not supported by ZOIDFS", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); /* the HDF5 tests showed a bug in this list processing code (see many many * lines down below). We added a workaround, but common HDF5 file types * are actually contiguous and do not need the expensive workarond */ if (!filetype_is_contig) { flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; if (flat_file->count == 1 && !buftype_is_contig) filetype_is_contig = 1; } MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; zoidfs_obj_ptr = (ADIOI_ZOIDFS_object*)fd->fs_ptr; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ uint64_t file_offsets; uint64_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; } else off = fd->fp_ind; file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_write = count*flat_buf->count; b_blks_wrote = 0; /* allocate arrays according to max usage */ if (total_blks_to_write > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_write; mem_offsets = (void*)ADIOI_Malloc(mem_list_count*sizeof(void*)); mem_lengths = (size_t*)ADIOI_Malloc(mem_list_count*sizeof(size_t)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_wrote < total_blks_to_write) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = buf + j*buftype_extent + flat_buf->indices[i]; mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_wrote++; if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) { /* in the case of the last write list call, adjust mem_list_count */ if (b_blks_wrote == total_blks_to_write) { mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE; /* in case last write list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, mem_list_count, mem_offsets, mem_lengths, 1, &file_offsets, &file_lengths, ZOIDFS_NO_OP_HINT)); /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); break; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif total_bytes_written += file_lengths; /* in the case of error or the last write list call, * leave here */ /* --BEGIN ERROR HANDLING-- */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); break; } /* --END ERROR HANDLING-- */ if (b_blks_wrote == total_blks_to_write) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_wrote < total_blks_to_write) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += total_bytes_written; if (!err_flag) *error_code = MPI_SUCCESS; fd->fp_sys_posn = -1; /* clear this. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* already know that file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; initial_off = offset; /* for each case - ADIO_Individual pointer or explicit, find offset (file offset in bytes), n_filetypes (how many filetypes into file to start), fwr_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ /* only one memory off-len pair, so no array */ size_t mem_lengths; size_t mem_offsets; i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to write */ f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize); total_blks_to_write = 1; if (j < (flat_file->count -1)) j++; else { j = 0; n_filetypes++; } while (f_data_wrote < bufsize) { f_data_wrote += flat_file->blocklens[j]; total_blks_to_write++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE; extra_blks = total_blks_to_write%MAX_ARRAY_SIZE; mem_offsets = (size_t)buf; mem_lengths = 0; /* if at least one full writelist, allocate file arrays at max array size and don't free until very end */ if (n_write_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(uint64_t)); } /* if there's no full writelist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(extra_blks* sizeof(uint64_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_write_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; mem_lengths = st_fwr_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, 1, buf, &mem_lengths, file_list_count, file_offsets, file_lengths, ZOIDFS_NO_OP_HINT)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += mem_lengths; mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_write_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = ADIOI_MIN(st_fwr_size, bufsize); } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; /* XXX: double-check these casts */ if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - mem_lengths - mem_offsets + (size_t)buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, 1, (const void **)&mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths, ZOIDFS_NO_OP_HINT)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += mem_lengths; } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fakes filling the writelist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_wrote += new_buffer_write; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; } /* while (size_wrote < bufsize) */ /* one last check before we actually carry out the operation: * this code has hard-to-fix bugs when a noncontiguous file type has * such large pieces that the sum of the lengths of the memory type is * not larger than one of those pieces (and vice versa for large memory * types and many pices of file types. In these cases, give up and * fall back to naive reads and writes. The testphdf5 test created a * type with two very large memory regions and 600 very small file * regions. The same test also created a type with one very large file * region and many (700) very small memory regions. both cases caused * problems for this code */ if ( ( (file_list_count == 1) && (new_file_write < flat_file->blocklens[0] ) ) || ((mem_list_count == 1) && (new_buffer_write < flat_buf->blocklens[0]) ) || ((file_list_count == MAX_ARRAY_SIZE) && (new_file_write < flat_buf->blocklens[0]) ) || ( (mem_list_count == MAX_ARRAY_SIZE) && (new_buffer_write < flat_file->blocklens[0])) ) { ADIOI_Delete_flattened(datatype); ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, initial_off, status, error_code); return; } mem_offsets = (void *)ADIOI_Malloc(max_mem_list*sizeof(void *)); mem_lengths = (size_t*)ADIOI_Malloc(max_mem_list*sizeof(size_t)); file_offsets = (uint64_t *)ADIOI_Malloc(max_file_list*sizeof(uint64_t)); file_lengths = (uint64_t*)ADIOI_Malloc(max_file_list*sizeof(uint64_t)); size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_fwr_size and new_bwr_size */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fills the allocated writelist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = buf + buftype_extent* (buf_count/flat_buf->count) + flat_buf->indices[k]; if(!i) { mem_lengths[0] = bwr_size; mem_offsets[0] += flat_buf->blocklens[k] - bwr_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_bwr_size; if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + ((ADIO_Offset)n_filetypes) * filetype_extent; if (!i) { file_lengths[0] = fwr_size; file_offsets[0] += flat_file->blocklens[j] - fwr_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_fwr_size; if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif NO_STALE(err_flag, fd, zoidfs_obj_ptr, zoidfs_write(zoidfs_obj_ptr, mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths, ZOIDFS_NO_OP_HINT)); /* --BEGIN ERROR HANDLING-- */ if (err_flag != ZFS_OK) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(err_flag), "Error in zoidfs_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif size_wrote += new_buffer_write; total_bytes_written += new_buffer_write; /* XXX: is this right? */ start_k = k; start_j = j; } /* while (size_wrote < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } /* when incrementing fp_ind, need to also take into account the file type: * consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----| * if we wrote N elements, offset needs to point at beginning of type, not * at empty region at offset N+1). * * As we discussed on mpich-discuss in may/june 2009, the code below might * look wierd, but by putting fp_ind at the last byte written, the next * time we run through the strided code we'll update the fp_ind to the * right location. */ if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind = file_offsets[file_list_count-1]+ file_lengths[file_list_count-1]; } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); *error_code = MPI_SUCCESS; error_state: fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err = -1, datatype_size; ADIO_Offset len; static char myname[] = "ADIOI_GEN_READCONTIG"; #ifdef AGGREGATION_PROFILE MPE_Log_event (5034, 0, NULL); #endif MPI_Type_size(datatype, &datatype_size); len = (ADIO_Offset)datatype_size * (ADIO_Offset)count; ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; } if (fd->fp_sys_posn != offset) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif err = lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, buf, (unsigned int)len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ fd->fp_sys_posn = offset + err; if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind += err; } #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif *error_code = MPI_SUCCESS; #ifdef AGGREGATION_PROFILE MPE_Log_event (5035, 0, NULL); #endif }
void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, brd_size, st_index=0; int num, size, sum, n_etypes_in_filetype, size_in_filetype; MPI_Count bufsize; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int req_len, partial_read; MPI_Count filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int st_n_filetypes, readbuf_len; ADIO_Offset frd_size=0, new_frd_size, st_frd_size; int new_brd_size, err_flag=0, info_flag, max_bufsize; static char myname[] = "ADIOI_NFS_READSTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, readbuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); /* malloced in the buffered_read macro */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0 ) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i = 0; j = st_index; off = offset; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i < bufsize) { i += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read into readbuf */ readbuf_off = offset; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (int) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (i < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i; ADIOI_BUFFERED_READ } i += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else {
void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { ssize_t err = -1; MPI_Count datatype_size; ADIO_Offset len, bytes_xfered = 0; size_t rd_count; static char myname[] = "ADIOI_GEN_READCONTIG"; #ifdef ROMIO_GPFS double io_time = 0; #endif char *p; #ifdef AGGREGATION_PROFILE MPE_Log_event(5034, 0, NULL); #endif MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * (ADIO_Offset) count; #ifdef ROMIO_GPFS io_time = MPI_Wtime(); if (gpfsmpio_timing) { gpfsmpio_prof_cr[GPFSMPIO_CIO_DATA_SIZE] += len; } #endif if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; } p = buf; while (bytes_xfered < len) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); #endif rd_count = len - bytes_xfered; /* stupid FreeBSD and Darwin do not like a count larger than a signed * int, even though size_t is eight bytes... */ if (rd_count > INT_MAX) rd_count = INT_MAX; #ifdef ROMIO_GPFS if (gpfsmpio_devnullio) err = pread(fd->null_fd, p, rd_count, offset + bytes_xfered); else #endif err = pread(fd->fd_sys, p, rd_count, offset + bytes_xfered); /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ if (err == 0) { /* end of file */ break; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); #endif bytes_xfered += err; p += err; } #ifdef ROMIO_GPFS if (gpfsmpio_timing) gpfsmpio_prof_cr[GPFSMPIO_CIO_T_POSI_RW] += (MPI_Wtime() - io_time); #endif fd->fp_sys_posn = offset + bytes_xfered; if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind += bytes_xfered; } #ifdef HAVE_STATUS_SET_BYTES /* what if we only read half a datatype? */ /* bytes_xfered could be larger than int */ if (err != -1) MPIR_Status_set_bytes(status, datatype, bytes_xfered); #endif *error_code = MPI_SUCCESS; #ifdef AGGREGATION_PROFILE MPE_Log_event(5035, 0, NULL); #endif #ifdef ROMIO_GPFS if (gpfsmpio_timing) gpfsmpio_prof_cr[GPFSMPIO_CIO_T_MPIO_RW] += (MPI_Wtime() - io_time); #endif }
void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err = -1, bwr_size, st_index = 0; ADIO_Offset i_offset, sum, size_in_filetype; ADIO_Offset num, size, n_etypes_in_filetype; MPI_Count bufsize; ADIO_Offset n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype = 0; int req_len; MPI_Count filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off; char *writebuf = NULL, *value; int st_n_filetypes, writebuf_len, write_sz; ADIO_Offset fwr_size = 0, new_fwr_size, st_fwr_size; int new_bwr_size, err_flag = 0, info_flag, max_bufsize; static char myname[] = "ADIOI_NFS_WRITESTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if (!filetype_size) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); for (j = 0; j < count; j++) for (i = 0; i < flat_buf->count; i++) { userbuf_off = j * buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } /* write the buffer out finally */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); #endif lseek(fd->fd_sys, writebuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); #endif if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); #endif err = write(fd->fd_sys, writebuf, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ flat_file = ADIOI_Flatten_and_find(fd->filetype); disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset) n_filetypes *filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i = 0; i < flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset) n_filetypes *filetype_extent; } else { n_etypes_in_filetype = filetype_size / etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i = 0; i < flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file contig block */ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { /* though MPI api has an integer 'count' parameter, derived * datatypes might describe more bytes than can fit into an integer. * if we've made it this far, we can pass a count of original * datatypes, instead of a count of bytes (which might overflow) * Other WriteContig calls in this path are operating on data * sieving buffer */ ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset) n_filetypes *filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif goto fn_exit; } /* Calculate end_offset, the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = MPL_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j] == 0) { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent; fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); memset(writebuf, -1, max_bufsize); writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); #endif lseek(fd->fd_sys, writebuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); #endif #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); #endif err = read(fd->fd_sys, writebuf, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); #endif if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0); goto fn_exit; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = MPL_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in * fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); * err = write(fd->fd_sys, ((char *) buf) + i, fwr_size); */ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE} i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. * no more I/O needed. off is incremented by fwr_size. */ else { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j] == 0) { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent; fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset); } } } else {
void ADIOI_GEN_ReadStrided_naive(ADIO_File fd, void *buf, int count, MPI_Datatype buftype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int brd_size, frd_size=0, b_index; int bufsize, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size, req_len; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, start_off; ADIO_Status status1; *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(buftype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(buftype, &buftype_size); MPI_Type_extent(buftype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* contiguous in buftype and filetype is handled elsewhere */ if (!buftype_is_contig && filetype_is_contig) { int b_count; /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(buftype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != buftype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && (fd->file_system != ADIO_PIOFS) && (fd->file_system != ADIO_PVFS)) { ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); } /* for each region in the buffer, grab the data and put it in * place */ for (b_count=0; b_count < count; b_count++) { for (b_index=0; b_index < flat_buf->count; b_index++) { userbuf_off = b_count*buftype_extent + flat_buf->indices[b_index]; req_off = off; req_len = flat_buf->blocklens[b_index]; ADIO_ReadContig(fd, (char *) buf + userbuf_off, req_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, req_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; /* off is (potentially) used to save the final offset later */ off += flat_buf->blocklens[b_index]; } } if ((fd->atomicity) && (fd->file_system != ADIO_PIOFS) && (fd->file_system != ADIO_PVFS)) { ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; } else { /* noncontiguous in file */ int f_index, st_frd_size, st_index = 0, st_n_filetypes; int flag; /* First we're going to calculate a set of values for use in all * the noncontiguous in file cases: * start_off - starting byte position of data in file * end_offset - last byte offset to be acessed in the file * st_n_filetypes - how far into the file we start in terms of * whole filetypes * st_index - index of block in first filetype that we will be * starting in (?) * st_frd_size - size of the data in the first filetype block * that we will read (accounts for being part-way * into reading this block of the filetype * */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { start_off = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (f_index=0; f_index < flat_file->count; f_index++) { if (disp + flat_file->indices[f_index] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[f_index] >= start_off) { /* this block contains our starting position */ st_index = f_index; frd_size = (int) (disp + flat_file->indices[f_index] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[f_index] - start_off); flag = 1; break; } } } } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (f_index=0; f_index < flat_file->count; f_index++) { sum += flat_file->blocklens[f_index]; if (sum > size_in_filetype) { st_index = f_index; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[f_index] + size_in_filetype - (sum - flat_file->blocklens[f_index]); break; } } /* abs. offset in bytes in the file */ start_off = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } st_frd_size = frd_size; st_n_filetypes = n_filetypes; /* start_off, st_n_filetypes, st_index, and st_frd_size are * all calculated at this point */ /* Calculate end_offset, the last byte-offset that will be accessed. * e.g., if start_off=0 and 100 bytes to be read, end_offset=99 */ userbuf_off = 0; f_index = st_index; off = start_off; frd_size = ADIOI_MIN(st_frd_size, bufsize); while (userbuf_off < bufsize) { userbuf_off += frd_size; end_offset = off + frd_size - 1; if (f_index < (flat_file->count - 1)) f_index++; else { f_index = 0; n_filetypes++; } off = disp + flat_file->indices[f_index] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[f_index], bufsize-(int)userbuf_off); } /* End of calculations. At this point the following values have * been calculated and are ready for use: * - start_off * - end_offset * - st_n_filetypes * - st_index * - st_frd_size */ /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && (fd->file_system != ADIO_PIOFS) && (fd->file_system != ADIO_PVFS)) { ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the * most common case. */ userbuf_off = 0; f_index = st_index; off = start_off; n_filetypes = st_n_filetypes; frd_size = ADIOI_MIN(st_frd_size, bufsize); /* while there is still space in the buffer, read more data */ while (userbuf_off < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. save system call in such cases */ req_off = off; req_len = frd_size; ADIO_ReadContig(fd, (char *) buf + userbuf_off, req_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, req_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; } userbuf_off += frd_size; if (off + frd_size < disp + flat_file->indices[f_index] + flat_file->blocklens[f_index] + (ADIO_Offset) n_filetypes*filetype_extent) { /* important that this value be correct, as it is * used to set the offset in the fd near the end of * this function. */ off += frd_size; } /* did not reach end of contiguous block in filetype. * no more I/O needed. off is incremented by frd_size. */ else { if (f_index < (flat_file->count - 1)) f_index++; else { f_index = 0; n_filetypes++; } off = disp + flat_file->indices[f_index] + (ADIO_Offset) n_filetypes*filetype_extent; frd_size = ADIOI_MIN(flat_file->blocklens[f_index], bufsize-(int)userbuf_off); } } } else { int i, tmp_bufsize = 0; /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(buftype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != buftype) flat_buf = flat_buf->next; b_index = buf_count = 0; i = (int) (flat_buf->indices[0]); f_index = st_index; off = start_off; n_filetypes = st_n_filetypes; frd_size = st_frd_size; brd_size = flat_buf->blocklens[0]; /* while we haven't read size * count bytes, keep going */ while (tmp_bufsize < bufsize) { int new_brd_size = brd_size, new_frd_size = frd_size; size = ADIOI_MIN(frd_size, brd_size); if (size) { req_off = off; req_len = size; userbuf_off = i; ADIO_ReadContig(fd, (char *) buf + userbuf_off, req_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, req_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; } if (size == frd_size) { /* reached end of contiguous block in file */ if (f_index < (flat_file->count - 1)) f_index++; else { f_index = 0; n_filetypes++; } off = disp + flat_file->indices[f_index] + (ADIO_Offset) n_filetypes*filetype_extent; new_frd_size = flat_file->blocklens[f_index]; if (size != brd_size) { i += size; new_brd_size -= size; } } if (size == brd_size) { /* reached end of contiguous block in memory */ b_index = (b_index + 1)%flat_buf->count; buf_count++; i = (int) (buftype_extent*(buf_count/flat_buf->count) + flat_buf->indices[b_index]); new_brd_size = flat_buf->blocklens[b_index]; if (size != frd_size) { off += size; new_frd_size -= size; } } tmp_bufsize += size; frd_size = new_frd_size; brd_size = new_brd_size; } } /* unlock the file region if we locked it */ if ((fd->atomicity) && (fd->file_system != ADIO_PIOFS) && (fd->file_system != ADIO_PVFS)) { ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; } /* end of (else noncontiguous in file) */ fd->fp_sys_posn = -1; /* mark it as invalid. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, buftype, bufsize); /* This is a temporary way of filling in status. The right way is to * keep track of how much data was actually read and placed in buf */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(buftype); }
void ADIOI_GEN_WriteStridedColl(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* Uses a generalized version of the extended two-phase method described in "An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, (5)4:301--317, Winter 1996. http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_Access *my_req; /* array of nprocs access structures, one for each other process in * whose file domain this process's request lies */ ADIOI_Access *others_req; /* array of nprocs access structures, one for each other process * whose request lies in this process's file domain. */ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; int contig_access_count = 0, interleave_count = 0, buftype_is_contig; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, *fd_end = NULL, *end_offsets = NULL; MPI_Aint *buf_idx = NULL; ADIO_Offset *len_list = NULL; int old_error, tmp_error; if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { /* Cast away const'ness as the below function is used for read * and write */ ADIOI_IOStridedColl(fd, (char *) buf, count, ADIOI_WRITE, datatype, file_ptr_type, offset, status, error_code); return; } MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); /* the number of processes that actually perform I/O, nprocs_for_coll, * is stored in the hints off the ADIO_File structure */ nprocs_for_coll = fd->hints->cb_nodes; orig_fp = fd->fp_ind; /* only check for interleaving if cb_write isn't disabled */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and * lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count); /* each process communicates its start and end offsets to other * processes. The result is an array each of start and end offsets stored * in order of process rank. */ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * 2 * sizeof(ADIO_Offset)); end_offsets = st_offsets + nprocs; MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); /* are the accesses of different processes interleaved? */ for (i = 1; i < nprocs; i++) if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i])) interleave_count++; /* This is a rudimentary check for interleaving, but should suffice * for the moment. */ } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); if (fd->hints->cb_write == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) { /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(st_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (ADIO_Offset) (fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } /* Divide the I/O workload among "nprocs_for_coll" processes. This is done by (logically) dividing the file into file domains (FDs); each process may directly access only its own file domain. */ ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); /* calculate what portions of the access requests of this process are located in what file domains */ ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, fd_end, fd_size, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* based on everyone's my_req, calculate what requests of other processes lie in this process's file domain. count_others_req_procs = number of processes whose requests lie in this process's file domain (including this process itself) count_others_req_per_proc[i] indicates how many separate contiguous requests of proc. i lie in this process's file domain. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); ADIOI_Free(count_my_req_per_proc); ADIOI_Free(my_req[0].offsets); ADIOI_Free(my_req); /* exchange data and write in sizes of no more than coll_bufsize. */ /* Cast away const'ness for the below function */ ADIOI_Exch_and_write(fd, (char *) buf, datatype, nprocs, myrank, others_req, offset_list, len_list, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buf_idx, error_code); /* If this collective write is followed by an independent write, * it's possible to have those subsequent writes on other processes * race ahead and sneak in before the read-modify-write completes. * We carry out a collective communication at the end here so no one * can start independent i/o before collective I/O completes. * * need to do some gymnastics with the error codes so that if something * went wrong, all processes report error, but if a process has a more * specific error code, we can still have that process report the * additional information */ old_error = *error_code; if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; /* optimization: if only one process performing i/o, we can perform * a less-expensive Bcast */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); #endif if (fd->hints->cb_nodes == 1) MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); else { tmp_error = *error_code; MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm); } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); #endif #ifdef AGGREGATION_PROFILE MPE_Log_event(5012, 0, NULL); #endif if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) *error_code = old_error; /* free all memory allocated for collective I/O */ ADIOI_Free(others_req[0].offsets); ADIOI_Free(others_req[0].mem_ptrs); ADIOI_Free(others_req); ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(st_offsets); ADIOI_Free(fd_start); #ifdef HAVE_STATUS_SET_BYTES if (status) { MPI_Count bufsize, size; /* Don't set status if it isn't needed */ MPI_Type_size_x(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); } /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ #ifdef AGGREGATION_PROFILE MPE_Log_event(5013, 0, NULL); #endif }
void ADIOI_PIOFS_WriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { int err=-1, datatype_size, len; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_PIOFS_WRITECONTIG"; #endif MPI_Type_size(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { if (fd->fp_sys_posn != offset) { #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif llseek(fd->fd_sys, offset, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); #endif } #ifdef PROFILE MPE_Log_event(5, 0, "start write"); #endif err = write(fd->fd_sys, buf, len); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif fd->fp_sys_posn = offset + err; /* individual file pointer not updated */ } else { /* write from curr. location of ind. file pointer */ if (fd->fp_sys_posn != fd->fp_ind) { #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif llseek(fd->fd_sys, fd->fp_ind, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); #endif } #ifdef PROFILE MPE_Log_event(5, 0, "start write"); #endif err = write(fd->fd_sys, buf, len); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif fd->fp_ind += err; fd->fp_sys_posn = fd->fp_ind; } #ifdef HAVE_STATUS_SET_BYTES if (err != -1) MPIR_Status_set_bytes(status, datatype, err); #endif if (err == -1) { #ifdef MPICH2 *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); #elif defined(PRINT_ERR_MSG) *error_code = MPI_ERR_UNKNOWN; #else *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); #endif } else *error_code = MPI_SUCCESS; }
void ADIOI_PVFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Since PVFS does not support file locking, can't do buffered writes as on Unix */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent, indx; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp; int flag, new_bwr_size, new_fwr_size, err_flag=0; static char myname[] = "ADIOI_PVFS_WRITESTRIDED"; #ifdef HAVE_PVFS_LISTIO if ( fd->hints->fs_hints.pvfs.listio_write == ADIOI_HINT_ENABLE ) { ADIOI_PVFS_WriteStridedListIO(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } #endif /* if hint set to DISABLE or AUTOMATIC, don't use listio */ /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_INTERN, "Atomic mode set in PVFS I/O function", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { char *combine_buf, *combine_buf_ptr; ADIO_Offset combine_buf_remain; /* noncontiguous in memory, contiguous in file. use writev */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; /* allocate our "combine buffer" to pack data into before writing */ combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size); combine_buf_ptr = combine_buf; combine_buf_remain = fd->hints->ind_wr_buffer_size; /* seek to the right spot in the file */ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; pvfs_lseek64(fd->fd_sys, off, SEEK_SET); } else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); /* loop through all the flattened pieces. combine into buffer until * no more will fit, then write. * * special case of a given piece being bigger than the combine buffer * is also handled. */ for (j=0; j<count; j++) { for (i=0; i<flat_buf->count; i++) { if (flat_buf->blocklens[i] > combine_buf_remain && combine_buf != combine_buf_ptr) { /* there is data in the buffer; write out the buffer so far */ err = pvfs_write(fd->fd_sys, combine_buf, fd->hints->ind_wr_buffer_size - combine_buf_remain); if (err == -1) err_flag = 1; /* reset our buffer info */ combine_buf_ptr = combine_buf; combine_buf_remain = fd->hints->ind_wr_buffer_size; } /* TODO: heuristic for when to not bother to use combine buffer? */ if (flat_buf->blocklens[i] >= combine_buf_remain) { /* special case: blocklen is as big as or bigger than the combine buf; * write directly */ err = pvfs_write(fd->fd_sys, ((char *) buf) + j*buftype_extent + flat_buf->indices[i], flat_buf->blocklens[i]); if (err == -1) err_flag = 1; off += flat_buf->blocklens[i]; /* keep up with the final file offset too */ } else { /* copy more data into combine buffer */ memcpy(combine_buf_ptr, ((char *) buf) + j*buftype_extent + flat_buf->indices[i], flat_buf->blocklens[i]); combine_buf_ptr += flat_buf->blocklens[i]; combine_buf_remain -= flat_buf->blocklens[i]; off += flat_buf->blocklens[i]; /* keep up with the final file offset too */ } } } if (combine_buf_ptr != combine_buf) { /* data left in buffer to write */ err = pvfs_write(fd->fd_sys, combine_buf, fd->hints->ind_wr_buffer_size - combine_buf_remain); if (err == -1) err_flag = 1; } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(combine_buf); if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } /* if (!buftype_is_contig && filetype_is_contig) ... */ else { /* noncontiguous in file */ /* split up into several contiguous writes */ /* find starting location in the file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif pvfs_lseek64(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = pvfs_write(fd->fd_sys, ((char *) buf) + i, fwr_size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; k = num = buf_count = 0; indx = flat_buf->indices[0]; j = st_index; off = offset; bwr_size = flat_buf->blocklens[0]; while (num < bufsize) { size = ADIOI_MIN(fwr_size, bwr_size); if (size) { #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif pvfs_lseek64(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = pvfs_write(fd->fd_sys, ((char *) buf) + indx, size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } new_fwr_size = fwr_size; new_bwr_size = bwr_size; if (size == fwr_size) { /* reached end of contiguous block in file */ if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; new_fwr_size = flat_file->blocklens[j]; if (size != bwr_size) { indx += size; new_bwr_size -= size; } } if (size == bwr_size) { /* reached end of contiguous block in memory */ k = (k + 1)%flat_buf->count; buf_count++; indx = buftype_extent*(buf_count/flat_buf->count) + flat_buf->indices[k]; new_bwr_size = flat_buf->blocklens[k]; if (size != fwr_size) { off += size; new_fwr_size -= size; } } num += size; fwr_size = new_fwr_size; bwr_size = new_bwr_size; } } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_PIOFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Since PIOFS does not support file locking, can't do buffered writes as on Unix */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; struct iovec *iov; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent, indx; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp; int flag, new_bwr_size, new_fwr_size, err_flag=0; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_PIOFS_WRITESTRIDED"; #endif if (fd->atomicity) { FPRINTF(stderr, "ROMIO cannot guarantee atomicity of noncontiguous accesses in atomic mode, as PIOFS doesn't support file locking. Use nonatomic mode and its associated semantics.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. use writev */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; /* There is a limit of 16 on the number of iovecs for readv/writev! */ iov = (struct iovec *) ADIOI_Malloc(16*sizeof(struct iovec)); if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; llseek(fd->fd_sys, off, SEEK_SET); } else off = llseek(fd->fd_sys, fd->fp_ind, SEEK_SET); k = 0; for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { iov[k].iov_base = ((char *) buf) + j*buftype_extent + flat_buf->indices[i]; iov[k].iov_len = flat_buf->blocklens[i]; /*FPRINTF(stderr, "%d %d\n", iov[k].iov_base, iov[k].iov_len);*/ off += flat_buf->blocklens[i]; k = (k+1)%16; if (!k) { err = writev(fd->fd_sys, iov, 16); if (err == -1) err_flag = 1; } } if (k) { err = writev(fd->fd_sys, iov, k); if (err == -1) err_flag = 1; } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(iov); if (err_flag) { #ifdef MPICH2 *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); #elif defined(PRINT_ERR_MSG) *error_code = MPI_ERR_UNKNOWN; #else /* MPICH-1 */ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); #endif } else *error_code = MPI_SUCCESS; } /* if (!buftype_is_contig && filetype_is_contig) ... */ else { /* noncontiguous in file */ /* split up into several contiguous writes */ /* find starting location in the file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif llseek(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = write(fd->fd_sys, ((char *) buf) + i, fwr_size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; k = num = buf_count = 0; indx = flat_buf->indices[0]; j = st_index; off = offset; bwr_size = flat_buf->blocklens[0]; while (num < bufsize) { size = ADIOI_MIN(fwr_size, bwr_size); if (size) { #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif llseek(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = write(fd->fd_sys, ((char *) buf) + indx, size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } new_fwr_size = fwr_size; new_bwr_size = bwr_size; if (size == fwr_size) { /* reached end of contiguous block in file */ if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; new_fwr_size = flat_file->blocklens[j]; if (size != bwr_size) { indx += size; new_bwr_size -= size; } } if (size == bwr_size) { /* reached end of contiguous block in memory */ k = (k + 1)%flat_buf->count; buf_count++; indx = buftype_extent*(buf_count/flat_buf->count) + flat_buf->indices[k]; new_bwr_size = flat_buf->blocklens[k]; if (size != fwr_size) { off += size; new_fwr_size -= size; } } num += size; fwr_size = new_fwr_size; bwr_size = new_bwr_size; } } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { #ifdef MPICH2 *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); #elif defined(PRINT_ERR_MSG) *error_code = MPI_ERR_UNKNOWN; #else /* MPICH-1 */ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); #endif } else *error_code = MPI_SUCCESS; } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, new_brd_size, brd_size, size; int i, j, k, st_index=0; MPI_Count num, bufsize; int n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype; ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size; MPI_Count filetype_size, etype_size, buftype_size, partial_read; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off, req_len, sum; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int info_flag; unsigned max_bufsize, readbuf_len; ADIO_Status status1; if (fd->hints->ds_read == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; for (j=0; j<count; j++) { for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } } if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); } else { /* noncontiguous in file */ flat_file = ADIOI_Flatten_and_find(fd->filetype); disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { /* a count of bytes can overflow. operate on original type instead */ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { i_offset += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); readbuf_off = 0; readbuf_len = 0; readbuf = (char *) ADIOI_Malloc(max_bufsize); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i_offset; ADIOI_BUFFERED_READ } i_offset += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int io_mode, int *error_code) { int err=-1; MPI_Count datatype_size, len; static char myname[] = "ADIOI_LUSTRE_IOCONTIG"; MPI_Type_size_x(datatype, &datatype_size); len = datatype_size * count; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; } if (!(fd->direct_read || fd->direct_write)) { if (fd->fp_sys_posn != offset) { err = lseek(fd->fd_sys, offset, SEEK_SET); if (err == -1) goto ioerr; } if (io_mode) { #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); #endif err = write(fd->fd_sys, buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); #endif } else { #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); #endif err = read(fd->fd_sys, (void *)buf, len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); #endif } } else { err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); } if (err == -1) goto ioerr; fd->fp_sys_posn = offset + err; if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind += err; } #ifdef HAVE_STATUS_SET_BYTES if (status) MPIR_Status_set_bytes(status, datatype, err); #endif *error_code = MPI_SUCCESS; ioerr: /* --BEGIN ERROR HANDLING-- */ if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); fd->fp_sys_posn = -1; return; } /* --END ERROR HANDLING-- */ }