/* flatten datatype and add it to Flatlist */ void ADIOI_Flatten_datatype(MPI_Datatype datatype) { #ifdef HAVE_MPIR_TYPE_FLATTEN MPI_Aint flatten_idx; #endif MPI_Count curr_index=0; int is_contig; ADIOI_Flatlist_node *flat, *prev=0; /* check if necessary to flatten. */ /* is it entirely contiguous? */ ADIOI_Datatype_iscontig(datatype, &is_contig); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: is_contig %#X\n",is_contig); #endif if (is_contig) return; /* has it already been flattened? */ flat = ADIOI_Flatlist; while (flat) { if (flat->type == datatype) { #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: found datatype %#X\n", datatype); #endif return; } else { prev = flat; flat = flat->next; } } /* flatten and add to the list */ flat = prev; flat->next = (ADIOI_Flatlist_node *)ADIOI_Malloc(sizeof(ADIOI_Flatlist_node)); flat = flat->next; flat->type = datatype; flat->next = NULL; flat->blocklens = NULL; flat->indices = NULL; flat->count = ADIOI_Count_contiguous_blocks(datatype, &curr_index); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: count %llX, cur_idx = %#llX\n",flat->count,curr_index); #endif /* DBG_FPRINTF(stderr, "%d\n", flat->count);*/ if (flat->count) { flat->blocklens = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset)); flat->indices = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset)); } curr_index = 0; #ifdef HAVE_MPIR_TYPE_FLATTEN flatten_idx = (MPI_Aint) flat->count; MPIR_Type_flatten(datatype, flat->indices, flat->blocklens, &flatten_idx); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: MPIR_Type_flatten\n"); #endif #else ADIOI_Flatten(datatype, flat, 0, &curr_index); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: ADIOI_Flatten\n"); #endif ADIOI_Optimize_flattened(flat); #endif /* debug */ #ifdef FLATTEN_DEBUG { int i; for (i=0; i<flat->count; i++) DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: i %#X, blocklens %#llX, indices %#llX\n", i, flat->blocklens[i], flat->indices[i] ); } #endif }
/* * Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist. * The first order of tmp_ranklist is : PSET number * The secondary order of the list is determined in ADIOI_BGL_select_agg_in_pset() and thus adjustable. */ static int ADIOI_BGL_compute_agg_ranklist_serial_do (const ADIOI_BGL_ConfInfo_t *confInfo, ADIOI_BGL_ProcInfo_t *all_procInfo, int *aggrsInPset, int *tmp_ranklist) { int i, j; /* a list of the numbers of all the PSETS */ int *psetNumList = (int *) ADIOI_Malloc ( confInfo->nProcs * sizeof(int) ); /* sweep through all processes' records, collect the numbers of all the PSETS. * The reason for not doing MIN, MAX is that the owned PSETs may not have contiguous numbers */ int n_psets=0; for (i=0; i<confInfo->nProcs; i++) { ADIOI_BGL_ProcInfo_t *info_p = all_procInfo+i; int exist = 0; for (j=n_psets-1; j>=0; j--) if (info_p->psetNum == psetNumList[j]) { exist=1; break; } if (!exist) { psetNumList [n_psets] = info_p->psetNum; n_psets ++; } } /* bucket sort: put the CN nodes into ordered buckets, each of which represents a PSET */ /* bucket space for bucket sort */ ADIOI_BGL_ProcInfo_t *sorted_procInfo = ADIOI_BGL_ProcInfo_new_n ( n_psets * confInfo->virtualPsetSize ); int *PsetIdx = (int *) ADIOI_Malloc ( n_psets * sizeof(int) ); AD_BGL_assert ( (PsetIdx != NULL) ); /* initialize bucket pointer */ for (i=0; i<n_psets; i++) { PsetIdx[i] = i*confInfo->virtualPsetSize; } /* sort */ for (i=0; i<confInfo->nProcs; i++) { int pset_id = all_procInfo[i].psetNum; for (j=n_psets-1; j>=0; j--) if (pset_id == psetNumList[j]) break; AD_BGL_assert ( (j >= 0) ); /* got to find a PSET bucket */ sorted_procInfo[ PsetIdx[j] ++ ] = all_procInfo[i]; } ADIOI_Free(psetNumList); /* select a number of CN aggregators from each Pset */ int naggs = 0; for (i=0; i<n_psets; i++) { /* the number of CN in this PSET -- may not be a full PSET */ int nCN_in_pset = PsetIdx[i] - i*confInfo->virtualPsetSize; /* select aggregators and put them into tmp_ranklist contiguously. */ int local_naggs = ADIOI_BGL_select_agg_in_pset( confInfo, sorted_procInfo + i*confInfo->virtualPsetSize, nCN_in_pset, tmp_ranklist + naggs); aggrsInPset[i+1] = local_naggs; naggs += local_naggs; } aggrsInPset[0] = n_psets; /* leave */ ADIOI_Free ( PsetIdx ); ADIOI_BGL_ProcInfo_free ( sorted_procInfo ); return naggs; }
/* * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation * is specific for static file domain partitioning. * * ADIOI_Calc_my_req() - calculate what portions of the access requests * of this process are located in the file domains of various processes * (including this one) */ void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset fd_size, int nprocs, int *count_my_req_procs_ptr, int **count_my_req_per_proc_ptr, ADIOI_Access **my_req_ptr, int **buf_idx_ptr) /* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? They are used as memory buffer indices so it seems like the 2G limit is in effect */ { int *count_my_req_per_proc, count_my_req_procs, *buf_idx; int i, l, proc; ADIO_Offset fd_len, rem_len, curr_idx, off; ADIOI_Access *my_req; #ifdef AGGREGATION_PROFILE MPE_Log_event (5024, 0, NULL); #endif *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); count_my_req_per_proc = *count_my_req_per_proc_ptr; /* count_my_req_per_proc[i] gives the no. of contig. requests of this process in process i's file domain. calloc initializes to zero. I'm allocating memory of size nprocs, so that I can do an MPI_Alltoall later on.*/ buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int)); /* buf_idx is relevant only if buftype_is_contig. buf_idx[i] gives the index into user_buf where data received from proc. i should be placed. This allows receives to be done without extra buffer. This can't be done if buftype is not contig. */ /* initialize buf_idx to -1 */ for (i=0; i < nprocs; i++) buf_idx[i] = -1; /* one pass just to calculate how much space to allocate for my_req; * contig_access_count was calculated way back in ADIOI_Calc_my_off_len() */ for (i=0; i < contig_access_count; i++) { /* short circuit offset/len processing if len == 0 * (zero-byte read/write */ if (len_list[i] == 0) continue; off = offset_list[i]; fd_len = len_list[i]; /* note: we set fd_len to be the total size of the access. then * ADIOI_Calc_aggregator() will modify the value to return the * amount that was available from the file domain that holds the * first part of the access. */ proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); count_my_req_per_proc[proc]++; /* figure out how much data is remaining in the access (i.e. wasn't * part of the file domain that had the starting byte); we'll take * care of this data (if there is any) in the while loop below. */ rem_len = len_list[i] - fd_len; while (rem_len > 0) { off += fd_len; /* point to first remaining byte */ fd_len = rem_len; /* save remaining size, pass to calc */ proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); count_my_req_per_proc[proc]++; rem_len -= fd_len; /* reduce remaining length by amount from fd */ } } /* now allocate space for my_req, offset, and len */ *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); my_req = *my_req_ptr; count_my_req_procs = 0; for (i=0; i < nprocs; i++) { if (count_my_req_per_proc[i]) { my_req[i].offsets = (ADIO_Offset *) ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset)); my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int)); count_my_req_procs++; } my_req[i].count = 0; /* will be incremented where needed later */ } /* now fill in my_req */ curr_idx = 0; for (i=0; i<contig_access_count; i++) { /* short circuit offset/len processing if len == 0 * (zero-byte read/write */ if (len_list[i] == 0) continue; off = offset_list[i]; fd_len = len_list[i]; proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); /* for each separate contiguous access from this process */ if (buf_idx[proc] == -1) { ADIOI_Assert(curr_idx == (int) curr_idx); buf_idx[proc] = (int) curr_idx; } l = my_req[proc].count; curr_idx += fd_len; rem_len = len_list[i] - fd_len; /* store the proc, offset, and len information in an array * of structures, my_req. Each structure contains the * offsets and lengths located in that process's FD, * and the associated count. */ my_req[proc].offsets[l] = off; ADIOI_Assert(fd_len == (int) fd_len); my_req[proc].lens[l] = (int) fd_len; my_req[proc].count++; while (rem_len > 0) { off += fd_len; fd_len = rem_len; proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); if (buf_idx[proc] == -1) { ADIOI_Assert(curr_idx == (int) curr_idx); buf_idx[proc] = (int) curr_idx; } l = my_req[proc].count; curr_idx += fd_len; rem_len -= fd_len; my_req[proc].offsets[l] = off; ADIOI_Assert(fd_len == (int) fd_len); my_req[proc].lens[l] = (int) fd_len; my_req[proc].count++; } } #ifdef AGG_DEBUG for (i=0; i<nprocs; i++) { if (count_my_req_per_proc[i] > 0) { DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i, my_req[i].count); for (l=0; l < my_req[i].count; l++) { DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l, my_req[i].offsets[l], l, my_req[i].lens[l]); } } DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]); } #endif *count_my_req_procs_ptr = count_my_req_procs; *buf_idx_ptr = buf_idx; #ifdef AGGREGATION_PROFILE MPE_Log_event (5025, 0, NULL); #endif }
void ADIOI_PVFS2_OldWriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* as with all the other WriteStrided functions, offset is in units of * etype relative to the filetype */ /* Since PVFS2 does not support file locking, can't do buffered writes as on Unix */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, bwr_size, fwr_size=0, st_index=0; int bufsize, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp, start_off, initial_off; int flag, st_fwr_size, st_n_filetypes; int err_flag=0; int mem_list_count, file_list_count; PVFS_size * mem_offsets; int64_t *file_offsets; int *mem_lengths; int32_t *file_lengths; int total_blks_to_write; int max_mem_list, max_file_list; int b_blks_wrote; int f_data_wrote; int size_wrote=0, n_write_lists, extra_blks; int end_bwr_size, end_fwr_size; int start_k, start_j, new_file_write, new_buffer_write; int start_mem_offset; PVFS_Request mem_req, file_req; ADIOI_PVFS2_fs * pvfs_fs; PVFS_sysresp_io resp_io; MPI_Offset total_bytes_written=0; static char myname[] = "ADIOI_PVFS2_WRITESTRIDED"; /* note: don't increase this: several parts of PVFS2 now * assume this limit*/ #define MAX_ARRAY_SIZE 64 /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "Atomic noncontiguous writes are not supported by PVFS2", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); /* the HDF5 tests showed a bug in this list processing code (see many many * lines down below). We added a workaround, but common HDF5 file types * are actually contiguous and do not need the expensive workarond */ if (!filetype_is_contig) { flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; if (flat_file->count == 1 && !buftype_is_contig) filetype_is_contig = 1; } MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ int64_t file_offsets; int32_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; } else off = fd->fp_ind; file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_write = count*flat_buf->count; b_blks_wrote = 0; /* allocate arrays according to max usage */ if (total_blks_to_write > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_write; mem_offsets = (PVFS_size*)ADIOI_Malloc(mem_list_count*sizeof(PVFS_size)); mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_wrote < total_blks_to_write) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = /* TODO: fix this warning by casting to an integer that's * the same size as a char * and /then/ casting to * PVFS_size */ ((PVFS_size)buf + j*buftype_extent + flat_buf->indices[i]); mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_wrote++; if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) { /* in the case of the last write list call, adjust mem_list_count */ if (b_blks_wrote == total_blks_to_write) { mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE; /* in case last write list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths, mem_offsets, PVFS_BYTE, &mem_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_hindexed (memory)", 0); break; } /* --END ERROR HANDLING-- */ err_flag = PVFS_Request_contiguous(file_lengths, PVFS_BYTE, &file_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_contiguous (file)", 0); break; } /* --END ERROR HANDLING-- */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, file_offsets, PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif total_bytes_written += resp_io.total_completed; /* in the case of error or the last write list call, * leave here */ /* --BEGIN ERROR HANDLING-- */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_sys_write", 0); break; } /* --END ERROR HANDLING-- */ if (b_blks_wrote == total_blks_to_write) break; file_offsets += file_lengths; file_lengths = 0; PVFS_Request_free(&mem_req); PVFS_Request_free(&file_req); } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_wrote < total_blks_to_write) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += total_bytes_written; if (!err_flag) *error_code = MPI_SUCCESS; fd->fp_sys_posn = -1; /* clear this. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* already know that file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; initial_off = offset; /* for each case - ADIO_Individual pointer or explicit, find offset (file offset in bytes), n_filetypes (how many filetypes into file to start), fwr_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + ((ADIO_Offset) n_filetypes)*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ int mem_lengths; char *mem_offsets; i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to write */ f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize); total_blks_to_write = 1; if (j < (flat_file->count -1)) j++; else { j = 0; n_filetypes++; } while (f_data_wrote < bufsize) { f_data_wrote += flat_file->blocklens[j]; total_blks_to_write++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE; extra_blks = total_blks_to_write%MAX_ARRAY_SIZE; mem_offsets = buf; mem_lengths = 0; /* if at least one full writelist, allocate file arrays at max array size and don't free until very end */ if (n_write_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int32_t)); } /* if there's no full writelist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(extra_blks* sizeof(int32_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_write_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; mem_lengths = st_fwr_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ err_flag = PVFS_Request_contiguous(mem_lengths, PVFS_BYTE, &mem_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_contiguous (memory)", 0); goto error_state; } /* --END ERROR HANDLING-- */ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths, file_offsets, PVFS_BYTE, &file_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_hindexed (file)", 0); goto error_state; } /* --END ERROR HANDLING-- */ /* PVFS_Request_hindexed already expresses the offsets into the * file, so we should not pass in an offset if we are using * hindexed for the file type */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0, mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_sys_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += resp_io.total_completed; mem_offsets += mem_lengths; mem_lengths = 0; PVFS_Request_free(&file_req); PVFS_Request_free(&mem_req); } /* for (i=0; i<n_write_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = ADIOI_MIN(st_fwr_size, bufsize); } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + ((ADIO_Offset)n_filetypes)*filetype_extent + flat_file->indices[j]; if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - (int32_t) mem_lengths - (int32_t) mem_offsets + (int32_t) buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ err_flag = PVFS_Request_contiguous(mem_lengths, PVFS_BYTE, &mem_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_contiguous (memory)", 0); goto error_state; } /* --END ERROR HANDLING-- */ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths, file_offsets, PVFS_BYTE, &file_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_hindexed(file)", 0); goto error_state; } /* --END ERROR HANDLING-- */ /* as above, use 0 for 'offset' when using hindexed file type*/ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0, mem_offsets, mem_req, &(pvfs_fs->credentials), &resp_io); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_sys_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ total_bytes_written += resp_io.total_completed; PVFS_Request_free(&mem_req); PVFS_Request_free(&file_req); } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fakes filling the writelist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_wrote += new_buffer_write; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; } /* while (size_wrote < bufsize) */ /* one last check before we actually carry out the operation: * this code has hard-to-fix bugs when a noncontiguous file type has * such large pieces that the sum of the lengths of the memory type is * not larger than one of those pieces (and vice versa for large memory * types and many pices of file types. In these cases, give up and * fall back to naive reads and writes. The testphdf5 test created a * type with two very large memory regions and 600 very small file * regions. The same test also created a type with one very large file * region and many (700) very small memory regions. both cases caused * problems for this code */ if ( ( (file_list_count == 1) && (new_file_write < flat_file->blocklens[0] ) ) || ((mem_list_count == 1) && (new_buffer_write < flat_buf->blocklens[0]) ) || ((file_list_count == MAX_ARRAY_SIZE) && (new_file_write < flat_buf->blocklens[0]) ) || ( (mem_list_count == MAX_ARRAY_SIZE) && (new_buffer_write < flat_file->blocklens[0])) ) { ADIOI_Delete_flattened(datatype); ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, initial_off, status, error_code); return; } mem_offsets = (PVFS_size*)ADIOI_Malloc(max_mem_list*sizeof(PVFS_size)); mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int)); file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t)); file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t)); size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_fwr_size and new_bwr_size */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data written and data to be written in the next immediate write list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fills the allocated writelist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { /* TODO: fix this warning by casting to an integer that's the * same size as a char * and /then/ casting to PVFS_size */ mem_offsets[i] = ((PVFS_size)buf + buftype_extent* (buf_count/flat_buf->count) + (int)flat_buf->indices[k]); if(!i) { mem_lengths[0] = bwr_size; mem_offsets[0] += flat_buf->blocklens[k] - bwr_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_bwr_size; if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + ((ADIO_Offset)n_filetypes) * filetype_extent; if (!i) { file_lengths[0] = fwr_size; file_offsets[0] += flat_file->blocklens[j] - fwr_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_fwr_size; if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ err_flag = PVFS_Request_hindexed(mem_list_count, mem_lengths, mem_offsets, PVFS_BYTE, &mem_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0 ) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_hindexed (memory)", 0); goto error_state; } /* --END ERROR HANDLING-- */ err_flag = PVFS_Request_hindexed(file_list_count, file_lengths, file_offsets, PVFS_BYTE, &file_req); /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_Request_hindexed", 0); goto error_state; } /* --END ERROR HANDLING-- */ /* offset will be expressed in memory and file datatypes */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err_flag = PVFS_sys_write(pvfs_fs->object_ref, file_req, 0, PVFS_BOTTOM, mem_req, &(pvfs_fs->credentials), &resp_io); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif /* --BEGIN ERROR HANDLING-- */ if (err_flag != 0) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_PVFS2_error_convert(err_flag), "Error in PVFS_sys_write", 0); goto error_state; } /* --END ERROR HANDLING-- */ size_wrote += new_buffer_write; total_bytes_written += resp_io.total_completed; start_k = k; start_j = j; PVFS_Request_free(&mem_req); PVFS_Request_free(&file_req); } /* while (size_wrote < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } /* when incrementing fp_ind, need to also take into account the file type: * consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----| * if we wrote N elements, offset needs to point at beginning of type, not * at empty region at offset N+1). * * As we discussed on mpich-discuss in may/june 2009, the code below might * look wierd, but by putting fp_ind at the last byte written, the next * time we run through the strided code we'll update the fp_ind to the * right location. */ if (file_ptr_type == ADIO_INDIVIDUAL) { fd->fp_ind = file_offsets[file_list_count-1]+ file_lengths[file_list_count-1]; } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); *error_code = MPI_SUCCESS; error_state: fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
int MPIOI_File_read_all_begin(MPI_File fh, MPI_Offset offset, int file_ptr_type, void *buf, int count, MPI_Datatype datatype, char *myname) { int error_code; MPI_Count datatype_size; ADIO_File adio_fh; void *xbuf=NULL, *e32_buf=NULL; ROMIO_THREAD_CS_ENTER(); adio_fh = MPIO_File_resolve(fh); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code); MPIO_CHECK_COUNT(adio_fh, count, myname, error_code); MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code); if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "**iobadoffset", 0); error_code = MPIO_Err_return_file(adio_fh, error_code); goto fn_exit; } /* --END ERROR HANDLING-- */ MPI_Type_size_x(datatype, &datatype_size); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code); MPIO_CHECK_READABLE(adio_fh, myname, error_code); MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code); if (adio_fh->split_coll_count) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iosplitcoll", 0); error_code = MPIO_Err_return_file(adio_fh, error_code); goto fn_exit; } MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code); /* --END ERROR HANDLING-- */ adio_fh->split_coll_count = 1; xbuf = buf; if (adio_fh->is_external32) { MPI_Aint e32_size = 0; error_code = MPIU_datatype_full_size(datatype, &e32_size); if (error_code != MPI_SUCCESS) goto fn_exit; e32_buf = ADIOI_Malloc(e32_size*count); xbuf = e32_buf; } ADIO_ReadStridedColl(adio_fh, xbuf, count, datatype, file_ptr_type, offset, &adio_fh->split_status, &error_code); /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) error_code = MPIO_Err_return_file(adio_fh, error_code); /* --END ERROR HANDLING-- */ if (e32_buf != NULL) { error_code = MPIU_read_external32_conversion_fn(buf, datatype, count, e32_buf); ADIOI_Free(e32_buf); } fn_exit: ROMIO_THREAD_CS_EXIT(); return error_code; }
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) { int perm, old_mask, amode, amode_direct; struct lov_user_md lum = { 0 }; char *value; #if defined(MPICH2) || !defined(PRINT_ERR_MSG) static char myname[] = "ADIOI_LUSTRE_OPEN"; #endif if (fd->perm == ADIO_PERM_NULL) { old_mask = umask(022); umask(old_mask); perm = old_mask ^ 0666; } else perm = fd->perm; amode = 0; if (fd->access_mode & ADIO_CREATE) amode = amode | O_CREAT; if (fd->access_mode & ADIO_RDONLY) amode = amode | O_RDONLY; if (fd->access_mode & ADIO_WRONLY) amode = amode | O_WRONLY; if (fd->access_mode & ADIO_RDWR) amode = amode | O_RDWR; if (fd->access_mode & ADIO_EXCL) amode = amode | O_EXCL; amode_direct = amode | O_DIRECT; fd->fd_sys = open(fd->filename, amode|O_CREAT, perm); if (fd->fd_sys != -1) { int err; value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); /* get file striping information and set it in info */ lum.lmm_magic = LOV_USER_MAGIC; err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); if (!err) { sprintf(value, "%d", lum.lmm_stripe_size); MPI_Info_set(fd->info, "striping_unit", value); sprintf(value, "%d", lum.lmm_stripe_count); MPI_Info_set(fd->info, "striping_factor", value); sprintf(value, "%d", lum.lmm_stripe_offset); MPI_Info_set(fd->info, "start_iodevice", value); } ADIOI_Free(value); if (fd->access_mode & ADIO_APPEND) fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); } if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); fd->fd_direct = -1; if (fd->direct_write || fd->direct_read) { fd->fd_direct = open(fd->filename, amode_direct, perm); if (fd->fd_direct != -1) { fd->d_mem = fd->d_miniosz = (1<<12); } else { perror("cannot open file with O_Direct"); fd->direct_write = fd->direct_read = 0; } } /* --BEGIN ERROR HANDLING-- */ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && (fd->direct_write || fd->direct_read))) { if (errno == ENAMETOOLONG) *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamelong", "**filenamelong %s %d", fd->filename, strlen(fd->filename)); else if (errno == ENOENT) *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_NO_SUCH_FILE, "**filenoexist", "**filenoexist %s", fd->filename); else if (errno == ENOTDIR || errno == ELOOP) *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamedir", "**filenamedir %s", fd->filename); else if (errno == EACCES) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ACCESS, "**fileaccess", "**fileaccess %s", fd->filename ); } else if (errno == EROFS) { /* Read only file or file system and write access requested */ *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_READ_ONLY, "**ioneedrd", 0 ); } else { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } } /* --END ERROR HANDLING-- */ else *error_code = MPI_SUCCESS; }
void ADIOI_PIOFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { piofs_create_t piofs_create; piofs_statfs_t piofs_statfs; char *value, *path, *slash; int flag, tmp_val, str_factor=-1, str_unit=-1, start_iodev=-1; int err, myrank, perm, old_mask, nioservers; if ((fd->info) == MPI_INFO_NULL) { /* This must be part of the open call. can set striping parameters if necessary. */ MPI_Info_create(&(fd->info)); /* has user specified striping parameters and do they have the same value on all processes? */ if (users_info != MPI_INFO_NULL) { value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag); if (flag) { str_factor=atoi(value); tmp_val = str_factor; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != str_factor) { FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); if (flag) { str_unit=atoi(value); tmp_val = str_unit; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != str_unit) { FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, value, &flag); if (flag) { start_iodev=atoi(value); tmp_val = start_iodev; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != start_iodev) { FPRINTF(stderr, "ADIOI_PIOFS_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } ADIOI_Free(value); /* if user has specified striping info, process 0 tries to set it */ if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { MPI_Comm_rank(fd->comm, &myrank); if (!myrank) { if (fd->perm == ADIO_PERM_NULL) { old_mask = umask(022); umask(old_mask); perm = old_mask ^ 0666; } else perm = fd->perm; /* to find out the number of I/O servers, I need the path to the directory containing the file */ path = strdup(fd->filename); slash = strrchr(path, '/'); if (!slash) strcpy(path, "."); else { if (slash == path) *(path + 1) = '\0'; else *slash = '\0'; } strcpy(piofs_statfs.name, path); err = piofsioctl(0, PIOFS_STATFS, &piofs_statfs); nioservers = (err) ? -1 : piofs_statfs.f_nodes; free(path); str_factor = ADIOI_MIN(nioservers, str_factor); if (start_iodev >= nioservers) start_iodev = -1; strcpy(piofs_create.name, fd->filename); piofs_create.bsu = (str_unit > 0) ? str_unit : -1; piofs_create.cells = (str_factor > 0) ? str_factor : -1; piofs_create.permissions = perm; piofs_create.base_node = (start_iodev >= 0) ? start_iodev : -1; piofs_create.flags = 0; err = piofsioctl(0, PIOFS_CREATE, &piofs_create); } MPI_Barrier(fd->comm); } } } /* set the values for collective I/O and data sieving parameters */ ADIOI_GEN_SetInfo(fd, users_info, error_code); *error_code = MPI_SUCCESS; }
void ADIOI_SetFunctions(ADIO_File fd) { /* NOTE: soon we want to get rid of this malloc and instead just point * straight to the appropriate table */ fd->fns = (ADIOI_Fns *) ADIOI_Malloc(sizeof(ADIOI_Fns)); switch(fd->file_system) { case ADIO_PFS: #ifdef PFS *(fd->fns) = ADIO_PFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the PFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_PIOFS: #ifdef PIOFS *(fd->fns) = ADIO_PIOFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the PIOFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_UFS: #ifdef UFS *(fd->fns) = ADIO_UFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the UFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_NTFS: #ifdef ROMIO_NTFS *(fd->fns) = ADIO_NTFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the NTFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_NFS: #ifdef NFS *(fd->fns) = ADIO_NFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the NFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_HFS: #ifdef HFS *(fd->fns) = ADIO_HFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the HFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_XFS: #ifdef XFS *(fd->fns) = ADIO_XFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the XFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_SFS: #ifdef SFS *(fd->fns) = ADIO_SFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the SFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_PVFS: #ifdef ROMIO_PVFS *(fd->fns) = ADIO_PVFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the PVFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; case ADIO_TESTFS: #ifdef ROMIO_TESTFS *(fd->fns) = ADIO_TESTFS_operations; #else FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the TESTFS file system\n"); MPI_Abort(MPI_COMM_WORLD, 1); #endif break; default: FPRINTF(stderr, "ADIOI_SetFunctions: Unsupported file system type\n"); MPI_Abort(MPI_COMM_WORLD, 1); break; } }
/* ADIOI_ZOIDFS_Open: * one process opens (or creates) the file, then broadcasts the result to the * remaining processors. * * ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before * that, MPI_MODE_EXCL) was set. Because ZoidFS handles file lookup and * creation more scalably than traditional file systems, ADIO_Open now skips any * special handling when CREATE is set. */ void ADIOI_ZOIDFS_Open(ADIO_File fd, int *error_code) { int rank; static char myname[] = "ADIOI_ZOIDFS_OPEN"; ADIOI_ZOIDFS_object *zoidfs_obj_ptr; /* since one process is doing the open, that means one process is also * doing the error checking. define a struct for both the object reference * and the error code to broadcast to all the processors */ open_status o_status; MPI_Datatype open_status_type; MPI_Datatype types[2] = {MPI_INT, MPI_BYTE}; int lens[2] = {1, sizeof(ADIOI_ZOIDFS_object)}; MPI_Aint offsets[2]; memset(&o_status, 0, sizeof(o_status)); zoidfs_obj_ptr = (ADIOI_ZOIDFS_object *) ADIOI_Malloc(sizeof(ADIOI_ZOIDFS_object)); /* --BEGIN ERROR HANDLING-- */ if (zoidfs_obj_ptr == NULL) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_UNKNOWN, "Error allocating memory", 0); return; } /* --END ERROR HANDLING-- */ MPI_Comm_rank(fd->comm, &rank); ADIOI_ZOIDFS_Init(rank, error_code); if (*error_code != MPI_SUCCESS) { /* ADIOI_ZOIDFS_INIT handles creating error codes on its own */ ADIOI_Free(zoidfs_obj_ptr); return; } /* one process resolves name and will later bcast to others */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_open_a, 0, NULL ); #endif if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) { fake_an_open(fd->filename, fd->access_mode, fd->hints->striping_factor, fd->hints->striping_unit, zoidfs_obj_ptr, &o_status); /* store credentials and object reference in fd */ *zoidfs_obj_ptr = o_status.handle; fd->fs_ptr = zoidfs_obj_ptr; } #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_open_b, 0, NULL ); #endif /* broadcast status and (possibly valid) object reference */ MPI_Get_address(&o_status.error, &offsets[0]); MPI_Get_address(&o_status.handle, &offsets[1]); MPI_Type_struct(2, lens, offsets, types, &open_status_type); MPI_Type_commit(&open_status_type); /* Assertion: if we hit this Bcast, then all processes collectively * called this open. * * That's because deferred open never happens with this fs. */ MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm); MPI_Type_free(&open_status_type); /* --BEGIN ERROR HANDLING-- */ if (o_status.error != ZFS_OK) { ADIOI_Free(zoidfs_obj_ptr); fd->fs_ptr = NULL; *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, ADIOI_ZOIDFS_error_convert(o_status.error), "Unknown error", 0); /* TODO: FIX STRING */ return; } /* --END ERROR HANDLING-- */ *zoidfs_obj_ptr = o_status.handle; fd->fs_ptr = zoidfs_obj_ptr; *error_code = MPI_SUCCESS; return; }
void ADIOI_PVFS_ReadStridedListIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, l, brd_size, frd_size=0, st_index=0; int bufsize, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, disp, start_off; int flag, st_frd_size, st_n_filetypes; int new_brd_size, new_frd_size; int mem_list_count, file_list_count; char **mem_offsets; int64_t *file_offsets; int *mem_lengths; int32_t *file_lengths; int total_blks_to_read; int max_mem_list, max_file_list; int b_blks_read; int f_data_read; int size_read=0, n_read_lists, extra_blks; int end_brd_size, end_frd_size; int start_k, start_j, new_file_read, new_buffer_read; int start_mem_offset; #define MAX_ARRAY_SIZE 1024 #ifndef PRINT_ERR_MESG static char myname[] = "ADIOI_PVFS_ReadStrided"; #endif *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ int64_t file_offsets; int32_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_read = count*flat_buf->count; b_blks_read = 0; /* allocate arrays according to max usage */ if (total_blks_to_read > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_read; mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*)); mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_read < total_blks_to_read) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_read % MAX_ARRAY_SIZE] = (char*)((char *)buf + j*buftype_extent + flat_buf->indices[i]); mem_lengths[b_blks_read % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_read++; if (!(b_blks_read % MAX_ARRAY_SIZE) || (b_blks_read == total_blks_to_read)) { /* in the case of the last read list call, adjust mem_list_count */ if (b_blks_read == total_blks_to_read) { mem_list_count = total_blks_to_read % MAX_ARRAY_SIZE; /* in case last read list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } pvfs_read_list(fd->fd_sys ,mem_list_count, mem_offsets, mem_lengths, file_list_count, &file_offsets, &file_lengths); /* in the case of the last read list call, leave here */ if (b_blks_read == total_blks_to_read) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_read < total_blks_to_read) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This isa temporary way of filling in status. The right way is to keep tracke of how much data was actually read adn placed in buf by ADIOI_BUFFERED_READ. */ #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* know file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; /* for each case - ADIO_Individual pointer or explicit, find the file offset in bytes (offset), n_filetypes (how many filetypes into file to start), frd_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; frd_size = (int) (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset); flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIO_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIO_INDIVIDUAL] */ start_off = offset; st_frd_size = frd_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ int mem_lengths; char *mem_offsets; i = 0; j = st_index; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to read */ f_data_read = ADIOI_MIN(st_frd_size, bufsize); total_blks_to_read = 1; j++; while (f_data_read < bufsize) { f_data_read += flat_file->blocklens[j]; total_blks_to_read++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_read_lists = total_blks_to_read/MAX_ARRAY_SIZE; extra_blks = total_blks_to_read%MAX_ARRAY_SIZE; mem_offsets = buf; mem_lengths = 0; /* if at least one full readlist, allocate file arrays at max array size and don't free until very end */ if (n_read_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int32_t)); } /* if there's no full readlist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(extra_blks* sizeof(int32_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_read_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_frd_size; mem_lengths = st_frd_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ pvfs_read_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_read_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_frd_size; } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - (int32_t) mem_lengths - (int32_t) mem_offsets + (int32_t) buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ pvfs_read_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_read = 0; n_filetypes = st_n_filetypes; frd_size = st_frd_size; brd_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_read < bufsize) { k = start_k; new_buffer_read = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) { end_brd_size = new_buffer_read + flat_buf->blocklens[k] - (bufsize - size_read); new_buffer_read = bufsize - size_read; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } else new_buffer_read = brd_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ j = start_j; new_file_read = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) { if(file_list_count) { if((new_file_read + flat_file->blocklens[j]) > new_buffer_read) { end_frd_size = new_buffer_read - new_file_read; new_file_read = new_buffer_read; j--; } else { new_file_read += flat_file->blocklens[j]; end_frd_size = flat_file->blocklens[j]; } } else { if (frd_size > new_buffer_read) { new_file_read = new_buffer_read; frd_size = new_file_read; } else new_file_read = frd_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_read = 0; mem_list_count = 0; while (new_buffer_read < new_file_read) { if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) { end_brd_size = new_file_read - new_buffer_read; new_buffer_read = new_file_read; k--; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { new_buffer_read = brd_size; if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_read < new_file_read) */ } /* if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ /* fakes filling the readlist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_brd_size) brd_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { brd_size = flat_buf->blocklens[k] - end_brd_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_frd_size) frd_size = flat_file->blocklens[(j+1)% flat_file->count]; else { frd_size = flat_file->blocklens[j] - end_frd_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_read += new_buffer_read; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; } /* while (size_read < bufsize) */ mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *)); mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int)); file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t)); file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t)); size_read = 0; n_filetypes = st_n_filetypes; frd_size = st_frd_size; brd_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_frd_size and new_brd_size */ while (size_read < bufsize) { k = start_k; new_buffer_read = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k] + size_read) > bufsize) { end_brd_size = new_buffer_read + flat_buf->blocklens[k] - (bufsize - size_read); new_buffer_read = bufsize - size_read; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } else new_buffer_read = brd_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ j = start_j; new_file_read = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_read < new_buffer_read)) { if(file_list_count) { if((new_file_read + flat_file->blocklens[j]) > new_buffer_read) { end_frd_size = new_buffer_read - new_file_read; new_file_read = new_buffer_read; j--; } else { new_file_read += flat_file->blocklens[j]; end_frd_size = flat_file->blocklens[j]; } } else { if (frd_size > new_buffer_read) { new_file_read = new_buffer_read; frd_size = new_file_read; } else new_file_read = frd_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_read = 0; mem_list_count = 0; while (new_buffer_read < new_file_read) { if(mem_list_count) { if((new_buffer_read + flat_buf->blocklens[k]) > new_file_read) { end_brd_size = new_file_read - new_buffer_read; new_buffer_read = new_file_read; k--; } else { new_buffer_read += flat_buf->blocklens[k]; end_brd_size = flat_buf->blocklens[k]; } } else { new_buffer_read = brd_size; if (brd_size > (bufsize - size_read)) { new_buffer_read = bufsize - size_read; brd_size = new_buffer_read; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_read < new_file_read) */ } /* if ((new_file_read < new_buffer_read) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_read < bufsize-size_read)) */ /* fills the allocated readlist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = (char*)((char *)buf + buftype_extent* (buf_count/flat_buf->count) + (int)flat_buf->indices[k]); if(!i) { mem_lengths[0] = brd_size; mem_offsets[0] += flat_buf->blocklens[k] - brd_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_brd_size; if (flat_buf->blocklens[k] == end_brd_size) brd_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { brd_size = flat_buf->blocklens[k] - end_brd_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + n_filetypes * filetype_extent; if (!i) { file_lengths[0] = frd_size; file_offsets[0] += flat_file->blocklens[j] - frd_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_frd_size; if (flat_file->blocklens[j] == end_frd_size) frd_size = flat_file->blocklens[(j+1)% flat_file->count]; else { frd_size = flat_file->blocklens[j] - end_frd_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ /* printf("about to call read_list in noncontig/noncontig\n"); printf("offsets and lengths in terms of integers\n"); printf("\nmem_list_count = %d\n", mem_list_count); for (i=0; i<mem_list_count; i++) { printf("mem_offsets[%2d] = %2d ", i, (int)(mem_offsets[i] - (int)buf)/4); printf("mem_lengths[%2d] = %2d\n", i, mem_lengths[i]/4); } printf("\nfile_list_count = %d\n", file_list_count); for (i=0; i<file_list_count; i++) { printf("file_offsets[%2d] = %2d ", i, (int)file_offsets[i]/4); printf("file_lengths[%2d] = %2d\n", i, file_lengths[i]/4); } printf("\n\n"); */ pvfs_read_list(fd->fd_sys,mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths); size_read += new_buffer_read; start_k = k; start_j = j; } /* while (size_read < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually read and placed in buf by ADIOI_BUFFERED_READ. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
int MPIOI_File_read(MPI_File fh, MPI_Offset offset, int file_ptr_type, void *buf, int count, MPI_Datatype datatype, char *myname, MPI_Status *status) { int error_code, buftype_is_contig, filetype_is_contig; MPI_Count datatype_size; ADIO_File adio_fh; ADIO_Offset off, bufsize; void *xbuf=NULL, *e32_buf=NULL; ROMIO_THREAD_CS_ENTER(); adio_fh = MPIO_File_resolve(fh); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code); MPIO_CHECK_COUNT(adio_fh, count, myname, error_code); MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code); if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "**iobadoffset", 0); error_code = MPIO_Err_return_file(adio_fh, error_code); goto fn_exit; } /* --END ERROR HANDLING-- */ MPI_Type_size_x(datatype, &datatype_size); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code); /* --END ERROR HANDLING-- */ if (count*datatype_size == 0) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif error_code = MPI_SUCCESS; goto fn_exit; } /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code); MPIO_CHECK_READABLE(adio_fh, myname, error_code); MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code); /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig); ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code); xbuf = buf; if (adio_fh->is_external32) { MPI_Aint e32_size = 0; error_code = MPIU_datatype_full_size(datatype, &e32_size); if (error_code != MPI_SUCCESS) goto fn_exit; e32_buf = ADIOI_Malloc(e32_size*count); xbuf = e32_buf; } if (buftype_is_contig && filetype_is_contig) { /* convert count and offset to bytes */ bufsize = datatype_size * count; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = adio_fh->disp + adio_fh->etype_size * offset; } else /* ADIO_INDIVIDUAL */ { off = adio_fh->fp_ind; } /* if atomic mode requested, lock (exclusive) the region, because there could be a concurrent noncontiguous request. */ if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) { ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize); } ADIO_ReadContig(adio_fh, xbuf, count, datatype, file_ptr_type, off, status, &error_code); if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) { ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize); } } else { ADIO_ReadStrided(adio_fh, xbuf, count, datatype, file_ptr_type, offset, status, &error_code); /* For strided and atomic mode, locking is done in ADIO_ReadStrided */ } /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) error_code = MPIO_Err_return_file(adio_fh, error_code); /* --END ERROR HANDLING-- */ if (e32_buf != NULL) { error_code = MPIU_read_external32_conversion_fn(buf, datatype, count, e32_buf); ADIOI_Free(e32_buf); } fn_exit: ROMIO_THREAD_CS_EXIT(); return error_code; }
int ADIO_Type_create_darray(int size, int rank, int ndims, int *array_of_gsizes, int *array_of_distribs, int *array_of_dargs, int *array_of_psizes, int order, MPI_Datatype oldtype, MPI_Datatype *newtype) { MPI_Datatype type_old, type_new, types[3]; int procs, tmp_rank, i, tmp_size, blklens[3], *coords; MPI_Aint *st_offsets, orig_extent, disps[3]; MPI_Type_extent(oldtype, &orig_extent); /* calculate position in Cartesian grid as MPI would (row-major ordering) */ coords = (int *) ADIOI_Malloc(ndims*sizeof(int)); procs = size; tmp_rank = rank; for (i=0; i<ndims; i++) { procs = procs/array_of_psizes[i]; coords[i] = tmp_rank/procs; tmp_rank = tmp_rank % procs; } st_offsets = (MPI_Aint *) ADIOI_Malloc(ndims*sizeof(MPI_Aint)); type_old = oldtype; if (order == MPI_ORDER_FORTRAN) { /* dimension 0 changes fastest */ for (i=0; i<ndims; i++) { switch(array_of_distribs[i]) { case MPI_DISTRIBUTE_BLOCK: MPIOI_Type_block(array_of_gsizes, i, ndims, array_of_psizes[i], coords[i], array_of_dargs[i], order, orig_extent, type_old, &type_new, st_offsets+i); break; case MPI_DISTRIBUTE_CYCLIC: MPIOI_Type_cyclic(array_of_gsizes, i, ndims, array_of_psizes[i], coords[i], array_of_dargs[i], order, orig_extent, type_old, &type_new, st_offsets+i); break; case MPI_DISTRIBUTE_NONE: /* treat it as a block distribution on 1 process */ MPIOI_Type_block(array_of_gsizes, i, ndims, 1, 0, MPI_DISTRIBUTE_DFLT_DARG, order, orig_extent, type_old, &type_new, st_offsets+i); break; } if (i) MPI_Type_free(&type_old); type_old = type_new; } /* add displacement and UB */ disps[1] = st_offsets[0]; tmp_size = 1; for (i=1; i<ndims; i++) { tmp_size *= array_of_gsizes[i-1]; disps[1] += tmp_size*st_offsets[i]; } /* rest done below for both Fortran and C order */ } else /* order == MPI_ORDER_C */ { /* dimension ndims-1 changes fastest */ for (i=ndims-1; i>=0; i--) { switch(array_of_distribs[i]) { case MPI_DISTRIBUTE_BLOCK: MPIOI_Type_block(array_of_gsizes, i, ndims, array_of_psizes[i], coords[i], array_of_dargs[i], order, orig_extent, type_old, &type_new, st_offsets+i); break; case MPI_DISTRIBUTE_CYCLIC: MPIOI_Type_cyclic(array_of_gsizes, i, ndims, array_of_psizes[i], coords[i], array_of_dargs[i], order, orig_extent, type_old, &type_new, st_offsets+i); break; case MPI_DISTRIBUTE_NONE: /* treat it as a block distribution on 1 process */ MPIOI_Type_block(array_of_gsizes, i, ndims, array_of_psizes[i], coords[i], MPI_DISTRIBUTE_DFLT_DARG, order, orig_extent, type_old, &type_new, st_offsets+i); break; } if (i != ndims-1) MPI_Type_free(&type_old); type_old = type_new; } /* add displacement and UB */ disps[1] = st_offsets[ndims-1]; tmp_size = 1; for (i=ndims-2; i>=0; i--) { tmp_size *= array_of_gsizes[i+1]; disps[1] += tmp_size*st_offsets[i]; } } disps[1] *= orig_extent; disps[2] = orig_extent; for (i=0; i<ndims; i++) disps[2] *= array_of_gsizes[i]; disps[0] = 0; blklens[0] = blklens[1] = blklens[2] = 1; types[0] = MPI_LB; types[1] = type_new; types[2] = MPI_UB; MPI_Type_struct(3, blklens, disps, types, newtype); MPI_Type_free(&type_new); ADIOI_Free(st_offsets); ADIOI_Free(coords); return MPI_SUCCESS; }
void ADIO_Init(int *argc, char ***argv, int *error_code) { #if defined(ROMIO_XFS) || defined(ROMIO_LUSTRE) char *c; #endif ADIOI_UNREFERENCED_ARG(argc); ADIOI_UNREFERENCED_ARG(argv); /* initialize the linked list containing flattened datatypes */ ADIOI_Flatlist = (ADIOI_Flatlist_node *) ADIOI_Malloc(sizeof(ADIOI_Flatlist_node)); ADIOI_Flatlist->type = MPI_DATATYPE_NULL; ADIOI_Flatlist->next = NULL; ADIOI_Flatlist->blocklens = NULL; ADIOI_Flatlist->indices = NULL; #if defined(ROMIO_XFS) || defined(ROMIO_LUSTRE) c = getenv("MPIO_DIRECT_READ"); if (c && (!strcmp(c, "true") || !strcmp(c, "TRUE"))) ADIOI_Direct_read = 1; else ADIOI_Direct_read = 0; c = getenv("MPIO_DIRECT_WRITE"); if (c && (!strcmp(c, "true") || !strcmp(c, "TRUE"))) ADIOI_Direct_write = 1; else ADIOI_Direct_write = 0; #endif /* Assume system-wide hints won't change between runs: move hint processing * from ADIO_Open to here */ /* FIXME should be checking error code from MPI_Info_create here */ MPI_Info_create(&ADIOI_syshints); ADIOI_process_system_hints(ADIOI_syshints); #ifdef ADIOI_MPE_LOGGING { MPE_Log_get_state_eventIDs( &ADIOI_MPE_open_a, &ADIOI_MPE_open_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_read_a, &ADIOI_MPE_read_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_write_a, &ADIOI_MPE_write_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_lseek_a, &ADIOI_MPE_lseek_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_close_a, &ADIOI_MPE_close_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_writelock_a, &ADIOI_MPE_writelock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_readlock_a, &ADIOI_MPE_readlock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_unlock_a, &ADIOI_MPE_unlock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_postwrite_a, &ADIOI_MPE_postwrite_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_openinternal_a, &ADIOI_MPE_openinternal_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_stat_a, &ADIOI_MPE_stat_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_iread_a, &ADIOI_MPE_iread_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_iwrite_a, &ADIOI_MPE_iwrite_b); int comm_world_rank; MPI_Comm_rank( MPI_COMM_WORLD, &comm_world_rank ); if ( comm_world_rank == 0 ) { MPE_Describe_state( ADIOI_MPE_open_a, ADIOI_MPE_open_b, "open", "orange" ); MPE_Describe_state( ADIOI_MPE_read_a, ADIOI_MPE_read_b, "read", "green" ); MPE_Describe_state( ADIOI_MPE_write_a, ADIOI_MPE_write_b, "write", "blue" ); MPE_Describe_state( ADIOI_MPE_lseek_a, ADIOI_MPE_lseek_b, "lseek", "red" ); MPE_Describe_state( ADIOI_MPE_close_a, ADIOI_MPE_close_b, "close", "grey" ); MPE_Describe_state( ADIOI_MPE_writelock_a, ADIOI_MPE_writelock_b, "writelock", "plum" ); MPE_Describe_state( ADIOI_MPE_readlock_a, ADIOI_MPE_readlock_b, "readlock", "magenta" ); MPE_Describe_state( ADIOI_MPE_unlock_a, ADIOI_MPE_unlock_b, "unlock", "purple" ); MPE_Describe_state( ADIOI_MPE_postwrite_a, ADIOI_MPE_postwrite_b, "postwrite", "ivory" ); MPE_Describe_state( ADIOI_MPE_openinternal_a, ADIOI_MPE_openinternal_b, "open system", "blue"); MPE_Describe_state( ADIOI_MPE_stat_a, ADIOI_MPE_stat_b, "stat", "purple"); MPE_Describe_state( ADIOI_MPE_iread_a, ADIOI_MPE_iread_b, "iread", "purple"); MPE_Describe_state( ADIOI_MPE_iwrite_a, ADIOI_MPE_iwrite_b, "iwrite", "purple"); } } #endif *error_code = MPI_SUCCESS; MPI_Op_create(my_consensus, 1, &ADIO_same_amode); }
/* ADIOI_Count_contiguous_blocks * * Returns number of contiguous blocks in type, and also updates * curr_index to reflect the space for the additional blocks. * * ASSUMES THAT TYPE IS NOT A BASIC!!! */ MPI_Count ADIOI_Count_contiguous_blocks(MPI_Datatype datatype, MPI_Count *curr_index) { int i, n; MPI_Count count=0, prev_index, num, basic_num; int top_count, combiner, old_combiner, old_is_contig; int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes; int *ints; MPI_Aint *adds; /* Make no assumptions about +/- sign on these */ MPI_Datatype *types; MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner); ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int)); adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint)); types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype)); MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types); switch (combiner) { #ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP case MPI_COMBINER_DUP: MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count = ADIOI_Count_contiguous_blocks(types[0], curr_index); else { count = 1; (*curr_index)++; } break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY case MPI_COMBINER_SUBARRAY: { int dims = ints[0]; MPI_Datatype stype; ADIO_Type_create_subarray(dims, &ints[1], /* sizes */ &ints[dims+1], /* subsizes */ &ints[2*dims+1], /* starts */ ints[3*dims+1], /* order */ types[0], /* type */ &stype); count = ADIOI_Count_contiguous_blocks(stype, curr_index); /* curr_index will have already been updated; just pass * count back up. */ MPI_Type_free(&stype); } break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY case MPI_COMBINER_DARRAY: { int dims = ints[2]; MPI_Datatype dtype; ADIO_Type_create_darray(ints[0], /* size */ ints[1], /* rank */ dims, &ints[3], /* gsizes */ &ints[dims+3], /* distribs */ &ints[2*dims+3], /* dargs */ &ints[3*dims+3], /* psizes */ ints[4*dims+3], /* order */ types[0], &dtype); count = ADIOI_Count_contiguous_blocks(dtype, curr_index); /* curr_index will have already been updated; just pass * count back up. */ MPI_Type_free(&dtype); } break; #endif case MPI_COMBINER_CONTIGUOUS: top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count = ADIOI_Count_contiguous_blocks(types[0], curr_index); else count = 1; if (prev_index == *curr_index) /* simplest case, made up of basic or contiguous types */ (*curr_index)++; else { /* made up of noncontiguous derived types */ num = *curr_index - prev_index; count *= top_count; *curr_index += (top_count - 1)*num; } break; case MPI_COMBINER_VECTOR: case MPI_COMBINER_HVECTOR: case MPI_COMBINER_HVECTOR_INTEGER: top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count = ADIOI_Count_contiguous_blocks(types[0], curr_index); else count = 1; if (prev_index == *curr_index) { /* simplest case, vector of basic or contiguous types */ count = top_count; *curr_index += count; } else { /* vector of noncontiguous derived types */ num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklen times and then strided. */ count *= ints[1] * top_count; /* First one */ *curr_index += (ints[1] - 1)*num; /* Now repeat with strides. */ num = *curr_index - prev_index; *curr_index += (top_count - 1)*num; } break; case MPI_COMBINER_INDEXED: case MPI_COMBINER_HINDEXED: case MPI_COMBINER_HINDEXED_INTEGER: top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count = ADIOI_Count_contiguous_blocks(types[0], curr_index); else count = 1; if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ count = top_count; *curr_index += count; } else { /* indexed type made up of noncontiguous derived types */ basic_num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. */ *curr_index += (ints[1]-1) * basic_num; count *= ints[1]; /* Now repeat with strides. */ for (i=1; i<top_count; i++) { count += ints[1+i] * basic_num; *curr_index += ints[1+i] * basic_num; } } break; #if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK case MPI_COMBINER_HINDEXED_BLOCK: #endif case MPI_COMBINER_INDEXED_BLOCK: top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count = ADIOI_Count_contiguous_blocks(types[0], curr_index); else count = 1; if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ count = top_count; *curr_index += count; } else { /* indexed type made up of noncontiguous derived types */ basic_num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. */ *curr_index += (ints[1]-1) * basic_num; count *= ints[1]; /* Now repeat with strides. */ *curr_index += (top_count-1) * count; count *= top_count; } break; case MPI_COMBINER_STRUCT: case MPI_COMBINER_STRUCT_INTEGER: top_count = ints[0]; count = 0; for (n=0; n<top_count; n++) { MPI_Type_get_envelope(types[n], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[n], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) count += ADIOI_Count_contiguous_blocks(types[n], curr_index); if (prev_index == *curr_index) { /* simplest case, current type is basic or contiguous types */ count++; (*curr_index)++; } else { /* current type made up of noncontiguous derived types */ /* The current type has to be replicated blocklens[n] times */ num = *curr_index - prev_index; count += (ints[1+n]-1)*num; (*curr_index) += (ints[1+n]-1)*num; } } break; case MPI_COMBINER_RESIZED: /* treat it as a struct with lb, type, ub */ /* add 2 for lb and ub */ (*curr_index) += 2; count += 2; /* add for datatype */ MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { count += ADIOI_Count_contiguous_blocks(types[0], curr_index); } else { /* basic or contiguous type */ count++; (*curr_index)++; } break; default: /* TODO: FIXME */ DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Count_contiguous_blocks, combiner = %d\n", combiner); MPI_Abort(MPI_COMM_WORLD, 1); } #ifndef MPISGI /* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't return new datatypes. Therefore no need to free. */ for (i=0; i<ntypes; i++) { MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes, &old_combiner); if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i); } #endif ADIOI_Free(ints); ADIOI_Free(adds); ADIOI_Free(types); return count; }
void ADIOI_PFS_IwriteContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Request *request, int *error_code) { long *id_sys; ADIO_Offset off; int len, typesize, err; static char myname[] = "ADIOI_PFS_IWRITECONTIG"; *request = ADIOI_Malloc_request(); (*request)->optype = ADIOI_WRITE; (*request)->fd = fd; (*request)->datatype = datatype; MPI_Type_size(datatype, &typesize); len = count * typesize; id_sys = (long *) ADIOI_Malloc(sizeof(long)); (*request)->handle = (void *) id_sys; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : offset; lseek(fd->fd_sys, off, SEEK_SET); *id_sys = _iwrite(fd->fd_sys, buf, len); if ((*id_sys == -1) && (errno == EQNOMID)) { /* the man pages say EMREQUEST, but in reality errno is set to EQNOMID! */ /* exceeded the max. no. of outstanding requests. */ /* complete all previous async. requests */ ADIOI_Complete_async(error_code); if (error_code != MPI_SUCCESS) return; /* try again */ *id_sys = _iwrite(fd->fd_sys, buf, len); if ((*id_sys == -1) && (errno == EQNOMID)) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); return; } } else if (*id_sys == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); return; } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; (*request)->queued = 1; (*request)->nbytes = len; ADIOI_Add_req_to_list(request); fd->async_count++; fd->fp_sys_posn = -1; /* set it to null. */ if (*id_sys == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; }
void ADIOI_PVFS_WriteStridedListIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Since PVFS does not support file locking, can't do buffered writes as on Unix */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, disp, start_off; int flag, st_fwr_size, st_n_filetypes; int new_bwr_size, new_fwr_size, err_flag=0; int mem_list_count, file_list_count; char ** mem_offsets; int64_t *file_offsets; int *mem_lengths; int32_t *file_lengths; int total_blks_to_write; int max_mem_list, max_file_list; int b_blks_wrote; int f_data_wrote; int size_wrote=0, n_write_lists, extra_blks; int end_bwr_size, end_fwr_size; int start_k, start_j, new_file_write, new_buffer_write; int start_mem_offset; #define MAX_ARRAY_SIZE 1024 #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_PVFS_WRITESTRIDED"; #endif /* PFS file pointer modes are not relevant here, because PFS does not support strided accesses. */ if ((fd->iomode != M_ASYNC) && (fd->iomode != M_UNIX)) { FPRINTF(stderr, "ADIOI_PVFS_WriteStrided: only M_ASYNC and M_UNIX iomodes are valid\n"); MPI_Abort(MPI_COMM_WORLD, 1); } if (fd->atomicity) { FPRINTF(stderr, "ROMIO cannot guarantee atomicity of noncontiguous accesses in atomic mode, as PVFS doesn't support file locking. Use nonatomic mode and its associated semantics.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ int64_t file_offsets; int32_t file_lengths; ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); } else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); file_list_count = 1; file_offsets = off; file_lengths = 0; total_blks_to_write = count*flat_buf->count; b_blks_wrote = 0; /* allocate arrays according to max usage */ if (total_blks_to_write > MAX_ARRAY_SIZE) mem_list_count = MAX_ARRAY_SIZE; else mem_list_count = total_blks_to_write; mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*)); mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int)); j = 0; /* step through each block in memory, filling memory arrays */ while (b_blks_wrote < total_blks_to_write) { for (i=0; i<flat_buf->count; i++) { mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = ((char*)buf + j*buftype_extent + flat_buf->indices[i]); mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = flat_buf->blocklens[i]; file_lengths += flat_buf->blocklens[i]; b_blks_wrote++; if (!(b_blks_wrote % MAX_ARRAY_SIZE) || (b_blks_wrote == total_blks_to_write)) { /* in the case of the last read list call, adjust mem_list_count */ if (b_blks_wrote == total_blks_to_write) { mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE; /* in case last read list call fills max arrays */ if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE; } pvfs_write_list(fd->fd_sys ,mem_list_count, mem_offsets, mem_lengths, file_list_count, &file_offsets, &file_lengths); /* in the case of the last read list call, leave here */ if (b_blks_wrote == total_blks_to_write) break; file_offsets += file_lengths; file_lengths = 0; } } /* for (i=0; i<flat_buf->count; i++) */ j++; } /* while (b_blks_wrote < total_blks_to_write) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; #ifdef PRINT_ERR_MSG *error_code = (err_flag) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err_flag) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif ADIOI_Delete_flattened(datatype); return; } /* if (!buftype_is_contig && filetype_is_contig) */ /* already know that file is noncontiguous from above */ /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; /* for each case - ADIO_Individual pointer or explicit, find offset (file offset in bytes), n_filetypes (how many filetypes into file to start), fwr_size (remaining amount of data in present file block), and st_index (start point in terms of blocks in starting filetype) */ if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } /* while (!flag) */ } /* if (file_ptr_type == ADIOI_INDIVIDUAL) */ else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } /* else [file_ptr_type != ADIOI_INDIVIDUAL] */ start_off = offset; st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ int mem_lengths; char *mem_offsets; i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; mem_list_count = 1; /* determine how many blocks in file to read */ f_data_wrote = ADIOI_MIN(st_fwr_size, bufsize); total_blks_to_write = 1; j++; while (f_data_wrote < bufsize) { f_data_wrote += flat_file->blocklens[j]; total_blks_to_write++; if (j<(flat_file->count-1)) j++; else j = 0; } j = st_index; n_filetypes = st_n_filetypes; n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE; extra_blks = total_blks_to_write%MAX_ARRAY_SIZE; mem_offsets = buf; mem_lengths = 0; /* if at least one full readlist, allocate file arrays at max array size and don't free until very end */ if (n_write_lists) { file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE* sizeof(int32_t)); } /* if there's no full readlist allocate file arrays according to needed size (extra_blks) */ else { file_offsets = (int64_t*)ADIOI_Malloc(extra_blks* sizeof(int64_t)); file_lengths = (int32_t*)ADIOI_Malloc(extra_blks* sizeof(int32_t)); } /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */ for (i=0; i<n_write_lists; i++) { file_list_count = MAX_ARRAY_SIZE; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; mem_lengths = st_fwr_size; } for (k=0; k<MAX_ARRAY_SIZE; k++) { if (i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; file_lengths[k] = flat_file->blocklens[j]; mem_lengths += file_lengths[k]; } if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */ pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); mem_offsets += mem_lengths; mem_lengths = 0; } /* for (i=0; i<n_write_lists; i++) */ /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */ if (extra_blks) { file_list_count = extra_blks; if(!i) { file_offsets[0] = offset; file_lengths[0] = st_fwr_size; } for (k=0; k<extra_blks; k++) { if(i || k) { file_offsets[k] = disp + n_filetypes*filetype_extent + flat_file->indices[j]; if (k == (extra_blks - 1)) { file_lengths[k] = bufsize - (int32_t) mem_lengths - (int32_t) mem_offsets + (int32_t) buf; } else file_lengths[k] = flat_file->blocklens[j]; } /* if(i || k) */ mem_lengths += file_lengths[k]; if (j<(flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } } /* for (k=0; k<extra_blks; k++) */ pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets, &mem_lengths, file_list_count, file_offsets, file_lengths); } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; max_mem_list = 0; max_file_list = 0; /* run through and file max_file_list and max_mem_list so that you can allocate the file and memory arrays less than MAX_ARRAY_SIZE if possible */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fakes filling the writelist arrays of lengths found above */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { if(i) { if (i == (mem_list_count - 1)) { if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { if (i) { if (i == (file_list_count - 1)) { if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ size_wrote += new_buffer_write; start_k = k; start_j = j; if (max_mem_list < mem_list_count) max_mem_list = mem_list_count; if (max_file_list < file_list_count) max_file_list = file_list_count; if (max_mem_list == max_mem_list == MAX_ARRAY_SIZE) break; } /* while (size_wrote < bufsize) */ mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *)); mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int)); file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t)); file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t)); size_wrote = 0; n_filetypes = st_n_filetypes; fwr_size = st_fwr_size; bwr_size = flat_buf->blocklens[0]; buf_count = 0; start_mem_offset = 0; start_k = k = 0; start_j = st_index; /* this section calculates mem_list_count and file_list_count and also finds the possibly odd sized last array elements in new_fwr_size and new_bwr_size */ while (size_wrote < bufsize) { k = start_k; new_buffer_write = 0; mem_list_count = 0; while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) { /* find mem_list_count and file_list_count such that both are less than MAX_ARRAY_SIZE, the sum of their lengths are equal, and the sum of all the data read and data to be read in the next immediate read list is less than bufsize */ if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k] + size_wrote) > bufsize) { end_bwr_size = new_buffer_write + flat_buf->blocklens[k] - (bufsize - size_wrote); new_buffer_write = bufsize - size_wrote; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } else new_buffer_write = bwr_size; } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ j = start_j; new_file_write = 0; file_list_count = 0; while ((file_list_count < MAX_ARRAY_SIZE) && (new_file_write < new_buffer_write)) { if(file_list_count) { if((new_file_write + flat_file->blocklens[j]) > new_buffer_write) { end_fwr_size = new_buffer_write - new_file_write; new_file_write = new_buffer_write; j--; } else { new_file_write += flat_file->blocklens[j]; end_fwr_size = flat_file->blocklens[j]; } } else { if (fwr_size > new_buffer_write) { new_file_write = new_buffer_write; fwr_size = new_file_write; } else new_file_write = fwr_size; } file_list_count++; if (j < (flat_file->count - 1)) j++; else j = 0; k = start_k; if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) { new_buffer_write = 0; mem_list_count = 0; while (new_buffer_write < new_file_write) { if(mem_list_count) { if((new_buffer_write + flat_buf->blocklens[k]) > new_file_write) { end_bwr_size = new_file_write - new_buffer_write; new_buffer_write = new_file_write; k--; } else { new_buffer_write += flat_buf->blocklens[k]; end_bwr_size = flat_buf->blocklens[k]; } } else { new_buffer_write = bwr_size; if (bwr_size > (bufsize - size_wrote)) { new_buffer_write = bufsize - size_wrote; bwr_size = new_buffer_write; } } mem_list_count++; k = (k + 1)%flat_buf->count; } /* while (new_buffer_write < new_file_write) */ } /* if ((new_file_write < new_buffer_write) && (file_list_count == MAX_ARRAY_SIZE)) */ } /* while ((mem_list_count < MAX_ARRAY_SIZE) && (new_buffer_write < bufsize-size_wrote)) */ /* fills the allocated readlist arrays */ k = start_k; j = start_j; for (i=0; i<mem_list_count; i++) { mem_offsets[i] = ((char*)buf + buftype_extent* (buf_count/flat_buf->count) + (int)flat_buf->indices[k]); if(!i) { mem_lengths[0] = bwr_size; mem_offsets[0] += flat_buf->blocklens[k] - bwr_size; } else { if (i == (mem_list_count - 1)) { mem_lengths[i] = end_bwr_size; if (flat_buf->blocklens[k] == end_bwr_size) bwr_size = flat_buf->blocklens[(k+1)% flat_buf->count]; else { bwr_size = flat_buf->blocklens[k] - end_bwr_size; k--; buf_count--; } } else { mem_lengths[i] = flat_buf->blocklens[k]; } } buf_count++; k = (k + 1)%flat_buf->count; } /* for (i=0; i<mem_list_count; i++) */ for (i=0; i<file_list_count; i++) { file_offsets[i] = disp + flat_file->indices[j] + n_filetypes * filetype_extent; if (!i) { file_lengths[0] = fwr_size; file_offsets[0] += flat_file->blocklens[j] - fwr_size; } else { if (i == (file_list_count - 1)) { file_lengths[i] = end_fwr_size; if (flat_file->blocklens[j] == end_fwr_size) fwr_size = flat_file->blocklens[(j+1)% flat_file->count]; else { fwr_size = flat_file->blocklens[j] - end_fwr_size; j--; } } else file_lengths[i] = flat_file->blocklens[j]; } if (j < flat_file->count - 1) j++; else { j = 0; n_filetypes++; } } /* for (i=0; i<file_list_count; i++) */ pvfs_write_list(fd->fd_sys,mem_list_count, mem_offsets, mem_lengths, file_list_count, file_offsets, file_lengths); size_wrote += new_buffer_write; start_k = k; start_j = j; } /* while (size_wrote < bufsize) */ ADIOI_Free(mem_offsets); ADIOI_Free(mem_lengths); } ADIOI_Free(file_offsets); ADIOI_Free(file_lengths); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; #ifdef PRINT_ERR_MSG *error_code = (err_flag) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err_flag) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { char *value, *value_in_fd; int flag, tmp_val, str_factor=-1, str_unit=-1, start_iodev=-1; struct sattr attr; int err, myrank, fd_sys, perm, amode, old_mask; if (!(fd->info)) { /* This must be part of the open call. can set striping parameters if necessary. */ MPI_Info_create(&(fd->info)); /* has user specified striping or server buffering parameters and do they have the same value on all processes? */ if (users_info != MPI_INFO_NULL) { value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag); if (flag) { str_factor=atoi(value); tmp_val = str_factor; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != str_factor) { FPRINTF(stderr, "ADIOI_PFS_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); if (flag) { str_unit=atoi(value); tmp_val = str_unit; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != str_unit) { FPRINTF(stderr, "ADIOI_PFS_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, value, &flag); if (flag) { start_iodev=atoi(value); tmp_val = start_iodev; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); if (tmp_val != start_iodev) { FPRINTF(stderr, "ADIOI_PFS_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } /* if user has specified striping info, process 0 tries to set it */ if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { MPI_Comm_rank(fd->comm, &myrank); if (!myrank) { if (fd->perm == ADIO_PERM_NULL) { old_mask = umask(022); umask(old_mask); perm = old_mask ^ 0666; } else perm = fd->perm; amode = 0; if (fd->access_mode & ADIO_CREATE) amode = amode | O_CREAT; if (fd->access_mode & ADIO_RDONLY) amode = amode | O_RDONLY; if (fd->access_mode & ADIO_WRONLY) amode = amode | O_WRONLY; if (fd->access_mode & ADIO_RDWR) amode = amode | O_RDWR; if (fd->access_mode & ADIO_EXCL) amode = amode | O_EXCL; fd_sys = open(fd->filename, amode, perm); err = fcntl(fd_sys, F_GETSATTR, &attr); if (!err) { if (str_unit > 0) attr.s_sunitsize = str_unit; if ((start_iodev >= 0) && (start_iodev < attr.s_sfactor)) attr.s_start_sdir = start_iodev; if ((str_factor > 0) && (str_factor < attr.s_sfactor)) attr.s_sfactor = str_factor; err = fcntl(fd_sys, F_SETSATTR, &attr); } close(fd_sys); } MPI_Barrier(fd->comm); } /* Has user asked for pfs server buffering to be turned on? If so, mark it as true in fd->info and turn it on in ADIOI_PFS_Open after the file is opened */ MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL, value, &flag); if (flag && (!strcmp(value, "true"))) MPI_Info_set(fd->info, "pfs_svr_buf", "true"); else MPI_Info_set(fd->info, "pfs_svr_buf", "false"); ADIOI_Free(value); } else MPI_Info_set(fd->info, "pfs_svr_buf", "false"); /* set the values for collective I/O and data sieving parameters */ ADIOI_GEN_SetInfo(fd, users_info, error_code); } else { /* The file has been opened previously and fd->fd_sys is a valid file descriptor. cannot set striping parameters now. */ /* set the values for collective I/O and data sieving parameters */ ADIOI_GEN_SetInfo(fd, users_info, error_code); /* has user specified value for pfs_svr_buf? */ if (users_info != MPI_INFO_NULL) { value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL, value, &flag); if (flag && (!strcmp(value, "true") || !strcmp(value, "false"))) { value_in_fd = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL, value_in_fd, &flag); if (strcmp(value, value_in_fd)) { if (!strcmp(value, "true")) { err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE); if (!err) MPI_Info_set(fd->info, "pfs_svr_buf", "true"); } else { err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, FALSE); if (!err) MPI_Info_set(fd->info, "pfs_svr_buf", "false"); } } ADIOI_Free(value_in_fd); } ADIOI_Free(value); } } *error_code = MPI_SUCCESS; }
void ADIOI_PVFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Since PVFS does not support file locking, can't do buffered writes as on Unix */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; struct iovec *iov; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent, indx; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp; int flag, new_bwr_size, new_fwr_size, err_flag=0; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_PVFS_WRITESTRIDED"; #endif #ifdef HAVE_PVFS_LISTIO if ( fd->hints->fs_hints.pvfs.listio_write == ADIOI_HINT_ENABLE ) { ADIOI_PVFS_WriteStridedListIO(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } #endif /* if hint set to DISABLE or AUTOMATIC, don't use listio */ if (fd->atomicity) { FPRINTF(stderr, "ROMIO cannot guarantee atomicity of noncontiguous accesses in atomic mode, as PVFS doesn't support file locking. Use nonatomic mode and its associated semantics.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. use writev */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; /* There is a limit of 16 on the number of iovecs for readv/writev! */ iov = (struct iovec *) ADIOI_Malloc(16*sizeof(struct iovec)); if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; pvfs_lseek64(fd->fd_sys, off, SEEK_SET); } else off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET); k = 0; for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { iov[k].iov_base = ((char *) buf) + j*buftype_extent + flat_buf->indices[i]; iov[k].iov_len = flat_buf->blocklens[i]; /*FPRINTF(stderr, "%d %d\n", iov[k].iov_base, iov[k].iov_len);*/ off += flat_buf->blocklens[i]; k = (k+1)%16; if (!k) { err = pvfs_writev(fd->fd_sys, iov, 16); if (err == -1) err_flag = 1; } } if (k) { err = pvfs_writev(fd->fd_sys, iov, k); if (err == -1) err_flag = 1; } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(iov); #ifdef PRINT_ERR_MSG *error_code = (err_flag) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err_flag) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif } else { /* noncontiguous in file */ /* split up into several contiguous writes */ /* find starting location in the file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif pvfs_lseek64(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = pvfs_write(fd->fd_sys, ((char *) buf) + i, fwr_size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; k = num = buf_count = 0; indx = flat_buf->indices[0]; j = st_index; off = offset; bwr_size = flat_buf->blocklens[0]; while (num < bufsize) { size = ADIOI_MIN(fwr_size, bwr_size); if (size) { #ifdef PROFILE MPE_Log_event(11, 0, "start seek"); #endif pvfs_lseek64(fd->fd_sys, off, SEEK_SET); #ifdef PROFILE MPE_Log_event(12, 0, "end seek"); MPE_Log_event(5, 0, "start write"); #endif err = pvfs_write(fd->fd_sys, ((char *) buf) + indx, size); #ifdef PROFILE MPE_Log_event(6, 0, "end write"); #endif if (err == -1) err_flag = 1; } new_fwr_size = fwr_size; new_bwr_size = bwr_size; if (size == fwr_size) { /* reached end of contiguous block in file */ if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; new_fwr_size = flat_file->blocklens[j]; if (size != bwr_size) { indx += size; new_bwr_size -= size; } } if (size == bwr_size) { /* reached end of contiguous block in memory */ k = (k + 1)%flat_buf->count; buf_count++; indx = buftype_extent*(buf_count/flat_buf->count) + flat_buf->indices[k]; new_bwr_size = flat_buf->blocklens[k]; if (size != fwr_size) { off += size; new_fwr_size -= size; } } num += size; fwr_size = new_fwr_size; bwr_size = new_bwr_size; } } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; #ifdef PRINT_ERR_MSG *error_code = (err_flag) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err_flag) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, new_brd_size, brd_size, size; int i, j, k, st_index = 0; MPI_Count num, bufsize; int n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype; ADIO_Offset abs_off_in_filetype = 0, new_frd_size, frd_size = 0, st_frd_size; MPI_Count filetype_size, etype_size, buftype_size, partial_read; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off, req_len, sum; ADIO_Offset off, req_off, disp, end_offset = 0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int info_flag; unsigned max_bufsize, readbuf_len; ADIO_Status status1; if (fd->hints->ds_read == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if (!filetype_size) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset) (MPI_Count) buftype_size * (ADIO_Offset) count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset) etype_size *offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset - readbuf_off + 1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; for (j = 0; j < count; j++) { for (i = 0; i < flat_buf->count; i++) { userbuf_off = (ADIO_Offset) j *(ADIO_Offset) buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } } if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); } else { /* noncontiguous in file */ flat_file = ADIOI_Flatten_and_find(fd->filetype); disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset) n_filetypes *filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i = 0; i < flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset) n_filetypes *filetype_extent; } else { n_etypes_in_filetype = filetype_size / etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i = 0; i < flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { /* a count of bytes can overflow. operate on original type instead */ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes * filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { i_offset += frd_size; end_offset = off + frd_size - 1; j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j] == 0) { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); readbuf_off = 0; readbuf_len = 0; readbuf = (char *) ADIOI_Malloc(max_bufsize); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in * frd_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); * err = read(fd->fd_sys, ((char *) buf) + i, frd_size); */ req_off = off; req_len = frd_size; userbuf_off = i_offset; ADIOI_BUFFERED_READ} i_offset += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. * no more I/O needed. off is incremented by frd_size. */ else { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j] == 0) { j = (j + 1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset); } } } else {
void ADIOI_NTFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) { MPI_Datatype copy_etype, copy_filetype; int combiner, i, j, k, filetype_is_contig, ntimes, err,len; ADIOI_Flatlist_node *flat_file; ADIO_Offset curr_fsize, alloc_size, size, done; LARGE_INTEGER *LI; ADIO_Status status; char *buf; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_NTFS_FCNTL"; #endif switch(flag) { case ADIO_FCNTL_SET_VIEW: /* free copies of old etypes and filetypes and delete flattened version of filetype if necessary */ MPI_Type_get_envelope(fd->etype, &i, &j, &k, &combiner); if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->etype)); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (!filetype_is_contig) ADIOI_Delete_flattened(fd->filetype); MPI_Type_get_envelope(fd->filetype, &i, &j, &k, &combiner); if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->filetype)); /* set new info */ ADIO_SetInfo(fd, fcntl_struct->info, &err); /* set new etypes and filetypes */ MPI_Type_get_envelope(fcntl_struct->etype, &i, &j, &k, &combiner); if (combiner == MPI_COMBINER_NAMED) fd->etype = fcntl_struct->etype; else { MPI_Type_contiguous(1, fcntl_struct->etype, ©_etype); MPI_Type_commit(©_etype); fd->etype = copy_etype; } MPI_Type_get_envelope(fcntl_struct->filetype, &i, &j, &k, &combiner); if (combiner == MPI_COMBINER_NAMED) fd->filetype = fcntl_struct->filetype; else { MPI_Type_contiguous(1, fcntl_struct->filetype, ©_filetype); MPI_Type_commit(©_filetype); fd->filetype = copy_filetype; ADIOI_Flatten_datatype(fd->filetype); /* this function will not flatten the filetype if it turns out to be all contiguous. */ } MPI_Type_size(fd->etype, &(fd->etype_size)); fd->disp = fcntl_struct->disp; /* reset MPI-IO file pointer to point to the first byte that can be accessed in this view. */ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (filetype_is_contig) fd->fp_ind = fcntl_struct->disp; else { flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; for (i=0; i<flat_file->count; i++) { if (flat_file->blocklens[i]) { fd->fp_ind = fcntl_struct->disp + flat_file->indices[i]; break; } } } *error_code = MPI_SUCCESS; break; case ADIO_FCNTL_GET_FSIZE: LI = (LARGE_INTEGER *)&fcntl_struct->fsize; LI->LowPart=GetFileSize(fd->fd_sys,&LI->HighPart); *error_code = (LI->LowPart == 0xFFFFFFFF) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; break; case ADIO_FCNTL_SET_DISKSPACE: /* will be called by one process only */ /* On file systems with no preallocation function, I have to explicitly write to allocate space. Since there could be holes in the file, I need to read up to the current file size, write it back, and then write beyond that depending on how much preallocation is needed. read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */ LI = (LARGE_INTEGER *)&curr_fsize; LI->LowPart=GetFileSize(fd->fd_sys,&LI->HighPart); alloc_size = fcntl_struct->diskspace; if(LI->LowPart == 0xFFFFFFFF) { *error_code = MPI_ERR_UNKNOWN; break; } size = ADIOI_MIN(curr_fsize, alloc_size); ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ); done = 0; for (i=0; i<ntimes; i++) { len = ADIOI_MIN(size-done, ADIOI_PREALLOC_BUFSZ); ADIO_ReadContig(fd, buf, len,MPI_BYTE, ADIO_EXPLICIT_OFFSET, done, &status, error_code); if (*error_code != MPI_SUCCESS) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "ADIOI_NTFS_Fcntl: To preallocate disk space, ROMIO needs to read the file and write it back, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.\n"); return; #else *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_PREALLOC_PERM, myname, (char *) 0, (char *) 0); ADIOI_Error(fd, *error_code, myname); return; #endif } ADIO_WriteContig(fd, buf, len,MPI_BYTE, ADIO_EXPLICIT_OFFSET, done, &status, error_code); if (*error_code != MPI_SUCCESS) return; done += len; } if (alloc_size > curr_fsize) { size = alloc_size - curr_fsize; ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; for (i=0; i<ntimes; i++) { memset(buf, 0, ADIOI_PREALLOC_BUFSZ); len = ADIOI_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ); ADIO_WriteContig(fd, buf, len,MPI_BYTE, ADIO_EXPLICIT_OFFSET, done, &status, error_code); if (*error_code != MPI_SUCCESS) return; done += len; } } ADIOI_Free(buf); *error_code = MPI_SUCCESS; break; case ADIO_FCNTL_SET_IOMODE: /* for implementing PFS I/O modes. will not occur in MPI-IO implementation.*/ if (fd->iomode != fcntl_struct->iomode) { fd->iomode = fcntl_struct->iomode; MPI_Barrier(MPI_COMM_WORLD); } *error_code = MPI_SUCCESS; break; case ADIO_FCNTL_SET_ATOMICITY: fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1; *error_code = MPI_SUCCESS; break; default: printf("Unknown flag passed to ADIOI_UFS_Fcntl\n"); MPI_Abort(MPI_COMM_WORLD, 1); } if(*error_code = MPI_SUCCESS) { #ifdef PRINT_ERR_MSG FPRINTF(stderr, "ADIOI_NTFS_Fcntl: %s\n", ad_ntfs_error(GetLastError())); #else *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "I/O Error: %s", ad_ntfs_error(GetLastError())); ADIOI_Error(fd, *error_code, myname); #endif } }
/* #define IO_DEBUG 1 */ void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* borrowed from old-school PVFS (v1) code. A driver for file systems that * cannot or do not support client-side buffering * Does not do data sieving optimization * Does contain write-combining optimization for noncontig in memory, contig in * file */ /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int j, k, st_index=0; off_t err_lseek=-1; ssize_t err=-1; ADIO_Offset fwr_size=0, bwr_size, new_bwr_size, new_fwr_size, i_offset, num; ADIO_Offset bufsize, n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, size, sum; ADIO_Offset abs_off_in_filetype=0, size_in_filetype; MPI_Count filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent, indx; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset off, disp; int flag, err_flag=0; static char myname[] = "ADIOI_NOLOCK_WRITESTRIDED"; #ifdef IO_DEBUG int rank,nprocs; #endif /* --BEGIN ERROR HANDLING-- */ if (fd->atomicity) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_INTERN, "Atomic mode set in I/O function", 0); return; } /* --END ERROR HANDLING-- */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } #ifdef IO_DEBUG MPI_Comm_rank(fd->comm, &rank); MPI_Comm_size(fd->comm, &nprocs); #endif MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; if (!buftype_is_contig && filetype_is_contig) { char *combine_buf, *combine_buf_ptr; ADIO_Offset combine_buf_remain; /* noncontiguous in memory, contiguous in file. use writev */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; /* allocate our "combine buffer" to pack data into before writing */ combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size); combine_buf_ptr = combine_buf; combine_buf_remain = fd->hints->ind_wr_buffer_size; /* seek to the right spot in the file */ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + etype_size * offset; lseek(fd->fd_sys, off, SEEK_SET); } else off = lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); /* loop through all the flattened pieces. combine into buffer until * no more will fit, then write. * * special case of a given piece being bigger than the combine buffer * is also handled. */ for (j=0; j<count; j++) { int i; for (i=0; i<flat_buf->count; i++) { if (flat_buf->blocklens[i] > combine_buf_remain && combine_buf != combine_buf_ptr) { /* there is data in the buffer; write out the buffer so far */ #ifdef IO_DEBUG printf("[%d/%d] nc mem c file (0) writing loc = %Ld sz = %Ld\n", rank, nprocs, off, fd->hints->ind_wr_buffer_size-combine_buf_remain); #endif #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, combine_buf, fd->hints->ind_wr_buffer_size - combine_buf_remain); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif if (err == -1) err_flag = 1; /* reset our buffer info */ combine_buf_ptr = combine_buf; combine_buf_remain = fd->hints->ind_wr_buffer_size; } /* TODO: heuristic for when to not bother to use combine buffer? */ if (flat_buf->blocklens[i] >= combine_buf_remain) { /* special case: blocklen is as big as or bigger than the combine buf; * write directly */ #ifdef IO_DEBUG printf("[%d/%d] nc mem c file (1) writing loc = %Ld sz = %d\n", rank, nprocs, off, flat_buf->blocklens[i]); #endif ADIOI_Assert(flat_buf->blocklens[i] == (unsigned)flat_buf->blocklens[i]); ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]) == (ADIO_Offset)((MPIR_Upint)buf + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i])); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, ((char *) buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i], (unsigned)flat_buf->blocklens[i]); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif if (err == -1) err_flag = 1; off += flat_buf->blocklens[i]; /* keep up with the final file offset too */ } else { /* copy more data into combine buffer */ memcpy(combine_buf_ptr, ((char *) buf) + j*buftype_extent + flat_buf->indices[i], flat_buf->blocklens[i]); combine_buf_ptr += flat_buf->blocklens[i]; combine_buf_remain -= flat_buf->blocklens[i]; off += flat_buf->blocklens[i]; /* keep up with the final file offset too */ } } } if (combine_buf_ptr != combine_buf) { /* data left in buffer to write */ #ifdef IO_DEBUG printf("[%d/%d] nc mem c file (2) writing loc = %Ld sz = %Ld\n", rank, nprocs, off, fd->hints->ind_wr_buffer_size-combine_buf_remain); #endif #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, combine_buf, fd->hints->ind_wr_buffer_size - combine_buf_remain); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif if (err == -1) err_flag = 1; } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(combine_buf); if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } /* if (!buftype_is_contig && filetype_is_contig) ... */ else { /* noncontiguous in file */ /* split up into several contiguous writes */ /* find starting location in the file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { int i; n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + n_filetypes*(ADIO_Offset)filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = disp + flat_file->indices[i] + n_filetypes*(ADIO_Offset)filetype_extent + flat_file->blocklens[i] - offset; flag = 1; break; } } } } else { int i; n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + n_filetypes*(ADIO_Offset)filetype_extent + abs_off_in_filetype; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL); #endif #ifdef IO_DEBUG printf("[%d/%d] c mem nc file writing loc = %Ld sz = %d\n", rank, nprocs, off, fwr_size); #endif err_lseek = lseek(fd->fd_sys, off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL); #endif if (err_lseek == -1) err_flag = 1; #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); #endif err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); #endif if (err == -1) err_flag = 1; } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else { /* noncontiguous in memory as well as in file */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; k = num = buf_count = 0; indx = flat_buf->indices[0]; j = st_index; off = offset; bwr_size = flat_buf->blocklens[0]; while (num < bufsize) { size = ADIOI_MIN(fwr_size, bwr_size); if (size) { #ifdef IO_DEBUG printf("[%d/%d] nc mem nc file writing loc = %Ld sz = %d\n", rank, nprocs, off, size); #endif #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (err == -1) err_flag = 1; #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif ADIOI_Assert(size == (size_t) size); ADIOI_Assert(off == (off_t) off); err = write(fd->fd_sys, ((char *) buf) + indx, size); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif if (err == -1) err_flag = 1; } new_fwr_size = fwr_size; new_bwr_size = bwr_size; if (size == fwr_size) { /* reached end of contiguous block in file */ if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; new_fwr_size = flat_file->blocklens[j]; if (size != bwr_size) { indx += size; new_bwr_size -= size; } } if (size == bwr_size) { /* reached end of contiguous block in memory */ k = (k + 1)%flat_buf->count; buf_count++; indx = buftype_extent*(buf_count/flat_buf->count) + flat_buf->indices[k]; new_bwr_size = flat_buf->blocklens[k]; if (size != fwr_size) { off += size; new_fwr_size -= size; } } num += size; fwr_size = new_fwr_size; bwr_size = new_bwr_size; } } if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); /* This is a temporary way of filling in status. The right way is to keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ #endif if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); }
/*@ MPI_File_preallocate - Preallocates storage space for a file Input Parameters: . fh - file handle (handle) . size - size to preallocate (nonnegative integer) .N fortran @*/ int MPI_File_preallocate(MPI_File mpi_fh, MPI_Offset size) { ADIO_Fcntl_t *fcntl_struct; int error_code, mynod; ADIO_File fh; static char myname[] = "MPI_FILE_PREALLOCATE"; MPI_Offset tmp_sz; #ifdef MPI_hpux int fl_xmpi; HPMP_IO_START(fl_xmpi, BLKMPIFILEPREALLOCATE, TRDTBLOCK, fh, MPI_DATATYPE_NULL, -1); #endif /* MPI_hpux */ MPID_CS_ENTER(); MPIR_Nest_incr(); fh = MPIO_File_resolve(mpi_fh); /* --BEGIN ERROR HANDLING-- */ MPIO_CHECK_FILE_HANDLE(fh, myname, error_code); if (size < 0) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "**iobadsize", 0); error_code = MPIO_Err_return_file(fh, error_code); goto fn_exit; } tmp_sz = size; MPI_Bcast(&tmp_sz, 1, ADIO_OFFSET, 0, fh->comm); if (tmp_sz != size) { error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, "**notsame", 0); error_code = MPIO_Err_return_file(fh, error_code); goto fn_exit; } /* --END ERROR HANDLING-- */ if (size == 0) return MPI_SUCCESS; ADIOI_TEST_DEFERRED(fh, myname, &error_code); MPI_Comm_rank(fh->comm, &mynod); if (!mynod) { fcntl_struct = (ADIO_Fcntl_t *) ADIOI_Malloc(sizeof(ADIO_Fcntl_t)); fcntl_struct->diskspace = size; ADIO_Fcntl(fh, ADIO_FCNTL_SET_DISKSPACE, fcntl_struct, &error_code); ADIOI_Free(fcntl_struct); } MPI_Barrier(fh->comm); #ifdef MPI_hpux HPMP_IO_END(fl_xmpi, fh, MPI_DATATYPE_NULL, -1); #endif /* MPI_hpux */ fn_exit: MPIR_Nest_decr(); MPID_CS_EXIT(); /* TODO: bcast result? */ if (!mynod) return error_code; else return MPI_SUCCESS; }
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { /* if fd->info is null, create a new info object. Initialize fd->info to default values. Initialize fd->hints to default values. Examine the info object passed by the user. If it contains values that ROMIO understands, override the default. */ MPI_Info info; char *value; int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0; static char myname[] = "ADIOI_BGL_SETINFO"; int did_anything = 0; if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info)); info = fd->info; /* Note that fd->hints is allocated at file open time; thus it is * not necessary to allocate it, or check for allocation, here. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); AD_BGL_assert ((value != NULL)); /* initialize info and hints to default values if they haven't been * previously initialized */ if (!fd->hints->initialized) { did_anything = 1; /* buffer size for collective I/O */ ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT); fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT); /* default is to let romio automatically decide when to use * collective buffering */ ADIOI_Info_set(info, "romio_cb_read", "enable"); fd->hints->cb_read = ADIOI_HINT_ENABLE; ADIOI_Info_set(info, "romio_cb_write", "enable"); fd->hints->cb_write = ADIOI_HINT_ENABLE; if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list); fd->hints->cb_config_list = NULL; /* number of processes that perform I/O in collective I/O */ MPI_Comm_size(fd->comm, &nprocs); nprocs_is_valid = 1; ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs); ADIOI_Info_set(info, "cb_nodes", value); fd->hints->cb_nodes = -1; /* hint indicating that no indep. I/O will be performed on this file */ ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; /* bgl is not implementing file realms (ADIOI_IOStridedColl), initialize to disabled it. */ /* hint instructing the use of persistent file realms */ ADIOI_Info_set(info, "romio_cb_pfr", "disable"); fd->hints->cb_pfr = ADIOI_HINT_DISABLE; /* hint guiding the assignment of persistent file realms */ ADIOI_Info_set(info, "romio_cb_fr_types", "aar"); fd->hints->cb_fr_type = ADIOI_FR_AAR; /* hint to align file realms with a certain byte value */ ADIOI_Info_set(info, "romio_cb_fr_alignment", "1"); fd->hints->cb_fr_alignment = 1; /* hint to set a threshold percentage for a datatype's size/extent at * which data sieving should be done in collective I/O */ ADIOI_Info_set(info, "romio_cb_ds_threshold", "0"); fd->hints->cb_ds_threshold = 0; /* hint to switch between point-to-point or all-to-all for two-phase */ ADIOI_Info_set(info, "romio_cb_alltoall", "automatic"); fd->hints->cb_alltoall = ADIOI_HINT_AUTO; /* deferred_open derived from no_indep_rw and cb_{read,write} */ fd->hints->deferred_open = 0; /* buffer size for data sieving in independent reads */ ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); /* buffer size for data sieving in independent writes */ ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); if(fd->file_system == ADIO_UFS) { /* default for ufs/pvfs is to disable data sieving */ ADIOI_Info_set(info, "romio_ds_read", "disable"); fd->hints->ds_read = ADIOI_HINT_DISABLE; ADIOI_Info_set(info, "romio_ds_write", "disable"); fd->hints->ds_write = ADIOI_HINT_DISABLE; } else { /* default is to let romio automatically decide when to use data * sieving */ ADIOI_Info_set(info, "romio_ds_read", "automatic"); fd->hints->ds_read = ADIOI_HINT_AUTO; ADIOI_Info_set(info, "romio_ds_write", "automatic"); fd->hints->ds_write = ADIOI_HINT_AUTO; } /* still to do: tune this a bit for a variety of file systems. there's * no good default value so just leave it unset */ fd->hints->min_fdomain_size = 0; fd->hints->striping_unit = 0; fd->hints->initialized = 1; } /* add in user's info if supplied */ if (users_info != MPI_INFO_NULL) { ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "cb_buffer_size", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "cb_buffer_size", value); fd->hints->cb_buffer_size = intval; } #if 0 /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */ /* aligning file realms to certain sizes (e.g. stripe sizes) * may benefit I/O performance */ ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_alignment", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_fr_alignment", value); fd->hints->cb_fr_alignment = intval; } /* for collective I/O, try to be smarter about when to do data sieving * using a specific threshold for the datatype size/extent * (percentage 0-100%) */ ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_ds_threshold", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_ds_threshold", value); fd->hints->cb_ds_threshold = intval; } ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_alltoall; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_alltoall) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_alltoall", error_code); return; } /* --END ERROR HANDLING-- */ } #endif /* new hints for enabling/disabling coll. buffering on * reads/writes */ ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_read overrides no_indep_rw */ ADIOI_Info_set(info, "romio_cb_read", value); ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_read = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_read; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_read) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_read", error_code); return; } /* --END ERROR HANDLING-- */ } ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_write overrides no_indep_rw, too */ ADIOI_Info_set(info, "romio_cb_write", value); ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_write = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_write; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_write) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_write", error_code); return; } /* --END ERROR HANDLING-- */ } #if 0 /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */ /* enable/disable persistent file realms for collective I/O */ /* may want to check for no_indep_rdwr hint as well */ ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_pfr; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_pfr) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_pfr", error_code); return; } /* --END ERROR HANDLING-- */ } /* file realm assignment types ADIOI_FR_AAR(0), ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify a regular fr size in bytes. probably not the best way... */ ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) >= -2)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_type", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_fr_type", value); fd->hints->cb_fr_type = intval; } #endif /* new hint for specifying no indep. read/write will be performed */ ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { /* if 'no_indep_rw' set, also hint that we will do * collective buffering: if we aren't doing independent io, * then we have to do collective */ ADIOI_Info_set(info, "romio_no_indep_rw", value); ADIOI_Info_set(info, "romio_cb_write", "enable"); ADIOI_Info_set(info, "romio_cb_read", "enable"); fd->hints->no_indep_rw = 1; fd->hints->cb_read = 1; fd->hints->cb_write = 1; tmp_val = 1; } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { ADIOI_Info_set(info, "romio_no_indep_rw", value); fd->hints->no_indep_rw = 0; tmp_val = 0; } else { /* default is above */ tmp_val = 0; } MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->no_indep_rw) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_no_indep_rw", error_code); return; } /* --END ERROR HANDLING-- */ } /* new hints for enabling/disabling data sieving on * reads/writes */ ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_AUTO; } /* otherwise ignore */ } ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_AUTO; } /* otherwise ignore */ } ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { ADIOI_Info_set(info, "ind_wr_buffer_size", value); fd->hints->ind_wr_buffer_size = intval; } ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { ADIOI_Info_set(info, "ind_rd_buffer_size", value); fd->hints->ind_rd_buffer_size = intval; } memset( value, 0, MPI_MAX_INFO_VAL+1 ); ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag); if ( flag && ((intval = atoi(value)) > 0) ) { ADIOI_Info_set(info, "romio_min_fdomain_size", value); fd->hints->min_fdomain_size = intval; } /* Now we use striping unit in common code so we should process hints for it. */ ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); if ( flag && ((intval = atoi(value)) > 0) ) { ADIOI_Info_set(info, "striping_unit", value); fd->hints->striping_unit = intval; } memset( value, 0, MPI_MAX_INFO_VAL+1 ); ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { did_anything = 1; ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value); fd->hints->cb_nodes = intval; } } /* associate CB aggregators to certain CNs in every involved PSET */ if (did_anything) { ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes); } /* ignore defered open hints and do not enable it for bluegene: need all * processors in the open path so we can stat-and-broadcast the blocksize */ ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; fd->hints->deferred_open = 0; /* BobC commented this out, but since hint processing runs on both bgl and * bglockless, we need to keep DS writes enabled on gpfs and disabled on * PVFS */ if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) { /* disable data sieving for fs that do not support file locking */ ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag) { /* get rid of this value if it is set */ ADIOI_Info_delete(info, "ind_wr_buffer_size"); } /* note: leave ind_wr_buffer_size alone; used for other cases * as well. -- Rob Ross, 04/22/2003 */ ADIOI_Info_set(info, "romio_ds_write", "disable"); fd->hints->ds_write = ADIOI_HINT_DISABLE; } ADIOI_Free(value); *error_code = MPI_SUCCESS; }
/* Nonblocking version of ADIOI_GEN_ReadStridedColl() */ void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, MPI_Request *request, int *error_code) { /* Uses a generalized version of the extended two-phase method described in "An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, (5)4:301--317, Winter 1996. http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_NBC_Request *nbc_req = NULL; ADIOI_GEN_IreadStridedColl_vars *vars = NULL; int nprocs, myrank; #ifdef RDCOLL_DEBUG int i; #endif /* FIXME: need an implementation of ADIOI_IOIstridedColl if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { ADIOI_IOIstridedColl(fd, buf, count, ADIOI_READ, datatype, file_ptr_type, offset, request, error_code); return; } */ /* top-level struct keeping the status of function progress */ nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request)); nbc_req->rdwr = ADIOI_READ; /* create a generalized request */ if (ADIOI_GEN_greq_class == 0) { MPIX_Grequest_class_create(ADIOI_GEN_irc_query_fn, ADIOI_GEN_irc_free_fn, MPIU_Greq_cancel_fn, ADIOI_GEN_irc_poll_fn, ADIOI_GEN_irc_wait_fn, &ADIOI_GEN_greq_class); } MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request); memcpy(&nbc_req->req, request, sizeof(MPI_Request)); /* create a struct for parameters and variables */ vars = (ADIOI_GEN_IreadStridedColl_vars *)ADIOI_Calloc( 1, sizeof(ADIOI_GEN_IreadStridedColl_vars)); nbc_req->data.rd.rsc_vars = vars; /* save the parameters */ vars->fd = fd; vars->buf = buf; vars->count = count; vars->datatype = datatype; vars->file_ptr_type = file_ptr_type; vars->offset = offset; MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); vars->nprocs = nprocs; vars->myrank = myrank; /* number of aggregators, cb_nodes, is stored in the hints */ vars->nprocs_for_coll = fd->hints->cb_nodes; vars->orig_fp = fd->fp_ind; /* only check for interleaving if cb_read isn't disabled */ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &vars->offset_list, &vars->len_list, &vars->start_offset, &vars->end_offset, &vars->contig_access_count); #ifdef RDCOLL_DEBUG for (i = 0; i < vars->contig_access_count; i++) { DBG_FPRINTF(stderr, "rank %d off %lld len %lld\n", myrank, vars->offset_list[i], vars->len_list[i]); } #endif /* each process communicates its start and end offsets to other processes. The result is an array each of start and end offsets stored in order of process rank. */ vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); *error_code = MPI_Iallgather(&vars->start_offset, 1, ADIO_OFFSET, vars->st_offsets, 1, ADIO_OFFSET, fd->comm, &vars->req_offset[0]); if (*error_code != MPI_SUCCESS) return; *error_code = MPI_Iallgather(&vars->end_offset, 1, ADIO_OFFSET, vars->end_offsets, 1, ADIO_OFFSET, fd->comm, &vars->req_offset[1]); nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL; return; } ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code); }
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, sum, size_in_filetype; int i, j, k, err=-1, st_index=0; int n_etypes_in_filetype; ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf, *value; unsigned bufsize, writebuf_len, max_bufsize, write_sz; int err_flag=0, info_flag; ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len; static char myname[] = "ADIOI_BGL_WRITESTRIDED"; if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); for (j=0; j<count; j++) { int i; for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } } /* write the buffer out finally */ lseek(fd->fd_sys, writebuf_off, SEEK_SET); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); err = write(fd->fd_sys, writebuf, writebuf_len); if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_Free(writebuf); /* malloced in the buffered_write macro */ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { int i; n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); lseek(fd->fd_sys, writebuf_off, SEEK_SET); err = read(fd->fd_sys, writebuf, writebuf_len); if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0); return; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *nbc_req, int *error_code) { ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars; ADIO_File fd = vars->fd; MPI_Datatype datatype = vars->datatype; int nprocs = vars->nprocs; ADIOI_Access *others_req = vars->others_req; /* Read in sizes of no more than coll_bufsize, an info parameter. Send data to appropriate processes. Place recd. data in user buf. The idea is to reduce the amount of extra memory required for collective I/O. If all data were read all at once, which is much easier, it would require temp space more than the size of user_buf, which is often unacceptable. For example, to read a distributed array from a file, where each local array is 8Mbytes, requiring at least another 8Mbytes of temp space is unacceptable. */ int i, j; ADIO_Offset st_loc = -1, end_loc = -1; ADIOI_Flatlist_node *flat_buf = NULL; int coll_bufsize; *error_code = MPI_SUCCESS; /* changed below if error */ /* only I/O errors are currently reported */ /* calculate the number of reads of size coll_bufsize to be done by each process and the max among all processes. That gives the no. of communication phases as well. coll_bufsize is obtained from the hints object. */ coll_bufsize = fd->hints->cb_buffer_size; vars->coll_bufsize = coll_bufsize; /* grab some initial values for st_loc and end_loc */ for (i = 0; i < nprocs; i++) { if (others_req[i].count) { st_loc = others_req[i].offsets[0]; end_loc = others_req[i].offsets[0]; break; } } /* now find the real values */ for (i = 0; i < nprocs; i++) for (j = 0; j < others_req[i].count; j++) { st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]); end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); } vars->st_loc = st_loc; vars->end_loc = end_loc; /* calculate ntimes, the number of times this process must perform I/O * operations in order to complete all the requests it has received. * the need for multiple I/O operations comes from the restriction that * we only use coll_bufsize bytes of memory for internal buffering. */ if ((st_loc == -1) && (end_loc == -1)) { /* this process does no I/O. */ vars->ntimes = 0; } else { /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/ vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize); } *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm, &vars->req1); vars->read_buf = fd->io_buf; /* Allocated at open time */ vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int)); /* its use is explained below. calloc initializes to 0. */ vars->count = (int *)ADIOI_Malloc(nprocs * sizeof(int)); /* to store count of how many off-len pairs per proc are satisfied in an iteration. */ vars->partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int)); /* if only a portion of the last off-len pair is sent to a process in a particular iteration, the length sent is stored here. calloc initializes to 0. */ vars->send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); /* total size of data to be sent to each proc. in an iteration */ vars->recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); /* total size of data to be recd. from each proc. in an iteration. Of size nprocs so that I can use MPI_Alltoall later. */ vars->recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int)); /* amount of data recd. so far from each proc. Used in ADIOI_Fill_user_buffer. initialized to 0 here. */ vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int)); /* used to store the starting value of curr_offlen_ptr[i] in this iteration */ ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig); if (!vars->buftype_is_contig) { ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; vars->flat_buf = flat_buf; } MPI_Type_extent(datatype, &vars->buftype_extent); vars->done = 0; vars->off = st_loc; vars->for_curr_iter = vars->for_next_iter = 0; /* set the state to wait until MPI_Ialltoall finishes. */ nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH; }
/* * Compute a dynamic access range based file domain partition among I/O aggregators, * which align to the GPFS block size * Divide the I/O workload among "nprocs_for_coll" processes. This is * done by (logically) dividing the file into file domains (FDs); each * process may directly access only its own file domain. * Additional effort is to make sure that each I/O aggregator get * a file domain that aligns to the GPFS block size. So, there will * not be any false sharing of GPFS file blocks among multiple I/O nodes. * * The common version of this now accepts a min_fd_size and striping_unit. * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind * (e.g. we could pass striping unit instead of using fs_ptr->blksize). */ void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset *end_offsets, int nprocs, int nprocs_for_coll, ADIO_Offset *min_st_offset_ptr, ADIO_Offset **fd_start_ptr, ADIO_Offset **fd_end_ptr, ADIO_Offset *fd_size_ptr, void *fs_ptr) { ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size; int i, aggr; #ifdef AGGREGATION_PROFILE MPE_Log_event (5004, 0, NULL); #endif # if AGG_DEBUG static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains"; DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", myname,__LINE__,nprocs_for_coll); # endif __blksize_t blksize = 1048576; /* default to 1M */ if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */ blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize; # if AGG_DEBUG DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize); # endif /* find min of start offsets and max of end offsets of all processes */ min_st_offset = st_offsets [0]; max_end_offset = end_offsets[0]; for (i=1; i<nprocs; i++) { min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]); max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]); } // DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset ); /* determine the "file domain (FD)" of each process, i.e., the portion of the file that will be "owned" by each process */ ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1; ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize; ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset; ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize; ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1; int naggs = nprocs_for_coll; /* Tweak the file domains so that no fd is smaller than a threshold. We * have to strike a balance between efficency and parallelism: somewhere * between 10k processes sending 32-byte requests and one process sending a * 320k request is a (system-dependent) sweet spot This is from the common code - the new min_fd_size parm that we didn't implement. (And common code uses a different declaration of fd_size so beware) */ /* this is not entirely sufficient on BlueGene: we must be mindful of * imbalance over psets. the hint processing code has already picked, say, * 8 processors per pset, so if we go increasing fd_size we'll end up with * some psets with 8 processors and some psets with none. */ /* if (fd_size < min_fd_size) fd_size = min_fd_size; */ fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); *fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); fd_start = *fd_start_ptr; fd_end = *fd_end_ptr; ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize; ADIO_Offset nb_cn_small = n_gpfs_blk/naggs; ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs); ADIO_Offset naggs_small = naggs - naggs_large; /* nb_cn_small * blksize: evenly split file domain among processors: * equivalent to fd_gpfs_rnage/naggs * (nb_cn_small+1) * blksize: keeps file domain at least 'blksize' big */ for (i=0; i<naggs; i++) if (i < naggs_small) fd_size[i] = nb_cn_small * blksize; else fd_size[i] = (nb_cn_small+1) * blksize; /*potential optimization: if n_gpfs_blk smalller than * naggs, slip in some zero-sized file * domains to spread the work across all psets. */ # if AGG_DEBUG DBG_FPRINTF(stderr,"%s(%d): " "gpfs_ub %llu, " "gpfs_lb %llu, " "gpfs_ub_rdoff %llu, " "gpfs_lb_rdoff %llu, " "fd_gpfs_range %llu, " "n_gpfs_blk %llu, " "nb_cn_small %llu, " "naggs_large %llu, " "naggs_small %llu, " "\n", myname,__LINE__, gpfs_ub , gpfs_lb , gpfs_ub_rdoff, gpfs_lb_rdoff, fd_gpfs_range, n_gpfs_blk , nb_cn_small , naggs_large , naggs_small ); # endif fd_size[0] -= gpfs_lb_rdoff; fd_size[naggs-1] -= gpfs_ub_rdoff; /* compute the file domain for each aggr */ ADIO_Offset offset = min_st_offset; for (aggr=0; aggr<naggs; aggr++) { fd_start[aggr] = offset; fd_end [aggr] = offset + fd_size[aggr] - 1; offset += fd_size[aggr]; } *fd_size_ptr = fd_size[0]; *min_st_offset_ptr = min_st_offset; #ifdef AGGREGATION_PROFILE MPE_Log_event (5005, 0, NULL); #endif ADIOI_Free (fd_size); }
static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *nbc_req, int *error_code) { ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars; ADIO_File fd = vars->fd; int *send_size = vars->send_size; int *recv_size = vars->recv_size; int *count = vars->count; int *start_pos = vars->start_pos; int *partial_send = vars->partial_send; int nprocs = vars->nprocs; int myrank = vars->myrank; ADIOI_Access *others_req = vars->others_req; int iter = vars->iter; int *buf_idx = vars->buf_idx; int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send; char **recv_buf = NULL; MPI_Datatype send_type; nprocs_recv = 0; for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++; vars->nprocs_recv = nprocs_recv; nprocs_send = 0; for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++; vars->nprocs_send = nprocs_send; vars->req2 = (MPI_Request *) ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); /* +1 to avoid a 0-size malloc */ /* post recvs. if buftype_is_contig, data can be directly recd. into user buf at location given by buf_idx. else use recv_buf. */ #ifdef AGGREGATION_PROFILE MPE_Log_event (5032, 0, NULL); #endif if (vars->buftype_is_contig) { j = 0; for (i = 0; i < nprocs; i++) if (recv_size[i]) { MPI_Irecv(((char *)vars->buf) + buf_idx[i], recv_size[i], MPI_BYTE, i, myrank+i+100*iter, fd->comm, vars->req2 + j); j++; buf_idx[i] += recv_size[i]; } } else { /* allocate memory for recv_buf and post receives */ recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*)); vars->recv_buf = recv_buf; for (i = 0; i < nprocs; i++) if (recv_size[i]) recv_buf[i] = (char *)ADIOI_Malloc(recv_size[i]); j = 0; for (i = 0; i < nprocs; i++) if (recv_size[i]) { MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, myrank+i+100*iter, fd->comm, vars->req2 + j); j++; #ifdef RDCOLL_DEBUG DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n", myrank, recv_size[i], myrank+i+100*iter); #endif } } /* create derived datatypes and send data */ j = 0; for (i = 0; i < nprocs; i++) { if (send_size[i]) { /* take care if the last off-len pair is a partial send */ if (partial_send[i]) { k = start_pos[i] + count[i] - 1; tmp = others_req[i].lens[k]; others_req[i].lens[k] = partial_send[i]; } ADIOI_Type_create_hindexed_x(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, &send_type); /* absolute displacement; use MPI_BOTTOM in send */ MPI_Type_commit(&send_type); MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter, fd->comm, vars->req2 + nprocs_recv + j); MPI_Type_free(&send_type); if (partial_send[i]) others_req[i].lens[k] = tmp; j++; } } /* wait on the receives */ if (nprocs_recv) { nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV; return; } ADIOI_R_Iexchange_data_fill(nbc_req, error_code); }
/* * ADIOI_Calc_others_req (copied to bgl and switched to all to all for performance) * * param[in] count_my_req_procs Number of processes whose file domain my * request touches. * param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of * contig. requests of this process in * process i's file domain. * param[in] my_req A structure defining my request * param[in] nprocs Number of nodes in the block * param[in] myrank Rank of this node * param[out] count_others_req_proc_ptr Number of processes whose requests lie in * my process's file domain (including my * process itself) * param[out] others_req_ptr Array of other process' requests that lie * in my process's file domain */ void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs, int *count_my_req_per_proc, ADIOI_Access *my_req, int nprocs, int myrank, int *count_others_req_procs_ptr, ADIOI_Access **others_req_ptr) { /* determine what requests of other processes lie in this process's file domain */ /* count_others_req_procs = number of processes whose requests lie in this process's file domain (including this process itself) count_others_req_per_proc[i] indicates how many separate contiguous requests of proc. i lie in this process's file domain. */ int *count_others_req_per_proc, count_others_req_procs; int i; ADIOI_Access *others_req; /* Parameters for MPI_Alltoallv */ int *scounts, *sdispls, *rcounts, *rdispls; /* Parameters for MPI_Alltoallv. These are the buffers, which * are later computed to be the lowest address of all buffers * to be sent/received for offsets and lengths. Initialize to * the highest possible address which is the current minimum. */ void *sendBufForOffsets=(void*)0xFFFFFFFF, *sendBufForLens =(void*)0xFFFFFFFF, *recvBufForOffsets=(void*)0xFFFFFFFF, *recvBufForLens =(void*)0xFFFFFFFF; /* first find out how much to send/recv and from/to whom */ #ifdef AGGREGATION_PROFILE MPE_Log_event (5026, 0, NULL); #endif /* Send 1 int to each process. count_my_req_per_proc[i] is the number of * requests that my process will do to the file domain owned by process[i]. * Receive 1 int from each process. count_others_req_per_proc[i] is the number of * requests that process[i] will do to the file domain owned by my process. */ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int)); /* cora2a1=timebase(); */ MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT, count_others_req_per_proc, 1, MPI_INT, fd->comm); /* total_cora2a+=timebase()-cora2a1; */ /* Allocate storage for an array of other nodes' accesses of our * node's file domain. Also allocate storage for the alltoallv * parameters. */ *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); others_req = *others_req_ptr; scounts = ADIOI_Malloc(nprocs*sizeof(int)); sdispls = ADIOI_Malloc(nprocs*sizeof(int)); rcounts = ADIOI_Malloc(nprocs*sizeof(int)); rdispls = ADIOI_Malloc(nprocs*sizeof(int)); /* If process[i] has any requests in my file domain, * initialize an ADIOI_Access structure that will describe each request * from process[i]. The offsets, lengths, and buffer pointers still need * to be obtained to complete the setting of this structure. */ count_others_req_procs = 0; for (i=0; i<nprocs; i++) { if (count_others_req_per_proc[i]) { others_req[i].count = count_others_req_per_proc[i]; others_req[i].offsets = (ADIO_Offset *) ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset)); others_req[i].lens = (int *) ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int)); if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets ) recvBufForOffsets = others_req[i].offsets; if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens ) recvBufForLens = others_req[i].lens; others_req[i].mem_ptrs = (MPI_Aint *) ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint)); count_others_req_procs++; } else { others_req[i].count = 0; others_req[i].offsets = NULL; others_req[i].lens = NULL; } } /* If no recv buffer was allocated in the loop above, make it NULL */ if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL; if ( recvBufForLens == (void*)0xFFFFFFFF) recvBufForLens = NULL; /* Now send the calculated offsets and lengths to respective processes */ /************************/ /* Exchange the offsets */ /************************/ /* Determine the lowest sendBufForOffsets/Lens */ for (i=0; i<nprocs; i++) { if ( (my_req[i].count) && ((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) ) sendBufForOffsets = my_req[i].offsets; if ( (my_req[i].count) && ((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) ) sendBufForLens = my_req[i].lens; } /* If no send buffer was found in the loop above, make it NULL */ if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL; if ( sendBufForLens == (void*)0xFFFFFFFF) sendBufForLens = NULL; /* Calculate the displacements from the sendBufForOffsets/Lens */ for (i=0; i<nprocs; i++) { // Send these offsets to process i. scounts[i] = count_my_req_per_proc[i]; if ( scounts[i] == 0 ) sdispls[i] = 0; else sdispls[i] = (int) ( ( (MPIR_Upint)my_req[i].offsets - (MPIR_Upint)sendBufForOffsets ) / (MPIR_Upint)sizeof(ADIO_Offset) ); // Receive these offsets from process i. rcounts[i] = count_others_req_per_proc[i]; if ( rcounts[i] == 0 ) rdispls[i] = 0; else rdispls[i] = (int) ( ( (MPIR_Upint)others_req[i].offsets - (MPIR_Upint)recvBufForOffsets ) / (MPIR_Upint)sizeof(ADIO_Offset) ); } /* Exchange the offsets */ MPI_Alltoallv(sendBufForOffsets, scounts, sdispls, ADIO_OFFSET, recvBufForOffsets, rcounts, rdispls, ADIO_OFFSET, fd->comm); /************************/ /* Exchange the lengths */ /************************/ for (i=0; i<nprocs; i++) { // Send these lengths to process i. scounts[i] = count_my_req_per_proc[i]; if ( scounts[i] == 0 ) sdispls[i] = 0; else sdispls[i] = (int) ( ( (MPIR_Upint)my_req[i].lens - (MPIR_Upint)sendBufForLens ) / (MPIR_Upint) sizeof(int) ); // Receive these offsets from process i. rcounts[i] = count_others_req_per_proc[i]; if ( rcounts[i] == 0 ) rdispls[i] = 0; else rdispls[i] = (int) ( ( (MPIR_Upint)others_req[i].lens - (MPIR_Upint)recvBufForLens ) / (MPIR_Upint) sizeof(int) ); } /* Exchange the lengths */ MPI_Alltoallv(sendBufForLens, scounts, sdispls, MPI_INT, recvBufForLens, rcounts, rdispls, MPI_INT, fd->comm); /* Clean up */ ADIOI_Free(count_others_req_per_proc); ADIOI_Free (scounts); ADIOI_Free (sdispls); ADIOI_Free (rcounts); ADIOI_Free (rdispls); *count_others_req_procs_ptr = count_others_req_procs; #ifdef AGGREGATION_PROFILE MPE_Log_event (5027, 0, NULL); #endif }
/* ADIOI_Flatten() * * Assumption: input datatype is not a basic!!!! */ void ADIOI_Flatten(MPI_Datatype datatype, ADIOI_Flatlist_node *flat, ADIO_Offset st_offset, MPI_Count *curr_index) { int i, k, m, n, basic_num, nonzeroth, is_hindexed_block=0; int combiner, old_combiner, old_is_contig; int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes; /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset top_count; MPI_Count j, old_size, prev_index, num; MPI_Aint old_extent;/* Assume extents are non-negative */ int *ints; MPI_Aint *adds; /* Make no assumptions about +/- sign on these */ MPI_Datatype *types; MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner); ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int)); adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint)); types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype)); MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index); DBG_FPRINTF(stderr,"ADIOI_Flatten:: nints %#X, nadds %#X, ntypes %#X\n",nints, nadds, ntypes); for(i=0; i< nints; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: ints[%d]=%#X\n",i,ints[i]); } for(i=0; i< nadds; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: adds[%d]="MPI_AINT_FMT_HEX_SPEC"\n",i,adds[i]); } for(i=0; i< ntypes; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: types[%d]=%#llX\n",i,(unsigned long long)(unsigned long)types[i]); } #endif /* Chapter 4, page 83: when processing datatypes, note this item from the * standard: Most datatype constructors have replication count or block length arguments. Allowed values are non-negative integers. If the value is zero, no elements are generated in the type map and there is no effect on datatype bounds or extent. */ switch (combiner) { #ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP case MPI_COMBINER_DUP: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DUP\n"); #endif MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY case MPI_COMBINER_SUBARRAY: { int dims = ints[0]; MPI_Datatype stype; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_SUBARRAY\n"); #endif ADIO_Type_create_subarray(dims, &ints[1], /* sizes */ &ints[dims+1], /* subsizes */ &ints[2*dims+1], /* starts */ ints[3*dims+1], /* order */ types[0], /* type */ &stype); ADIOI_Flatten(stype, flat, st_offset, curr_index); MPI_Type_free(&stype); } break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY case MPI_COMBINER_DARRAY: { int dims = ints[2]; MPI_Datatype dtype; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY\n"); #endif ADIO_Type_create_darray(ints[0], /* size */ ints[1], /* rank */ dims, &ints[3], /* gsizes */ &ints[dims+3], /* distribs */ &ints[2*dims+3], /* dargs */ &ints[3*dims+3], /* psizes */ ints[4*dims+3], /* order */ types[0], &dtype); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY <ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n", 0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index); #endif ADIOI_Flatten(dtype, flat, st_offset, curr_index); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY >ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n", 0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index); #endif MPI_Type_free(&dtype); } break; #endif case MPI_COMBINER_CONTIGUOUS: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_CONTIGUOUS\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, made up of basic or contiguous types */ j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = top_count * old_size; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } else { /* made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated count times */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<top_count; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: derived flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]); #endif j++; } } *curr_index = j; } break; case MPI_COMBINER_VECTOR: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_VECTOR\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, vector of basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1], stride = ints[2]; j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = blocklength * old_size; for (i=j+1; i<j+top_count; i++) { flat->indices[i] = flat->indices[i-1] + stride * old_size; flat->blocklens[i] = flat->blocklens[j]; } *curr_index = i; } else { /* vector of noncontiguous derived types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1], stride = ints[2]; j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklen times and then strided. Replicate the first one. */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<blocklength; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. */ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { flat->indices[j] = flat->indices[j-num] + stride * ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_HVECTOR: case MPI_COMBINER_HVECTOR_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HVECTOR_INTEGER\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, vector of basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = blocklength * old_size; for (i=j+1; i<j+top_count; i++) { flat->indices[i] = flat->indices[i-1] + adds[0]; flat->blocklens[i] = flat->blocklens[j]; } *curr_index = i; } else { /* vector of noncontiguous derived types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklen times and then strided. Replicate the first one. */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<blocklength; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. */ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { flat->indices[j] = flat->indices[j-num] + adds[0]; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_INDEXED: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); MPI_Type_extent(types[0], &old_extent); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[top_count+1]; ADIOI_Flatten(types[0], flat, st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index); } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; for (i=j, nonzeroth=i; i<j+top_count; i++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1+i-j], stride = ints[top_count+1+i-j]; if (blocklength > 0) { flat->indices[nonzeroth] = st_offset + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent; nonzeroth++; } else { flat->count--; /* don't count/consider any zero-length blocklens */ } } *curr_index = i; } else { /* indexed type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; basic_num = num; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. */ for (m=1; m<ints[1]; m++) { for (i=0, nonzeroth = j; i<num; i++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[nonzeroth-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[nonzeroth-num]; j++; nonzeroth++; } else { flat->count --; } } } *curr_index = j; /* Now repeat with strides. */ for (i=1; i<top_count; i++) { num = *curr_index - prev_index; prev_index = *curr_index; for (m=0, nonzeroth=j; m<basic_num; m++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[top_count+1+i]-ints[top_count+i]; if (flat->blocklens[j-num] > 0 ) { flat->indices[nonzeroth] = flat->indices[j-num] + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } *curr_index = j; for (m=1; m<ints[1+i]; m++) { for (k=0, nonzeroth=j; k<basic_num; k++) { if (flat->blocklens[j-basic_num] > 0) { flat->indices[nonzeroth] = flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num]; j++; nonzeroth++; } else { flat->count --; } } } *curr_index = j; } } break; #if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK case MPI_COMBINER_HINDEXED_BLOCK: is_hindexed_block=1; /* deliberate fall-through */ #endif case MPI_COMBINER_INDEXED_BLOCK: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED_BLOCK\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); MPI_Type_extent(types[0], &old_extent); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[1+1]; if (is_hindexed_block) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } else { ADIOI_Flatten(types[0], flat, st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index); } } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; for (i=j; i<j+top_count; i++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; if (is_hindexed_block) { flat->indices[i] = st_offset + adds[i-j]; } else { ADIO_Offset stride = ints[1+1+i-j]; flat->indices[i] = st_offset + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; } flat->blocklens[i] = blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent; } *curr_index = i; } else { /* vector of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. */ for (m=1; m<ints[1]; m++) { for (i=0; i<num; i++) { if (is_hindexed_block) { /* this is the one place the hindexed case uses the * extent of a type */ MPI_Type_extent(types[0], &old_extent); } flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. */ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { if (is_hindexed_block) { flat->indices[j] = flat->indices[j-num] + adds[i] - adds[i-1]; } else { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[2+i]-ints[1+i]; flat->indices[j] = flat->indices[j-num] + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; } flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_HINDEXED: case MPI_COMBINER_HINDEXED_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HINDEXED_INTEGER\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; MPI_Type_size_x(types[0], &old_size); for (i=j, nonzeroth=j; i<j+top_count; i++) { if (ints[1+i-j] > 0) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1+i-j]; flat->indices[nonzeroth] = st_offset + adds[i-j]; flat->blocklens[nonzeroth] = blocklength*old_size; nonzeroth++; } else { flat->count--; } } *curr_index = i; } else { /* indexed type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; basic_num = num; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<ints[1]; m++) { for (i=0, nonzeroth=j; i<num; i++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } } *curr_index = j; /* Now repeat with strides. */ for (i=1; i<top_count; i++) { num = *curr_index - prev_index; prev_index = *curr_index; for (m=0, nonzeroth=j; m<basic_num; m++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[j-num] + adds[i] - adds[i-1]; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } *curr_index = j; for (m=1; m<ints[1+i]; m++) { for (k=0,nonzeroth=j; k<basic_num; k++) { if (flat->blocklens[j-basic_num] >0) { flat->indices[nonzeroth] = flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num]; j++; nonzeroth++; } } } *curr_index = j; } } break; case MPI_COMBINER_STRUCT: case MPI_COMBINER_STRUCT_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_STRUCT_INTEGER\n"); #endif top_count = ints[0]; for (n=0; n<top_count; n++) { MPI_Type_get_envelope(types[n], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[n], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[n], flat, st_offset+adds[n], curr_index); if (prev_index == *curr_index) { /* simplest case, current type is basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ if (ints[1+n] > 0 || types[n] == MPI_LB || types[n] == MPI_UB) { ADIO_Offset blocklength = ints[1+n]; j = *curr_index; flat->indices[j] = st_offset + adds[n]; MPI_Type_size_x(types[n], &old_size); flat->blocklens[j] = blocklength * old_size; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",n,adds[n],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } } else { /* current type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The current type has to be replicated blocklens[n] times */ MPI_Type_extent(types[n], &old_extent); for (m=1; m<ints[1+n]; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple old_extent "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",old_extent,j, flat->indices[j], j, flat->blocklens[j]); #endif j++; } } *curr_index = j; } } break; case MPI_COMBINER_RESIZED: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_RESIZED\n"); #endif /* This is done similar to a type_struct with an lb, datatype, ub */ /* handle the Lb */ j = *curr_index; flat->indices[j] = st_offset + adds[0]; /* this zero-length blocklens[] element, unlike eleswhere in the * flattening code, is correct and is used to indicate a lower bound * marker */ flat->blocklens[j] = 0; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; /* handle the datatype */ MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } else { /* current type is basic or contiguous */ j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = old_size; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } /* take care of the extent as a UB */ j = *curr_index; flat->indices[j] = st_offset + adds[0] + adds[1]; /* again, zero-element ok: an upper-bound marker explicitly set by the * constructor of this resized type */ flat->blocklens[j] = 0; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",1,adds[1],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; break; default: /* TODO: FIXME (requires changing prototypes to return errors...) */ DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Flatten\n"); MPI_Abort(MPI_COMM_WORLD, 1); } #ifndef MPISGI /* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't return new datatypes. Therefore no need to free. */ for (i=0; i<ntypes; i++) { MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes, &old_combiner); if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i); } #endif ADIOI_Free(ints); ADIOI_Free(adds); ADIOI_Free(types); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: return st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index); #endif }