/*
 * ADIOI_GEN_OpenColl - generic collective open.
 *
 *   fd          - file being opened.  Reads fd->hints, fd->comm, fd->fns and
 *                 fd->is_agg; writes fd->access_mode, fd->blksize, fd->info
 *                 (non-aggregators under deferred open) and fd->is_open.
 *   rank        - caller's rank; compared against fd->hints->ranklist[0].
 *   access_mode - ADIO_* access-mode bits requested by the caller.
 *   error_code  - out: MPI_SUCCESS, or whatever code the file-system specific
 *                 open/close left behind.
 *
 * Every process of fd->comm must call this routine: it contains broadcasts
 * rooted at fd->hints->ranklist[0].
 */
void ADIOI_GEN_OpenColl(ADIO_File fd, int rank, int access_mode, int *error_code)
{
    /* the mode exactly as requested (CREATE/EXCL still present); this is the
     * value fd->access_mode must hold when we leave */
    const int requested_amode = access_mode;
    int amode_before_rdwr;
    MPI_Comm saved_comm;
    MPI_Datatype stats_dt;

    /* Step 1: creation.  One process (ranklist[0]) creates the file by itself
     * and tells everybody how that went. */
    if (access_mode & ADIO_CREATE) {
        if (rank != fd->hints->ranklist[0]) {
            MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
        } else {
            /* delete_on_close must not take effect for this temporary open */
            if (access_mode & ADIO_DELETE_ON_CLOSE)
                fd->access_mode = access_mode ^ ADIO_DELETE_ON_CLOSE;
            else
                fd->access_mode = access_mode;

            /* open over MPI_COMM_SELF, then put the real communicator back */
            saved_comm = fd->comm;
            fd->comm = MPI_COMM_SELF;
            (*(fd->fns->ADIOI_xxx_Open)) (fd, error_code);
            fd->comm = saved_comm;

            MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);

            /* creation worked: close again, the real open happens below.
             * NOTE(review): the close result is written to *error_code after
             * the broadcast, so only this rank would see a close failure. */
            if (*error_code == MPI_SUCCESS)
                (*(fd->fns->ADIOI_xxx_Close)) (fd, error_code);

            fd->access_mode = access_mode;      /* undo the temporary mode */
        }

        if (*error_code != MPI_SUCCESS)
            return;

        /* the file exists now: drop CREAT (and EXCL, if present) for the
         * multi-process open */
        access_mode ^= ADIO_CREATE;
        if (access_mode & ADIO_EXCL)
            access_mode ^= ADIO_EXCL;
    }

    /* large default that should suit most file systems; a ROMIO driver is
     * free to stat the file and pick a better value */
    fd->blksize = 1024 * 1024 * 4;

    /* Step 2: with deferred open, non-aggregators do not open the file at
     * all.  They only pick up the broadcast information and return. */
    if (fd->hints->deferred_open && !(fd->is_agg)) {
        char value[MPI_MAX_INFO_VAL + 1];

        /* EXCL may have been stripped above; non-aggregators must still
         * report the mode the user asked for */
        fd->access_mode = requested_amode;

        /* The file-system specific open may have gathered information (e.g.
         * via stat()) that this process never sees because it skips that
         * open.  Receive it from ranklist[0]; the layout of what is
         * exchanged is defined by make_stats_type(). */
        stats_dt = make_stats_type(fd);
        MPI_Bcast(MPI_BOTTOM, 1, stats_dt, fd->hints->ranklist[0], fd->comm);
        ADIOI_Assert(fd->blksize > 0);

        /* some drivers (e.g. lustre) report the file configuration through
         * the info object during open; do the equivalent here for the
         * processes that skipped the open */
        sprintf(value, "%d", fd->hints->striping_unit);
        ADIOI_Info_set(fd->info, "striping_unit", value);

        sprintf(value, "%d", fd->hints->striping_factor);
        ADIOI_Info_set(fd->info, "striping_factor", value);

        sprintf(value, "%d", fd->hints->start_iodevice);
        ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

        *error_code = MPI_SUCCESS;
        MPI_Type_free(&stats_dt);
        return;
    }

    /* Step 3: the real open.
     *
     * Data-sieving writes do a read-modify-write, and the read fails on a
     * write-only file.  So a WRONLY request is opened RDWR when the file
     * system supports data-sieving writes (file systems without it need no
     * change, as observed by David Knaak), while fd keeps reporting the
     * user's mode. */
    amode_before_rdwr = access_mode;
    if ((access_mode & ADIO_WRONLY) && ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES)) {
        access_mode = (access_mode ^ ADIO_WRONLY) | ADIO_RDWR;
    }
    fd->access_mode = access_mode;

    (*(fd->fns->ADIOI_xxx_Open)) (fd, error_code);

    /* a failure may have been caused by the WRONLY->RDWR switch, so retry
     * with the mode we had before the switch */
    fd->access_mode = amode_before_rdwr;
    if (*error_code != MPI_SUCCESS)
        (*(fd->fns->ADIOI_xxx_Open)) (fd, error_code);

    /* bring back whatever was stripped in step 1 (CREAT/EXCL) */
    fd->access_mode = requested_amode;

    /* share the open-time information with every process of the
     * communicator, not only with those that took part in the open */
    stats_dt = make_stats_type(fd);
    MPI_Bcast(MPI_BOTTOM, 1, stats_dt, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&stats_dt);

    /* the file domain code gets confused in a hard-to-debug way when the
     * block size is not sensible */
    ADIOI_Assert(fd->blksize > 0);

    /* deferred open: a process that gets here has opened the file
     * (non-aggregators doing deferred open returned in step 2) */
    fd->is_open = 1;
}
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, new_brd_size, brd_size, size; int i, j, k, st_index=0; MPI_Count num, bufsize; int n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype; ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size; MPI_Count filetype_size, etype_size, buftype_size, partial_read; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off, req_len, sum; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int info_flag; unsigned max_bufsize, readbuf_len; ADIO_Status status1; if (fd->hints->ds_read == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on reads, use naive * approach instead. */ ADIOI_GEN_ReadStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. 
*/ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code); if (*error_code != MPI_SUCCESS) return; for (j=0; j<count; j++) { for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } } if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); } else { /* noncontiguous in file */ flat_file = ADIOI_Flatten_and_find(fd->filetype); disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = 
flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { /* a count of bytes can overflow. operate on original type instead */ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. 
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { i_offset += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); readbuf_off = 0; readbuf_len = 0; readbuf = (char *) ADIOI_Malloc(max_bufsize); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i_offset; ADIOI_BUFFERED_READ } i_offset += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 
1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
/*
 * MPIOI_File_iwrite - shared implementation of a nonblocking independent
 * write.
 *
 *   mpi_fh        - MPI file handle (resolved with MPIO_File_resolve)
 *   offset        - explicit offset; must be >= 0 when file_ptr_type is
 *                   ADIO_EXPLICIT_OFFSET.  In the contiguous case it is
 *                   scaled by fh->etype_size and added to fh->disp.
 *   file_ptr_type - ADIO_EXPLICIT_OFFSET, or anything else to use fh->fp_ind
 *   buf, count, datatype - the user buffer description
 *   myname        - routine name handed to the error-reporting macros
 *   request       - out: request representing the operation
 *
 * Returns the MPI error code of the operation.
 *
 * The MPIO_CHECK_* macros take error_code by name and are expected to leave
 * through the fn_exit label on failure, which is why that label is kept.
 */
int MPIOI_File_iwrite(MPI_File mpi_fh, MPI_Offset offset, int file_ptr_type,
                      void *buf, int count, MPI_Datatype datatype,
                      char *myname, MPI_Request *request)
{
    int error_code, bufsize, buftype_is_contig, filetype_is_contig;
    int datatype_size;
    ADIO_Status status;
    ADIO_Offset off;
    ADIO_File fh;
    MPI_Offset nbytes = 0;

    fh = MPIO_File_resolve(mpi_fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(fh, myname, error_code);
    MPIO_CHECK_COUNT(fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(fh, datatype, myname, error_code);

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPI_Type_size(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_WRITABLE(fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(fh, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fh->filetype, &filetype_is_contig);

    ADIOI_TEST_DEFERRED(fh, myname, &error_code);

    /* anything noncontiguous (in memory or in the file) goes to the strided
     * routine, which receives the caller's offset unchanged */
    if (!(buftype_is_contig && filetype_is_contig)) {
        ADIO_IwriteStrided(fh, buf, count, datatype, file_ptr_type,
                           offset, request, &error_code);
        goto fn_exit;
    }

    /* fully contiguous from here on: work in bytes */
    bufsize = datatype_size * count;
    if (file_ptr_type != ADIO_EXPLICIT_OFFSET) {
        off = fh->fp_ind;
    } else {
        off = fh->disp + fh->etype_size * offset;
    }

    /* no atomicity requested: hand the write to the nonblocking routine */
    if (!(fh->atomicity)) {
        ADIO_IwriteContig(fh, buf, count, datatype, file_ptr_type,
                          off, request, &error_code);
        goto fn_exit;
    }

    /* Atomic mode: to keep strict atomicity with respect to other concurrent
     * operations, take an exclusive lock (when the file system has locks)
     * and use the blocking write.  The braces around the lock macros are
     * deliberate; the macros are used as complete statements. */
    if (ADIO_Feature(fh, ADIO_LOCKS)) {
        ADIOI_WRITE_LOCK(fh, off, SEEK_SET, bufsize);
    }

    ADIO_WriteContig(fh, buf, count, datatype, file_ptr_type, off,
                     &status, &error_code);

    if (ADIO_Feature(fh, ADIO_LOCKS)) {
        ADIOI_UNLOCK(fh, off, SEEK_SET, bufsize);
    }

    /* nbytes stays 0 when the write failed */
    if (error_code == MPI_SUCCESS) {
        nbytes = count * datatype_size;
    }

    /* the write has already finished; give the caller a completed request */
    MPIO_Completed_request_create(&fh, nbytes, &error_code, request);

  fn_exit:
    return error_code;
}
/*
 * ADIOI_GEN_SetInfo - generic hint processing.
 *
 *   fd         - file whose hints are set up.  fd->info is created if it is
 *                MPI_INFO_NULL; fd->hints must already be allocated (that
 *                happens at file open time) and is filled in here.
 *   users_info - info object supplied by the user, or MPI_INFO_NULL.  Keys
 *                ROMIO understands override the defaults.
 *   error_code - out: MPI_SUCCESS, or an MPI_ERR_OTHER based code when an
 *                allocation fails.  Codes written by the
 *                ADIOI_Info_check_and_install_* helpers are overwritten with
 *                MPI_SUCCESS at the end of the routine.
 *
 * Processing happens in three stages: defaults (first call only), the user's
 * hints, and post-processing that resolves conflicts between hints.
 */
void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, nprocs = 0, len;
    int ok_to_override_cb_nodes = 0;
    static char myname[] = "ADIOI_GEN_SETINFO";

    /* if we've already set up default hints and there is no info object to
     * work on, we can short-circuit hint processing */
    if (fd->hints->initialized && fd->info == MPI_INFO_NULL) {
        *error_code = MPI_SUCCESS;
        return;
    }

    ad_get_env_vars();

    if (fd->info == MPI_INFO_NULL)
        MPI_Info_create(&(fd->info));
    info = fd->info;

    MPI_Comm_size(fd->comm, &nprocs);

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here. */

    /* scratch buffer for formatting and reading info values */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    if (value == NULL) {
        *error_code = MPIO_Err_create_code(*error_code,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_OTHER, "**nomem2", 0);
        return;
    }

    /* ---- stage 1: defaults ------------------------------------------- */

    /* initialize info and hints to default values if they haven't been
     * previously initialized */
    if (!fd->hints->initialized) {

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_CB_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use
         * collective buffering */
        ADIOI_Info_set(info, "romio_cb_read", "automatic");
        fd->hints->cb_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_cb_write", "automatic");
        fd->hints->cb_write = ADIOI_HINT_AUTO;

        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = nprocs;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms.
         * NOTE(review): the default is published under the key
         * "romio_cb_fr_types" while the user's hint is read below as
         * "romio_cb_fr_type"; left untouched here, confirm which is meant. */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_IND_WR_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use data
         * sieving */
        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;

        /* ADIO_Open sets up collective buffering arrays.  If we are in this
         * path from say set_file_view, then we don't want to adjust the
         * array: we'll get a segfault during collective i/o.  We only want to
         * look at the user's cb_nodes if it's open time */
        ok_to_override_cb_nodes = 1;
    }

    /* ---- stage 2: the user's hints ------------------------------------ */

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
                                         &(fd->hints->cb_buffer_size), myname, error_code);

        /* aligning file realms to certain sizes (e.g. stripe sizes)
         * may benefit I/O performance */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_alignment",
                                         &(fd->hints->cb_fr_alignment), myname, error_code);

        /* for collective I/O, try to be smarter about when to do data sieving
         * using a specific threshold for the datatype size/extent
         * (percentage 0-100%) */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_ds_threshold",
                                         &(fd->hints->cb_ds_threshold), myname, error_code);

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_alltoall",
                                             &(fd->hints->cb_alltoall), myname, error_code);

        /* hints for enabling/disabling coll. buffering on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
                                             &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw.  no_indep_rw is a 0/1
             * flag (it is tested for truth and for == 1 below), so store 0
             * to match the "false" published in the info object rather than
             * the tri-state ADIOI_HINT_DISABLE constant. */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
                                             &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw; 0 for the same reason
             * as in the romio_cb_read case */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        /* enable/disable persistent file realms for collective I/O */
        /* may want to check for no_indep_rdwr hint as well */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_pfr",
                                             &(fd->hints->cb_pfr), myname, error_code);

        /* file realm assignment types ADIOI_FR_AAR(0),
         * ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
         * a regular fr size in bytes. probably not the best way... */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_type",
                                         &(fd->hints->cb_fr_type), myname, error_code);

        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
                                          &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            /* NOTE(review): literal 1 kept from the original; presumably
             * meant as the "enable" hint value - confirm */
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* hints for enabling/disabling data sieving on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
                                             &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
                                             &(fd->hints->ds_write), myname, error_code);

        if (ok_to_override_cb_nodes) {
            /* MPI_File_open path sets up some data structures that don't
             * get resized in the MPI_File_set_view path, so ignore
             * cb_nodes in the set_view case */
            ADIOI_Info_check_and_install_int(fd, users_info, "cb_nodes",
                                             &(fd->hints->cb_nodes), myname, error_code);
            if ((fd->hints->cb_nodes <= 0) || (fd->hints->cb_nodes > nprocs)) {
                /* can't ask for more aggregators than mpi processes, though it
                 * might be interesting to think what such oversubscription
                 * might mean... someday */
                MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
                ADIOI_Info_set(info, "cb_nodes", value);
                fd->hints->cb_nodes = nprocs;
            }
        }       /* if (ok_to_override_cb_nodes) */

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
                                         &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
                                         &(fd->hints->ind_rd_buffer_size), myname, error_code);

        if (fd->hints->cb_config_list == NULL) {
            /* only set cb_config_list if it isn't already set.  Note that
             * since we set it below, this ensures that the cb_config_list hint
             * will be set at file open time either by the user or to the
             * default */
            /* if it has been set already, we ignore it the second time.
             * otherwise we would get an error if someone used the same info
             * value with a cb_config_list value in it in a couple of calls,
             * which would be irritating. */
            ADIOI_Info_check_and_install_str(fd, users_info, "cb_config_list",
                                             &(fd->hints->cb_config_list), myname, error_code);
        }

        ADIOI_Info_check_and_install_int(fd, users_info, "romio_min_fdomain_size",
                                         &(fd->hints->min_fdomain_size), myname, error_code);

        /* Now we use striping unit in common code so we should
         * process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                                         &(fd->hints->striping_unit), myname, error_code);
    }

    /* ---- stage 3: post-processing ------------------------------------- */

    /* some hints take precedence over or conflict with others, or aren't
     * supported by some file systems */

    /* handle cb_config_list default value here; avoids an extra
     * free/alloc and ensures it is always set */
    if (fd->hints->cb_config_list == NULL) {
        ADIOI_Info_set(info, "cb_config_list", ADIOI_CB_CONFIG_LIST_DFLT);
        len = (strlen(ADIOI_CB_CONFIG_LIST_DFLT) + 1) * sizeof(char);
        fd->hints->cb_config_list = ADIOI_Malloc(len);
        if (fd->hints->cb_config_list == NULL) {
            /* release the scratch buffer before reporting the failure */
            ADIOI_Free(value);
            *error_code = MPIO_Err_create_code(*error_code,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_OTHER, "**nomem2", 0);
            return;
        }
        ADIOI_Strncpy(fd->hints->cb_config_list, ADIOI_CB_CONFIG_LIST_DFLT, len);
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if (((fd->hints->cb_read != ADIOI_HINT_DISABLE)
         && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
         && fd->hints->no_indep_rw)) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* the file system does not offer data-sieving writes (original
         * comment: "fs that do not support file locking"): drop the write
         * buffer size from the info object and turn the hint off */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003 */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_GPFS_SetInfo - install default hints and merge user-supplied hints
 * for a file handled by the GPFS driver.
 *
 * fd         - file whose fd->info and fd->hints are updated.  fd->hints must
 *              already be allocated (it is allocated at file open time).
 * users_info - hints supplied by the caller, or MPI_INFO_NULL for none.
 * error_code - output; set to MPI_SUCCESS on return.
 *
 * Steps, in order:
 *   1. create fd->info if it is still MPI_INFO_NULL;
 *   2. on the first call for this file (fd->hints->initialized == 0) publish
 *      the default value of every hint to fd->info and cache it in fd->hints;
 *   3. override the defaults with the values found in users_info;
 *   4. on BGQ/PE platforms rebuild the aggregator rank list when step 2 ran
 *      or the BGQ aggregator hint was supplied;
 *   5. derive the internal deferred_open hint, and disable data-sieving
 *      writes when the file system does not report ADIO_DATA_SIEVING_WRITES.
 *
 * NOTE(review): the hint-install helpers receive error_code, but whatever
 * they store there is overwritten by the final assignment of MPI_SUCCESS.
 * Confirm whether their errors are meant to be reported to the caller.
 */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, intval, nprocs = 0;
    static char myname[] = "ADIOI_GPFS_SETINFO";

    /* set when defaults were installed or the BGQ aggregator hint changed;
     * triggers the aggregator rank list rebuild below */
    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) {
        MPI_Info_create(&(fd->info));
    }
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here. */

    /* scratch buffer for reading and formatting info values */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Assert((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized */
    if (!fd->hints->initialized) {

        ad_get_env_vars();
        ad_gpfs_get_env_vars();
        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

        /* collective buffering is enabled by default for reads and writes */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        /* drop any stale aggregator list */
        if (fd->hints->cb_config_list != NULL) {
            ADIOI_Free(fd->hints->cb_config_list);
        }
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O: the info
         * object advertises the communicator size, the cached hint is left
         * at -1 */
        MPI_Comm_size(fd->comm, &nprocs);
        MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* gpfs is not implementing file realms (ADIOI_IOStridedColl), so the
         * file-realm hints are initialized to their disabled/neutral values */

        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);

        /* data sieving: let romio decide for reads and writes */
        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
                                         &(fd->hints->cb_buffer_size), myname, error_code);

        /* hints for enabling/disabling coll. buffering on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
                                             &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw.  no_indep_rw is a 0/1
             * flag everywhere else in this function, so store 0 to match
             * the "false" published to the info object. */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
                                             &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw (same 0/1 flag as above) */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
                                          &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* hints for enabling/disabling data sieving on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
                                             &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
                                             &(fd->hints->ds_write), myname, error_code);

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
                                         &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
                                         &(fd->hints->ind_rd_buffer_size), myname, error_code);

        /* romio_min_fdomain_size is accepted only when it parses to a
         * positive integer */
        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }

        /* Now we use striping unit in common code so we should
         * process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                                         &(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
        /* a positive aggregators-per-pset hint replaces cb_nodes and forces
         * the rank list to be regenerated below */
        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
        ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
        ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ((fd->hints->cb_read != ADIOI_HINT_DISABLE)
        && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
        && fd->hints->no_indep_rw) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not support file locking:
         * the ind_wr_buffer_size key is removed from the info object, while
         * the cached fd->hints->ind_wr_buffer_size is left alone because it
         * is used for other cases as well.  -- Rob Ross, 04/22/2003 */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/*@
    MPI_File_open - Opens a file

Input Parameters:
. comm - communicator (handle)
. filename - name of file to open (string)
. amode - file access mode (integer)
. info - info object (handle)

Output Parameters:
. fh - file handle (handle)

.N fortran
@*/
int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode,
                  MPI_Info info, MPI_File *fh)
{
    /*
     * Outline: validate the communicator and amode, duplicate the
     * communicator, verify that every process passed the same amode, resolve
     * the file system from the file name, open the file through ADIO_Open
     * and, when the file system supports shared file pointers, set up the
     * shared-file-pointer file name (and position it at end of file for
     * MPI_MODE_APPEND).
     *
     * Returns MPI_SUCCESS, or the error code produced by
     * MPIO_Err_return_file() on any failure path.
     */
    int error_code = MPI_SUCCESS, file_system, flag, tmp_amode = 0, rank;
    char *tmp;
    MPI_Comm dupcomm = MPI_COMM_NULL;
    ADIOI_Fns *fsops;
    static char myname[] = "MPI_FILE_OPEN";
#ifdef MPI_hpux
    int fl_xmpi;

    HPMP_IO_OPEN_START(fl_xmpi, comm);
#endif /* MPI_hpux */

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_COMM(comm, myname, error_code);
    MPIO_CHECK_INFO_ALL(info, error_code, comm);
    /* --END ERROR HANDLING-- */

    /* only intracommunicators are accepted */
    error_code = MPI_Comm_test_inter(comm, &flag);
    /* --BEGIN ERROR HANDLING-- */
    if (error_code || flag) {
        error_code = MPIO_Err_create_code(error_code, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_COMM,
                                          "**commnotintra", 0);
        goto fn_fail;
    }

    /* exactly one of RDONLY, RDWR, WRONLY must be given */
    if (((amode & MPI_MODE_RDONLY) ? 1 : 0) + ((amode & MPI_MODE_RDWR) ? 1 : 0) +
        ((amode & MPI_MODE_WRONLY) ? 1 : 0) != 1) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_AMODE,
                                          "**fileamodeone", 0);
        goto fn_fail;
    }

    /* RDONLY cannot be combined with CREATE or EXCL */
    if ((amode & MPI_MODE_RDONLY) &&
        ((amode & MPI_MODE_CREATE) || (amode & MPI_MODE_EXCL))) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_AMODE,
                                          "**fileamoderead", 0);
        goto fn_fail;
    }

    /* RDWR cannot be combined with SEQUENTIAL */
    if ((amode & MPI_MODE_RDWR) && (amode & MPI_MODE_SEQUENTIAL)) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_AMODE,
                                          "**fileamodeseq", 0);
        goto fn_fail;
    }

    MPI_Comm_dup(comm, &dupcomm);

    /* check if ADIO has been initialized. If not, initialize it */
    MPIR_MPIOInit(&error_code);
    if (error_code != MPI_SUCCESS)
        goto fn_fail;

    /* check if amode is the same on all processes: at first glance, one might
     * try to use a built-in operator like MPI_BAND, but we need every mpi
     * process to agree the amode was not the same.  Consider process A with
     * MPI_MODE_CREATE|MPI_MODE_RDWR, and B with MPI_MODE_RDWR: MPI_BAND
     * yields MPI_MODE_RDWR.  A determines amodes are different, but B
     * proceeds having not detected an error */
    MPI_Allreduce(&amode, &tmp_amode, 1, MPI_INT, ADIO_same_amode, dupcomm);

    if (tmp_amode == ADIO_AMODE_NOMATCH) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_NOT_SAME,
                                          "**fileamodediff", 0);
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

    file_system = -1;

    /* resolve file system type from file name; this is a collective call */
    ADIO_ResolveFileType(dupcomm, filename, &file_system, &fsops, &error_code);
    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        /* ADIO_ResolveFileType() will print as informative a message as it
         * possibly can or call MPIO_Err_setmsg.  We just need to propagate
         * the error up. */
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

    /* strip off prefix if there is one, but only skip prefixes
     * if they are greater than length one to allow for windows
     * drive specifications (e.g. c:\...).  strchr() returns NULL when the
     * name has no ':'; test for that explicitly instead of ordering a null
     * pointer against filename. */
    tmp = strchr(filename, ':');
    if (tmp != NULL && tmp > filename + 1) {
        filename = tmp + 1;
    }

    /* use default values for disp, etype, filetype */
    *fh = ADIO_Open(comm, dupcomm, filename, file_system, fsops, amode, 0,
                    MPI_BYTE, MPI_BYTE, info, ADIO_PERM_NULL, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

    /* if MPI_MODE_SEQUENTIAL requested, file systems cannot do explicit
     * offset or independent file pointer accesses, leaving not much else
     * aside from shared file pointer accesses. */
    if (!ADIO_Feature((*fh), ADIO_SHARED_FP) && (amode & MPI_MODE_SEQUENTIAL)) {
        /* The close status goes into its own variable: handing &error_code
         * to ADIO_Close would overwrite the unsupported-operation error
         * created here, and the failure could then be returned as success. */
        int close_err;

        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__,
                                          MPI_ERR_UNSUPPORTED_OPERATION,
                                          "**iosequnsupported", 0);
        ADIO_Close(*fh, &close_err);
        /* NOTE(review): if ADIO_Close releases the communicator stored in
         * the file handle, the MPI_Comm_free(&dupcomm) at fn_fail acts on an
         * already-released communicator -- confirm against ADIO_Close. */
        goto fn_fail;
    }

    /* determine name of file that will hold the shared file pointer */
    /* can't support shared file pointers on a file system that doesn't
     * support file locking. */
    if ((error_code == MPI_SUCCESS) && ADIO_Feature((*fh), ADIO_SHARED_FP)) {
        MPI_Comm_rank(dupcomm, &rank);
        ADIOI_Shfp_fname(*fh, rank, &error_code);
        if (error_code != MPI_SUCCESS)
            goto fn_fail;

        /* if MPI_MODE_APPEND, set the shared file pointer to end of file.
         * indiv. file pointer already set to end of file in ADIO_Open.
         * Here file view is just bytes. */
        if ((*fh)->access_mode & MPI_MODE_APPEND) {
            if (rank == (*fh)->hints->ranklist[0]) {
                /* only one person need set the sharedfp */
                ADIO_Set_shared_fp(*fh, (*fh)->fp_ind, &error_code);
            }
            MPI_Barrier(dupcomm);
        }
    }

#ifdef MPI_hpux
    HPMP_IO_OPEN_END(fl_xmpi, *fh, comm);
#endif /* MPI_hpux */

  fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return error_code;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (dupcomm != MPI_COMM_NULL)
        MPI_Comm_free(&dupcomm);
    error_code = MPIO_Err_return_file(MPI_FILE_NULL, error_code);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Returns 1 when ADIO_Feature() reports ADIO_TWO_PHASE for fd, 0 otherwise.
 * ADIO_Open consults this when deciding whether deferred open may stay
 * enabled. */
static int uses_generic_write(ADIO_File fd)
{
    return ADIO_Feature(fd, ADIO_TWO_PHASE) ? 1 : 0;
}
/*@
    MPI_File_close - Closes a file

Input Parameters:
. fh - file handle (handle)

.N fortran
@*/
int MPI_File_close(MPI_File *fh)
{
    /*
     * Closes the shared-file-pointer file first (when the file system
     * supports shared file pointers and one was opened), then resets the
     * error handler, closes the file itself and releases the handle.
     * Returns MPI_SUCCESS or the code produced by MPIO_Err_return_file().
     */
    int error_code;
    ADIO_File adio_fh;
    static char myname[] = "MPI_FILE_CLOSE";
#ifdef MPI_hpux
    int fl_xmpi;

    HPMP_IO_WSTART(fl_xmpi, BLKMPIFILECLOSE, TRDTBLOCK, *adio_fh);
#endif /* MPI_hpux */

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(*fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    if (ADIO_Feature(adio_fh, ADIO_SHARED_FP)) {
        MPI_File *shared_handle;

        ADIOI_Free(adio_fh->shared_fp_fname);

        /* POSIX semantics say a deleted file remains available until all
         * processes close the file, but not every file system honors that
         * (NFS, for one).  File systems without ADIO_UNLINK_AFTER_CLOSE
         * therefore synchronize before the shared-fp file is closed. */
        if (ADIO_Feature(adio_fh, ADIO_UNLINK_AFTER_CLOSE) == 0) {
            MPI_Barrier(adio_fh->comm);
        }

        if (adio_fh->shared_fp_fd != ADIO_FILE_NULL) {
            shared_handle = &(adio_fh->shared_fp_fd);
            ADIO_Close(adio_fh->shared_fp_fd, &error_code);
            MPIO_File_free(shared_handle);
            /* --BEGIN ERROR HANDLING-- */
            if (error_code != MPI_SUCCESS) {
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }
    }

    /* Because ROMIO expects the MPI library to provide error handler
     * management routines but it doesn't ever participate in
     * MPI_File_close, we have to somehow inform the MPI library that we no
     * longer hold a reference to any user defined error handler.  We do this
     * by setting the errhandler at this point to MPI_ERRORS_RETURN. */
    error_code = PMPI_File_set_errhandler(*fh, MPI_ERRORS_RETURN);
    if (error_code != MPI_SUCCESS) {
        goto fn_fail;
    }

    /* close the file proper, then release the caller's handle */
    ADIO_Close(adio_fh, &error_code);
    MPIO_File_free(fh);
    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

#ifdef MPI_hpux
    HPMP_IO_WEND(fl_xmpi);
#endif /* MPI_hpux */

  fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return error_code;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    error_code = MPIO_Err_return_file(adio_fh, error_code);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
MPI_File ADIO_Open(MPI_Comm orig_comm, MPI_Comm comm, const char *filename, int file_system, ADIOI_Fns *ops, int access_mode, ADIO_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, MPI_Info info, int perm, int *error_code) { MPI_File mpi_fh; ADIO_File fd; int err, rank, procs; static char myname[] = "ADIO_OPEN"; int max_error_code; MPI_Info dupinfo; int syshints_processed, can_skip; char *p; *error_code = MPI_SUCCESS; /* obtain MPI_File handle */ mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD)); if (mpi_fh == MPI_FILE_NULL) { fd = MPI_FILE_NULL; *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd = MPIO_File_resolve(mpi_fh); fd->cookie = ADIOI_FILE_COOKIE; fd->fp_ind = disp; fd->fp_sys_posn = 0; fd->comm = comm; /* dup'ed in MPI_File_open */ fd->filename = ADIOI_Strdup(filename); fd->file_system = file_system; fd->fs_ptr = NULL; fd->fns = ops; fd->disp = disp; fd->split_coll_count = 0; fd->shared_fp_fd = ADIO_FILE_NULL; fd->atomicity = 0; fd->etype = etype; /* MPI_BYTE by default */ fd->filetype = filetype; /* MPI_BYTE by default */ fd->etype_size = 1; /* default etype is MPI_BYTE */ fd->file_realm_st_offs = NULL; fd->file_realm_types = NULL; fd->perm = perm; fd->async_count = 0; fd->fortran_handle = -1; fd->err_handler = ADIOI_DFLT_ERR_HANDLER; fd->io_buf_window = MPI_WIN_NULL; fd->io_buf_put_amounts_window = MPI_WIN_NULL; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &procs); /* create and initialize info object */ fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct)); if (fd->hints == NULL) { *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd->hints->cb_config_list = NULL; fd->hints->ranklist = NULL; fd->hints->initialized = 0; fd->info = MPI_INFO_NULL; /* move system-wide hint processing *back* into open, but this time the * hintfile reader will do a 
scalable read-and-broadcast. The global * ADIOI_syshints will get initialized at first open. subsequent open * calls will just use result from first open. * * We have two goals here: * 1: avoid processing the hintfile multiple times * 2: have all processes participate in hintfile processing (so we can read-and-broadcast) * * a code might do an "initialize from 0", so we can only skip hint * processing once everyone has particpiated in hint processing */ if (ADIOI_syshints == MPI_INFO_NULL) syshints_processed = 0; else syshints_processed = 1; MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm); if (!can_skip) { if (ADIOI_syshints == MPI_INFO_NULL) MPI_Info_create(&ADIOI_syshints); ADIOI_process_system_hints(fd, ADIOI_syshints); } ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo); ADIO_SetInfo(fd, dupinfo, &err); if (dupinfo != MPI_INFO_NULL) { *error_code = MPI_Info_free(&dupinfo); if (*error_code != MPI_SUCCESS) goto fn_exit; } ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname); /* Instead of repeatedly allocating this buffer in collective read/write, * allocating up-front might make memory management on small platforms * (e.g. Blue Gene) more efficent */ fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size); /* deferred open: * we can only do this optimization if 'fd->hints->deferred_open' is set * (which means the user hinted 'no_indep_rw' and collective buffering). * Furthermore, we only do this if our collective read/write routines use * our generic function, and not an fs-specific routine (we can defer opens * only if we use our aggreagation code). 
*/ if (fd->hints->deferred_open && !(uses_generic_read(fd) \ && uses_generic_write(fd))) { fd->hints->deferred_open = 0; } if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN)) /* disable deferred open on these fs so that scalable broadcast * will always use the propper communicator */ fd->hints->deferred_open = 0; /* on BlueGene, the cb_config_list is built when hints are processed. No * one else does that right now */ if (fd->hints->ranklist == NULL) { build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code); if (*error_code != MPI_SUCCESS) goto fn_exit; } fd->is_open = 0; fd->my_cb_nodes_index = -2; fd->is_agg = is_aggregator(rank, fd); /* deferred open used to split the communicator to create an "aggregator * communicator", but we only used it as a way to indicate that deferred * open happened. fd->is_open and fd->is_agg are sufficient */ /* actual opens start here */ /* generic open: one process opens to create the file, all others open */ /* nfs open: everybody opens or else you'll end up with "file not found" * due to stupid nfs consistency semantics */ /* scalable open: one process opens and broadcasts results to everyone */ ADIOI_OpenColl(fd, rank, access_mode, error_code); /* for debugging, it can be helpful to see the hints selected. Some file * systes set up the hints in the open call (e.g. 
lustre) */ p = getenv("ROMIO_PRINT_HINTS"); if (rank == 0 && p != NULL ) { ADIOI_Info_print_keyvals(fd->info); } fn_exit: MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm); if (max_error_code != MPI_SUCCESS) { /* If the file was successfully opened, close it */ if (*error_code == MPI_SUCCESS) { /* in the deferred open case, only those who have actually opened the file should close it */ if (fd->hints->deferred_open) { if (fd->is_agg) { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } else { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } ADIOI_Free(fd->filename); ADIOI_Free(fd->hints->ranklist); ADIOI_Free(fd->hints->cb_config_list); ADIOI_Free(fd->hints); if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info)); ADIOI_Free(fd->io_buf); ADIOI_Free(fd); fd = ADIO_FILE_NULL; if (*error_code == MPI_SUCCESS) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**oremote_fail", 0); } } return fd; }
/*@
    MPI_File_set_size - Sets the file size

Input Parameters:
. fh - file handle (handle)
. size - size to truncate or expand file (nonnegative integer)

.N fortran
@*/
int MPI_File_set_size(MPI_File fh, MPI_Offset size)
{
    /*
     * Validates the handle and the requested size, verifies through two
     * allreduces (MPI_MAX and MPI_MIN) that every process passed the same
     * size, and then resizes the file with ADIO_Resize.
     * Returns MPI_SUCCESS or the code produced by MPIO_Err_return_file().
     */
    int error_code;
    ADIO_File adio_fh;
    static char myname[] = "MPI_FILE_SET_SIZE";
    MPI_Offset tmp_sz, max_sz, min_sz;
#ifdef MPI_hpux
    int fl_xmpi;

    HPMP_IO_START(fl_xmpi, BLKMPIFILESETSIZE, TRDTBLOCK, adio_fh,
                  MPI_DATATYPE_NULL, -1);
#endif /* MPI_hpux */

    ROMIO_THREAD_CS_ENTER();

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);

    if (size < 0) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadsize", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* checked on the resolved handle, like every other check in this
     * function, rather than on the caller's raw MPI_File */
    MPIO_CHECK_WRITABLE(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    /* all processes must request the same size: max and min have to agree */
    tmp_sz = size;
    MPI_Allreduce(&tmp_sz, &max_sz, 1, ADIO_OFFSET, MPI_MAX, adio_fh->comm);
    MPI_Allreduce(&tmp_sz, &min_sz, 1, ADIO_OFFSET, MPI_MIN, adio_fh->comm);

    /* --BEGIN ERROR HANDLING-- */
    if (max_sz != min_sz) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**notsame", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    if (!ADIO_Feature(adio_fh, ADIO_SCALABLE_RESIZE)) {
        /* rare stupid file systems (like NFS) need to carry out resize on
         * all processes */
        ADIOI_TEST_DEFERRED(adio_fh, "MPI_File_set_size", &error_code);
    }

    ADIO_Resize(adio_fh, size, &error_code);
    /* TODO: what to do with error code? */

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

#ifdef MPI_hpux
    HPMP_IO_END(fl_xmpi, adio_fh, MPI_DATATYPE_NULL, -1);
#endif /* MPI_hpux */

  fn_exit:
    ROMIO_THREAD_CS_EXIT();

    return error_code;
}
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { /* if fd->info is null, create a new info object. Initialize fd->info to default values. Initialize fd->hints to default values. Examine the info object passed by the user. If it contains values that ROMIO understands, override the default. */ MPI_Info info; char *value; int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0; static char myname[] = "ADIOI_BGL_SETINFO"; int did_anything = 0; if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info)); info = fd->info; /* Note that fd->hints is allocated at file open time; thus it is * not necessary to allocate it, or check for allocation, here. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); AD_BGL_assert ((value != NULL)); /* initialize info and hints to default values if they haven't been * previously initialized */ if (!fd->hints->initialized) { did_anything = 1; /* buffer size for collective I/O */ ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT); fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT); /* default is to let romio automatically decide when to use * collective buffering */ ADIOI_Info_set(info, "romio_cb_read", "enable"); fd->hints->cb_read = ADIOI_HINT_ENABLE; ADIOI_Info_set(info, "romio_cb_write", "enable"); fd->hints->cb_write = ADIOI_HINT_ENABLE; if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list); fd->hints->cb_config_list = NULL; /* number of processes that perform I/O in collective I/O */ MPI_Comm_size(fd->comm, &nprocs); nprocs_is_valid = 1; ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs); ADIOI_Info_set(info, "cb_nodes", value); fd->hints->cb_nodes = -1; /* hint indicating that no indep. I/O will be performed on this file */ ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; /* bgl is not implementing file realms (ADIOI_IOStridedColl), initialize to disabled it. 
*/ /* hint instructing the use of persistent file realms */ ADIOI_Info_set(info, "romio_cb_pfr", "disable"); fd->hints->cb_pfr = ADIOI_HINT_DISABLE; /* hint guiding the assignment of persistent file realms */ ADIOI_Info_set(info, "romio_cb_fr_types", "aar"); fd->hints->cb_fr_type = ADIOI_FR_AAR; /* hint to align file realms with a certain byte value */ ADIOI_Info_set(info, "romio_cb_fr_alignment", "1"); fd->hints->cb_fr_alignment = 1; /* hint to set a threshold percentage for a datatype's size/extent at * which data sieving should be done in collective I/O */ ADIOI_Info_set(info, "romio_cb_ds_threshold", "0"); fd->hints->cb_ds_threshold = 0; /* hint to switch between point-to-point or all-to-all for two-phase */ ADIOI_Info_set(info, "romio_cb_alltoall", "automatic"); fd->hints->cb_alltoall = ADIOI_HINT_AUTO; /* deferred_open derived from no_indep_rw and cb_{read,write} */ fd->hints->deferred_open = 0; /* buffer size for data sieving in independent reads */ ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT); /* buffer size for data sieving in independent writes */ ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT); if(fd->file_system == ADIO_UFS) { /* default for ufs/pvfs is to disable data sieving */ ADIOI_Info_set(info, "romio_ds_read", "disable"); fd->hints->ds_read = ADIOI_HINT_DISABLE; ADIOI_Info_set(info, "romio_ds_write", "disable"); fd->hints->ds_write = ADIOI_HINT_DISABLE; } else { /* default is to let romio automatically decide when to use data * sieving */ ADIOI_Info_set(info, "romio_ds_read", "automatic"); fd->hints->ds_read = ADIOI_HINT_AUTO; ADIOI_Info_set(info, "romio_ds_write", "automatic"); fd->hints->ds_write = ADIOI_HINT_AUTO; } /* still to do: tune this a bit for a variety of file systems. 
there's * no good default value so just leave it unset */ fd->hints->min_fdomain_size = 0; fd->hints->striping_unit = 0; fd->hints->initialized = 1; } /* add in user's info if supplied */ if (users_info != MPI_INFO_NULL) { ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "cb_buffer_size", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "cb_buffer_size", value); fd->hints->cb_buffer_size = intval; } #if 0 /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */ /* aligning file realms to certain sizes (e.g. stripe sizes) * may benefit I/O performance */ ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_alignment", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_fr_alignment", value); fd->hints->cb_fr_alignment = intval; } /* for collective I/O, try to be smarter about when to do data sieving * using a specific threshold for the datatype size/extent * (percentage 0-100%) */ ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_ds_threshold", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_ds_threshold", value); fd->hints->cb_ds_threshold = intval; } ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if 
(!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_alltoall", value); fd->hints->cb_read = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_alltoall; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_alltoall) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_alltoall", error_code); return; } /* --END ERROR HANDLING-- */ } #endif /* new hints for enabling/disabling coll. buffering on * reads/writes */ ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_read overrides no_indep_rw */ ADIOI_Info_set(info, "romio_cb_read", value); ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_read = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_read; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_read) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_read", error_code); return; } /* --END ERROR HANDLING-- */ } ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = 
ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_write overrides no_indep_rw, too */ ADIOI_Info_set(info, "romio_cb_write", value); ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_write = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_write; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_write) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_write", error_code); return; } /* --END ERROR HANDLING-- */ } #if 0 /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */ /* enable/disable persistent file realms for collective I/O */ /* may want to check for no_indep_rdwr hint as well */ ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_cb_pfr", value); fd->hints->cb_pfr = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_pfr; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_pfr) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_pfr", error_code); return; } /* --END ERROR HANDLING-- */ } /* file realm assignment types ADIOI_FR_AAR(0), ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify a regular fr size in bytes. probably not the best way... 
*/ ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) >= -2)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_type", error_code); return; } /* --END ERROR HANDLING-- */ ADIOI_Info_set(info, "romio_cb_fr_type", value); fd->hints->cb_fr_type = intval; } #endif /* new hint for specifying no indep. read/write will be performed */ ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { /* if 'no_indep_rw' set, also hint that we will do * collective buffering: if we aren't doing independent io, * then we have to do collective */ ADIOI_Info_set(info, "romio_no_indep_rw", value); ADIOI_Info_set(info, "romio_cb_write", "enable"); ADIOI_Info_set(info, "romio_cb_read", "enable"); fd->hints->no_indep_rw = 1; fd->hints->cb_read = 1; fd->hints->cb_write = 1; tmp_val = 1; } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { ADIOI_Info_set(info, "romio_no_indep_rw", value); fd->hints->no_indep_rw = 0; tmp_val = 0; } else { /* default is above */ tmp_val = 0; } MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->no_indep_rw) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_no_indep_rw", error_code); return; } /* --END ERROR HANDLING-- */ } /* new hints for enabling/disabling data sieving on * reads/writes */ ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || 
!strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_AUTO; } /* otherwise ignore */ } ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { ADIOI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_AUTO; } /* otherwise ignore */ } ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { ADIOI_Info_set(info, "ind_wr_buffer_size", value); fd->hints->ind_wr_buffer_size = intval; } ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { ADIOI_Info_set(info, "ind_rd_buffer_size", value); fd->hints->ind_rd_buffer_size = intval; } memset( value, 0, MPI_MAX_INFO_VAL+1 ); ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag); if ( flag && ((intval = atoi(value)) > 0) ) { ADIOI_Info_set(info, "romio_min_fdomain_size", value); fd->hints->min_fdomain_size = intval; } /* Now we use striping unit in common code so we should process hints for it. 
*/ ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); if ( flag && ((intval = atoi(value)) > 0) ) { ADIOI_Info_set(info, "striping_unit", value); fd->hints->striping_unit = intval; } memset( value, 0, MPI_MAX_INFO_VAL+1 ); ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { did_anything = 1; ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value); fd->hints->cb_nodes = intval; } } /* associate CB aggregators to certain CNs in every involved PSET */ if (did_anything) { ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes); } /* ignore defered open hints and do not enable it for bluegene: need all * processors in the open path so we can stat-and-broadcast the blocksize */ ADIOI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; fd->hints->deferred_open = 0; /* BobC commented this out, but since hint processing runs on both bgl and * bglockless, we need to keep DS writes enabled on gpfs and disabled on * PVFS */ if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) { /* disable data sieving for fs that do not support file locking */ ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag) { /* get rid of this value if it is set */ ADIOI_Info_delete(info, "ind_wr_buffer_size"); } /* note: leave ind_wr_buffer_size alone; used for other cases * as well. -- Rob Ross, 04/22/2003 */ ADIOI_Info_set(info, "romio_ds_write", "disable"); fd->hints->ds_write = ADIOI_HINT_DISABLE; } ADIOI_Free(value); *error_code = MPI_SUCCESS; }
/*
 * MPIOI_File_read - shared implementation of a blocking, independent read.
 *
 * Parameters:
 *   fh            - MPI file handle; resolved to an ADIO_File internally
 *   offset        - explicit offset in etype units; only consulted when
 *                   file_ptr_type == ADIO_EXPLICIT_OFFSET (must be >= 0)
 *   file_ptr_type - ADIO_EXPLICIT_OFFSET, or the individual file pointer
 *   buf           - user buffer receiving the data
 *   count         - number of `datatype` elements to read
 *   datatype      - memory datatype of `buf`
 *   myname        - caller's name, used when building error codes
 *   status        - status object filled in by the underlying ADIO read
 *
 * Returns MPI_SUCCESS or an MPI error code.
 *
 * Fix relative to the previous version: when the file uses the external32
 * representation, the data is first read into a staging buffer and then
 * converted into `buf`.  The conversion result used to overwrite
 * error_code unconditionally, so a failed read could be reported as
 * success (and garbage converted into the user buffer).  The conversion
 * now runs only when the read itself succeeded; the staging buffer is
 * freed in either case.
 */
int MPIOI_File_read(MPI_File fh,
                    MPI_Offset offset,
                    int file_ptr_type,
                    void *buf,
                    int count,
                    MPI_Datatype datatype,
                    char *myname,
                    MPI_Status *status)
{
    int error_code, buftype_is_contig, filetype_is_contig;
    MPI_Count datatype_size;
    ADIO_File adio_fh;
    ADIO_Offset off, bufsize;
    void *xbuf=NULL, *e32_buf=NULL;

    ROMIO_THREAD_CS_ENTER();

    adio_fh = MPIO_File_resolve(fh);

    /* NOTE(review): the MPIO_CHECK_* macros are defined elsewhere; they
     * presumably set error_code and jump to fn_exit on failure -- confirm. */
    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0)
    {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    /* zero-byte request: nothing to read, report zero bytes and succeed */
    if (count*datatype_size == 0)
    {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        error_code = MPI_SUCCESS;
        goto fn_exit;
    }

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_READABLE(adio_fh, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(adio_fh->filetype, &filetype_is_contig);

    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    /* By default read straight into the user buffer.  For external32 files
     * read into a staging buffer instead and convert afterwards. */
    xbuf = buf;
    if (adio_fh->is_external32)
    {
        MPI_Aint e32_size = 0;
        error_code = MPIU_datatype_full_size(datatype, &e32_size);
        if (error_code != MPI_SUCCESS)
            goto fn_exit;

        e32_buf = ADIOI_Malloc(e32_size*count);
        xbuf = e32_buf;
    }

    if (buftype_is_contig && filetype_is_contig)
    {
        /* convert count and offset to bytes */
        bufsize = datatype_size * count;
        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = adio_fh->disp + adio_fh->etype_size * offset;
        }
        else /* ADIO_INDIVIDUAL */ {
            off = adio_fh->fp_ind;
        }

        /* if atomic mode requested, lock (exclusive) the region, because
           there could be a concurrent noncontiguous request. */
        if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) {
            ADIOI_WRITE_LOCK(adio_fh, off, SEEK_SET, bufsize);
        }

        ADIO_ReadContig(adio_fh, xbuf, count, datatype, file_ptr_type,
                        off, status, &error_code);

        if ((adio_fh->atomicity) && ADIO_Feature(adio_fh, ADIO_LOCKS)) {
            ADIOI_UNLOCK(adio_fh, off, SEEK_SET, bufsize);
        }
    }
    else
    {
        ADIO_ReadStrided(adio_fh, xbuf, count, datatype, file_ptr_type,
                         offset, status, &error_code);
        /* For strided and atomic mode, locking is done in ADIO_ReadStrided */
    }

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

    if (e32_buf != NULL) {
        /* Convert only data that was actually read; a failed read must
         * keep its error code rather than have it replaced by the
         * conversion's return value. */
        if (error_code == MPI_SUCCESS) {
            error_code = MPIU_read_external32_conversion_fn(buf, datatype,
                                                            count, e32_buf);
        }
        ADIOI_Free(e32_buf);
    }

fn_exit:
    ROMIO_THREAD_CS_EXIT();

    return error_code;
}
/*
 * ADIOI_GEN_WriteStrided_naive - write a (possibly) noncontiguous request
 * by issuing one ADIO_WriteContig call per contiguous piece.
 *
 * Parameters:
 *   fd            - open ADIO file; its filetype, etype_size, disp, fp_ind
 *                   and atomicity fields are read, and fp_ind/fp_sys_posn
 *                   are updated
 *   buf           - user buffer holding the data to write
 *   count         - number of `buftype` elements in buf
 *   buftype       - memory datatype
 *   file_ptr_type - ADIO_INDIVIDUAL to start at fd->fp_ind, otherwise the
 *                   explicit `offset` is used
 *   offset        - in units of etype relative to the filetype
 *   status        - filled with the requested byte count on success (only
 *                   when HAVE_STATUS_SET_BYTES is defined)
 *   error_code    - MPI_SUCCESS, or the error from the failing
 *                   ADIO_WriteContig call
 *
 * The case "contiguous in memory and in file" is handled elsewhere.
 *
 * Fixes relative to the previous version:
 *  - A failing ADIO_WriteContig used to `return` immediately, leaving the
 *    atomic-mode byte-range lock held and the flattened buftype
 *    undeleted.  All failures now go through a common cleanup label.
 *  - The remaining-bytes computation cast userbuf_off to `unsigned`,
 *    truncating it once more than UINT_MAX bytes had been consumed.  The
 *    cast is removed; userbuf_off never exceeds bufsize, so results are
 *    unchanged for smaller requests.
 */
void ADIOI_GEN_WriteStrided_naive(ADIO_File fd, const void *buf, int count,
                                  MPI_Datatype buftype, int file_ptr_type,
                                  ADIO_Offset offset, ADIO_Status *status,
                                  int *error_code)
{
    /* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    /* bwr == buffer write; fwr == file write */
    ADIO_Offset bwr_size, fwr_size=0, sum, size_in_filetype;
    int b_index;
    MPI_Count bufsize;
    ADIO_Offset n_etypes_in_filetype;
    ADIO_Offset size, n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0, req_len;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, lb;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, start_off=0;
    ADIO_Status status1;

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(buftype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        /* empty filetype: nothing can be written */
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, buftype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
    MPI_Type_size_x(buftype, &buftype_size);
    MPI_Type_get_extent(buftype, &lb, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* contiguous in buftype and filetype is handled elsewhere */

    if (!buftype_is_contig && filetype_is_contig)
    {
        int b_count;
        /* noncontiguous in memory, contiguous in file. */

        flat_buf = ADIOI_Flatten_and_find(buftype);

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
        {
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        /* for each region in the buffer, grab the data and put it in
         * place
         */
        for (b_count=0; b_count < count; b_count++) {
            for (b_index=0; b_index < flat_buf->count; b_index++) {
                userbuf_off = (ADIO_Offset)b_count*(ADIO_Offset)buftype_extent +
                    flat_buf->indices[b_index];
                req_off = off;
                req_len = flat_buf->blocklens[b_index];

                ADIOI_Assert(req_len == (int) req_len);
                ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
                ADIO_WriteContig(fd,
                                 (char *) buf + userbuf_off,
                                 (int)req_len,
                                 MPI_BYTE,
                                 ADIO_EXPLICIT_OFFSET,
                                 req_off,
                                 &status1,
                                 error_code);
                /* release the lock and flattened type before bailing out */
                if (*error_code != MPI_SUCCESS) goto fn_write_error;

                /* off is (potentially) used to save the final offset later */
                off += flat_buf->blocklens[b_index];
            }
        }

        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
        {
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    }
    else {  /* noncontiguous in file */
        int f_index, st_index = 0;
        ADIO_Offset st_fwr_size, st_n_filetypes;
        int flag;

        /* First we're going to calculate a set of values for use in all
         * the noncontiguous in file cases:
         * start_off - starting byte position of data in file
         * end_offset - last byte offset to be accessed in the file
         * st_n_filetypes - how far into the file we start in terms of
         *                  whole filetypes
         * st_index - index of block in first filetype that we will be
         *            starting in (?)
         * st_fwr_size - size of the data in the first filetype block
         *               that we will write (accounts for being part-way
         *               into writing this block of the filetype)
         */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            start_off = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            /* walk whole filetypes until a block ending at or after the
             * current individual file pointer is found */
            while (!flag) {
                n_filetypes++;
                for (f_index=0; f_index < flat_file->count; f_index++) {
                    if (disp + flat_file->indices[f_index] +
                        n_filetypes*(ADIO_Offset)filetype_extent +
                        flat_file->blocklens[f_index] >= start_off)
                    {
                        /* this block contains our starting position */

                        st_index = f_index;
                        fwr_size = disp + flat_file->indices[f_index] +
                            n_filetypes*(ADIO_Offset)filetype_extent +
                            flat_file->blocklens[f_index] - start_off;
                        flag = 1;
                        break;
                    }
                }
            }
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (f_index=0; f_index < flat_file->count; f_index++) {
                sum += flat_file->blocklens[f_index];
                if (sum > size_in_filetype) {
                    st_index = f_index;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[f_index] +
                        size_in_filetype -
                        (sum - flat_file->blocklens[f_index]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            start_off = disp + n_filetypes*(ADIO_Offset)filetype_extent +
                abs_off_in_filetype;
        }

        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;

        /* start_off, st_n_filetypes, st_index, and st_fwr_size are
         * all calculated at this point
         */

        /* Calculate end_offset, the last byte-offset that will be accessed.
         * e.g., if start_off=0 and 100 bytes to be written, end_offset=99
         */
        userbuf_off = 0;
        f_index = st_index;
        off = start_off;
        fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (userbuf_off < bufsize) {
            userbuf_off += fwr_size;
            end_offset = off + fwr_size - 1;

            if (f_index < (flat_file->count - 1)) f_index++;
            else {
                f_index = 0;
                n_filetypes++;
            }

            off = disp + flat_file->indices[f_index] +
                n_filetypes*(ADIO_Offset)filetype_extent;
            /* clamp to what is left of the request; computed at full
             * width so requests larger than UINT_MAX bytes stay correct */
            fwr_size = ADIOI_MIN(flat_file->blocklens[f_index],
                                 bufsize-userbuf_off);
        }

        /* End of calculations.  At this point the following values have
         * been calculated and are ready for use:
         * - start_off
         * - end_offset
         * - st_n_filetypes
         * - st_index
         * - st_fwr_size
         */

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
        {
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the
             * most common case.
             */

            userbuf_off = 0;
            f_index = st_index;
            off = start_off;
            n_filetypes = st_n_filetypes;
            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);

            /* while there is still space in the buffer, write more data */
            while (userbuf_off < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
                    req_off = off;
                    req_len = fwr_size;

                    ADIOI_Assert(req_len == (int) req_len);
                    ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
                    ADIO_WriteContig(fd,
                                     (char *) buf + userbuf_off,
                                     (int)req_len,
                                     MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET,
                                     req_off,
                                     &status1,
                                     error_code);
                    /* release the lock before bailing out */
                    if (*error_code != MPI_SUCCESS) goto fn_write_error;
                }
                userbuf_off += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[f_index] +
                    flat_file->blocklens[f_index] +
                    n_filetypes*(ADIO_Offset)filetype_extent)
                {
                    /* did not reach end of contiguous block in filetype.
                     * no more I/O needed. off is incremented by fwr_size.
                     *
                     * important that this value be correct, as it is
                     * used to set the offset in the fd near the end of
                     * this function.
                     */
                    off += fwr_size;
                }
                else {
                    /* block exhausted: advance to the next file block,
                     * wrapping into the next filetype instance if needed */
                    if (f_index < (flat_file->count - 1)) f_index++;
                    else {
                        f_index = 0;
                        n_filetypes++;
                    }
                    off = disp + flat_file->indices[f_index] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[f_index],
                                         bufsize-userbuf_off);
                }
            }
        }
        else {
            ADIO_Offset i_offset, tmp_bufsize = 0;
            /* noncontiguous in memory as well as in file */

            flat_buf = ADIOI_Flatten_and_find(buftype);

            b_index = buf_count = 0;
            i_offset = flat_buf->indices[0];
            f_index = st_index;
            off = start_off;
            n_filetypes = st_n_filetypes;
            fwr_size = st_fwr_size;
            bwr_size = flat_buf->blocklens[0];

            /* while we haven't written size * count bytes, keep going */
            while (tmp_bufsize < bufsize) {
                ADIO_Offset new_bwr_size = bwr_size, new_fwr_size = fwr_size;

                /* each step writes the overlap of the current memory
                 * block and the current file block */
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;

                    ADIOI_Assert(req_len == (int) req_len);
                    ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
                    ADIO_WriteContig(fd,
                                     (char *) buf + userbuf_off,
                                     (int)req_len,
                                     MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET,
                                     req_off,
                                     &status1,
                                     error_code);
                    /* release the lock and flattened type before bailing out */
                    if (*error_code != MPI_SUCCESS) goto fn_write_error;
                }

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    if (f_index < (flat_file->count - 1)) f_index++;
                    else {
                        f_index = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[f_index] +
                        n_filetypes*(ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[f_index];
                    if (size != bwr_size) {
                        i_offset += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    b_index = (b_index + 1)%flat_buf->count;
                    buf_count++;
                    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
                        flat_buf->indices[b_index];
                    new_bwr_size = flat_buf->blocklens[b_index];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                tmp_bufsize += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        /* unlock the file region if we locked it */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
        {
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    } /* end of (else noncontiguous in file) */

    fd->fp_sys_posn = -1;   /* mark it as invalid. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, buftype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
     * keep track of how much data was actually written and placed in buf
     */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(buftype);
    return;

fn_write_error:
    /* A contiguous write failed.  Every jump to this label happens after
     * the region [start_off, end_offset] was (conditionally) locked, so
     * undo that lock under the same condition.  *error_code keeps the
     * value set by the failing ADIO_WriteContig; fp_ind, fp_sys_posn and
     * status are deliberately left untouched, as before. */
    if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
    {
        ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
    }
    /* buftype was flattened on every noncontiguous-memory path */
    if (!buftype_is_contig) ADIOI_Delete_flattened(buftype);
}