/* ADIOI_cb_bcast_rank_map() - broadcast the rank array
 *
 * Parameters:
 * fd - ADIO_File for which update is occurring.  cb_nodes and ranklist
 *      must be up-to-date on rank 0 of fd->comm.
 *
 * Rank 0 broadcasts cb_nodes; if it is positive, every other rank allocates
 * a ranklist of that many ints and receives rank 0's copy.  All ranks then
 * record cb_nodes in fd->info under the "cb_nodes" key.
 *
 * Returns MPI_SUCCESS, or an MPI error code when a non-root rank cannot
 * allocate its ranklist.
 */
int ADIOI_cb_bcast_rank_map(ADIO_File fd)
{
    int my_rank;
    /* Stack buffer: the text is only a formatted int, so there is no need
     * for a heap allocation whose failure would have to be handled. */
    char value[MPI_MAX_INFO_VAL + 1];
    int error_code = MPI_SUCCESS;
    static char myname[] = "ADIOI_cb_bcast_rank_map";

    MPI_Bcast(&(fd->hints->cb_nodes), 1, MPI_INT, 0, fd->comm);
    if (fd->hints->cb_nodes > 0) {
        MPI_Comm_rank(fd->comm, &my_rank);
        if (my_rank != 0) {
            fd->hints->ranklist = ADIOI_Malloc(fd->hints->cb_nodes * sizeof(int));
            if (fd->hints->ranklist == NULL) {
                /* NOTE(review): this rank leaves before the ranklist
                 * broadcast below; confirm the other ranks cannot block
                 * waiting for it. */
                error_code = MPIO_Err_create_code(error_code, MPIR_ERR_RECOVERABLE,
                                                  myname, __LINE__, MPI_ERR_OTHER,
                                                  "**nomem2", 0);
                return error_code;
            }
        }
        MPI_Bcast(fd->hints->ranklist, fd->hints->cb_nodes, MPI_INT, 0, fd->comm);
    }

    /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR
     * FS-INDEP. */
    ADIOI_Snprintf(value, sizeof(value), "%d", fd->hints->cb_nodes);
    ADIOI_Info_set(fd->info, "cb_nodes", value);

    return error_code;
}
/* given 'info', incorporate any hints in 'sysinfo' that are not already set
 * into 'new_info'.  Caller must free 'new_info' later.
 *
 * info     - hints supplied by the user; may be MPI_INFO_NULL
 * sysinfo  - system-wide hints; may be MPI_INFO_NULL
 * new_info - out: MPI_INFO_NULL when there is nothing to merge, otherwise a
 *            new info object holding the user's hints plus every system hint
 *            whose key the user did not set
 */
void ADIOI_incorporate_system_hints(MPI_Info info, MPI_Info sysinfo, MPI_Info *new_info)
{
    int i, nkeys_sysinfo, flag;
    /* one extra byte each so a maximum-length key/value still has room for
     * its NUL terminator */
    char val[MPI_MAX_INFO_VAL + 1], key[MPI_MAX_INFO_KEY + 1];

    if (sysinfo == MPI_INFO_NULL)
        nkeys_sysinfo = 0;
    else
        MPI_Info_get_nkeys(sysinfo, &nkeys_sysinfo);

    /* short-circuit: return immediately if no hints to process */
    if (info == MPI_INFO_NULL && nkeys_sysinfo == 0) {
        *new_info = MPI_INFO_NULL;
        return;
    }

    if (info == MPI_INFO_NULL)
        MPI_Info_create(new_info);
    else
        MPI_Info_dup(info, new_info);

    for (i = 0; i < nkeys_sysinfo; i++) {
        MPI_Info_get_nthkey(sysinfo, i, key);

        /* Reset for every key: when the user passed no info object the
         * lookup below is skipped, and flag must then read as "not set"
         * rather than as an uninitialized value. */
        flag = 0;

        /* don't care about the value, just want to know if hint set already */
        if (info != MPI_INFO_NULL)
            ADIOI_Info_get(info, key, 1, val, &flag);
        if (flag == 1)
            continue;   /* skip any hints already set by user */

        ADIOI_Info_get(sysinfo, key, MPI_MAX_INFO_VAL, val, &flag);
        if (!flag)
            continue;   /* no value came back, so val holds nothing to install */
        ADIOI_Info_set(*new_info, key, val);
    }

    return;
}
/* ADIOI_cb_bcast_rank_map() - broadcast the rank array
 *
 * Parameters:
 * fd - ADIO_File for which update is occurring.  cb_nodes and ranklist
 *      must be up-to-date on rank 0 of fd->comm.
 *
 * Rank 0 broadcasts cb_nodes; if it is positive, every other rank allocates
 * a ranklist of that many ints and receives rank 0's copy.  All ranks then
 * record cb_nodes in fd->info under the "cb_nodes" key.
 *
 * Returns MPI_SUCCESS, or an MPI error code when a non-root rank cannot
 * allocate its ranklist.
 */
int ADIOI_cb_bcast_rank_map(ADIO_File fd)
{
    int my_rank;
    int error_code = MPI_SUCCESS;
    /* Stack buffer: the text is only a formatted int, so there is no need
     * for a heap allocation whose failure would have to be handled. */
    char value[MPI_MAX_INFO_VAL + 1];
    static char myname[] = "ADIOI_cb_bcast_rank_map";

    MPI_Bcast(&(fd->hints->cb_nodes), 1, MPI_INT, 0, fd->comm);
    if (fd->hints->cb_nodes > 0) {
        MPI_Comm_rank(fd->comm, &my_rank);
        if (my_rank != 0) {
            fd->hints->ranklist = ADIOI_Malloc(fd->hints->cb_nodes * sizeof(int));
            if (fd->hints->ranklist == NULL) {
                /* Out of memory: report it instead of handing a NULL
                 * receive buffer to MPI_Bcast.
                 * NOTE(review): this rank leaves before the ranklist
                 * broadcast below; confirm the other ranks cannot block
                 * waiting for it. */
                error_code = MPIO_Err_create_code(error_code, MPIR_ERR_RECOVERABLE,
                                                  myname, __LINE__, MPI_ERR_OTHER,
                                                  "**nomem2", 0);
                return error_code;
            }
        }
        MPI_Bcast(fd->hints->ranklist, fd->hints->cb_nodes, MPI_INT, 0, fd->comm);
    }

    /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR
     * FS-INDEP. */
    ADIOI_Snprintf(value, sizeof(value), "%d", fd->hints->cb_nodes);
    ADIOI_Info_set(fd->info, "cb_nodes", value);

    return error_code;
}
/* build_cb_config_list() - turn the cb_config_list hint into a rank map
 *
 * Gathers the processor name array, has rank 0 parse
 * fd->hints->cb_config_list into fd->hints->ranklist / fd->hints->cb_nodes
 * (also recorded under the "cb_nodes" info key), then shares the result via
 * ADIOI_cb_bcast_rank_map().
 *
 * Parameters:
 * fd              - file whose hints are being set up
 * orig_comm, comm - communicators passed to ADIOI_cb_gather_name_array()
 * rank, procs     - caller-supplied rank and process count
 *                   (presumably those of comm -- TODO confirm against caller)
 * error_code      - out: set on allocation failure ("**nomem2"), on a failed
 *                   rank-map broadcast, or when no aggregator matched
 *                   ("**ioagnomatch"); left untouched on success
 *
 * Always returns 0; failures are reported through *error_code only.
 */
static int build_cb_config_list(ADIO_File fd, MPI_Comm orig_comm, MPI_Comm comm,
                                int rank, int procs, int *error_code)
{
    ADIO_cb_name_array array;
    int *tmp_ranklist;
    int rank_ct;
    int bcast_err;
    /* the text is only a formatted int, so a stack buffer avoids an
     * allocation that could fail */
    char value[MPI_MAX_INFO_VAL + 1];
    static char myname[] = "ADIO_OPEN cb_config_list";

    /* gather the processor name array if we don't already have it */
    /* this has to be done early in ADIO_Open so that we can cache the name
     * array in both the dup'd communicator (in case we want it later) and the
     * original communicator */
    ADIOI_cb_gather_name_array(orig_comm, comm, &array);

    /* parse the cb_config_list and create a rank map on rank 0 */
    if (rank == 0) {
        tmp_ranklist = (int *) ADIOI_Malloc(sizeof(int) * procs);
        if (tmp_ranklist == NULL) {
            /* NOTE(review): rank 0 returns here without entering
             * ADIOI_cb_bcast_rank_map(), which the other ranks still call;
             * confirm they cannot block waiting for it. */
            *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_OTHER,
                                               "**nomem2", 0);
            return 0;
        }

        rank_ct = ADIOI_cb_config_list_parse(fd->hints->cb_config_list, array,
                                             tmp_ranklist, fd->hints->cb_nodes);

        /* store the ranklist using the minimum amount of memory */
        if (rank_ct > 0) {
            fd->hints->ranklist = (int *) ADIOI_Malloc(sizeof(int) * rank_ct);
            if (fd->hints->ranklist == NULL) {
                /* same handling as the scratch-list failure above, and do
                 * not leak the scratch list */
                ADIOI_Free(tmp_ranklist);
                *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_OTHER,
                                                   "**nomem2", 0);
                return 0;
            }
            memcpy(fd->hints->ranklist, tmp_ranklist, sizeof(int) * rank_ct);
        }
        ADIOI_Free(tmp_ranklist);
        fd->hints->cb_nodes = rank_ct;

        /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR FS-INDEP. */
        MPL_snprintf(value, sizeof(value), "%d", rank_ct);
        ADIOI_Info_set(fd->info, "cb_nodes", value);
    }

    /* A failed broadcast leaves this rank without a usable ranklist even
     * though cb_nodes is positive, so surface the error to the caller. */
    bcast_err = ADIOI_cb_bcast_rank_map(fd);
    if (bcast_err != MPI_SUCCESS) {
        *error_code = bcast_err;
        return 0;
    }

    if (fd->hints->cb_nodes <= 0) {
        /* no process matched the cb_config_list */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**ioagnomatch", 0);
    }
    return 0;
}
/* parse the file-of-hints.  Format is zero or more lines of "<key> <value>\n".
 * A # in column zero is a comment and the line will be ignored.  Do our best
 * to ignore badly formed lines too.
 *
 * The caller provides an 'info' object.  Each key-value pair found by the
 * parser will get added to the info object.  Any keys already set will be left
 * alone on the assumption that the caller knows best.
 *
 * Because MPI-IO hints are optional, we can get away with limited error
 * reporting.
 *
 * fd   - open file descriptor of the hints file
 * info - info object to which parsed hints are added
 *
 * Returns 0 on success (including an empty file), -1 if the file cannot be
 * stat'ed, buffered or read.
 */
static int file_to_info(int fd, MPI_Info info)
{
    char *buffer, *token, *key, *val;
    char *pos1 = NULL, *pos2 = NULL;
    int flag, ret;
    /* receives at most one character plus the NUL terminator the info
     * lookup appends, hence two bytes */
    char dummy[2];
    struct stat statbuf;

    /* assumption: config files will be small (less than 1MB) */
    if (fstat(fd, &statbuf) != 0)
        return -1;

    /* add 1 to size to make room for NULL termination */
    buffer = (char *) ADIOI_Calloc(statbuf.st_size + 1, sizeof(char));
    if (buffer == NULL)
        return -1;

    ret = read(fd, buffer, statbuf.st_size);
    if (ret < 0) {
        ADIOI_Free(buffer);     /* don't leak the buffer on a failed read */
        return -1;
    }

    /* A for loop rather than do/while: an empty (or newline-only) file
     * yields no first token, and the body must not run with token == NULL. */
    for (token = strtok_r(buffer, "\n", &pos1);
         token != NULL;
         token = strtok_r(NULL, "\n", &pos1)) {

        if (token[0] == '#')
            /* ignore '#'-delimited comments */
            continue;

        if ((key = strtok_r(token, " \t", &pos2)) == NULL)
            /* malformed line: found no items */
            continue;

        if ((val = strtok_r(NULL, " \t", &pos2)) == NULL)
            /* malformed line: found key without value */
            continue;

        if (strtok_r(NULL, " \t", &pos2) != NULL)
            /* malformed line: more than two items */
            continue;

#ifdef SYSHINT_DEBUG
        printf("found: key=%s val=%s\n", key, val);
#endif
        /* don't actually care what the value is.  only want to know if key
         * exists: we leave it alone if so */
        ADIOI_Info_get(info, key, 1, dummy, &flag);
        if (flag == 1)
            continue;
        ADIOI_Info_set(info, key, val);
    }

    ADIOI_Free(buffer);
    return 0;
}
/* Compute evenly sized, alignment-adjusted file realms for collective I/O.
 *
 * Takes an extra romio_cb_pfr param to decide whether file realms
 * should start at byte 0 of the file.
 *
 * fd                 - file; cb_fr_alignment and cb_pfr hints are read from
 *                      it, and "romio_cb_fr_type" is written to fd->info
 *                      when fd->hints->cb_pfr is enabled
 * nprocs_for_coll    - number of realms to produce (must be non-zero; it is
 *                      used as a divisor)
 * cb_pfr             - when ADIOI_HINT_ENABLE, realm 0 starts at offset 0
 *                      instead of the aligned start offset
 * min_st_offset,
 * max_end_offset     - byte range to be divided among the realms
 * file_realm_st_offs - out: start offset of each realm
 * file_realm_types   - out: datatype of each realm (all entries share one
 *                      type)
 */
void ADIOI_Calc_file_realms_aar (ADIO_File fd, int nprocs_for_coll, int cb_pfr,
                                 ADIO_Offset min_st_offset,
                                 ADIO_Offset max_end_offset,
                                 ADIO_Offset *file_realm_st_offs,
                                 MPI_Datatype *file_realm_types)
{
    int fr_size, aligned_fr_size, i;
    MPI_Datatype simpletype;
    ADIO_Offset aligned_start_off;
    /* room for any int in decimal: sign, digits and the terminating NUL */
    char value[3 * sizeof(int) + 2];

    /* realm size: the byte range divided by the number of realms, with the
     * numerator padded by nprocs_for_coll before the division */
    fr_size = (max_end_offset - min_st_offset + nprocs_for_coll) /
        nprocs_for_coll;
    align_fr(fr_size, min_st_offset, fd->hints->cb_fr_alignment,
             &aligned_fr_size, &aligned_start_off);
    fr_size = aligned_fr_size;
    ADIOI_Create_fr_simpletype (fr_size, nprocs_for_coll, &simpletype);

    if (cb_pfr == ADIOI_HINT_ENABLE)
        file_realm_st_offs[0] = 0;
    else
        file_realm_st_offs[0] = aligned_start_off;
    file_realm_types[0] = simpletype;
#ifdef DEBUG
    printf ("file_realm[0] = (%lld, %d)\n",
            (long long) file_realm_st_offs[0], fr_size);
#endif

    /* every following realm starts where the previous one ends */
    for (i = 1; i < nprocs_for_coll; i++) {
        file_realm_st_offs[i] = file_realm_st_offs[i-1] + fr_size;
        file_realm_types[i] = simpletype;
#ifdef DEBUG
        printf ("file_realm[%d] = (%lld, %d)\n", i,
                (long long) file_realm_st_offs[i], fr_size);
#endif
    }

    /* NOTE(review): this test reads fd->hints->cb_pfr while the start-offset
     * decision above uses the cb_pfr argument -- confirm that is intended. */
    if (fd->hints->cb_pfr == ADIOI_HINT_ENABLE) {
        snprintf (value, sizeof value, "%d", fr_size);
        ADIOI_Info_set (fd->info, "romio_cb_fr_type", value);
    }
}
/* ADIOI_GRIDFTP_SetInfo() - install user hints for a GridFTP file
 *
 * When fd has no info object yet, fd->info becomes a fresh info object (no
 * user hints) or a duplicate of users_info.  Otherwise every key/value pair
 * in users_info is copied into the existing fd->info.  In both cases the
 * generic hint processing in ADIOI_GEN_SetInfo() runs afterwards and sets
 * *error_code.
 */
void ADIOI_GRIDFTP_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    /* "No info object yet" is MPI_INFO_NULL, the null handle; a zero handle
     * is still accepted as before. */
    if (!(fd->info) || fd->info == MPI_INFO_NULL) {
        if (users_info == MPI_INFO_NULL) {
            /* This must be part of the open call. */
            MPI_Info_create(&(fd->info));
        } else {
            MPI_Info_dup(users_info, &(fd->info));
        }
    } else {
        int i, nkeys, valuelen, flag;
        /* one extra byte each so a maximum-length key/value still has room
         * for its NUL terminator */
        char key[MPI_MAX_INFO_KEY + 1], value[MPI_MAX_INFO_VAL + 1];

        if (users_info != MPI_INFO_NULL) {
            MPI_Info_get_nkeys(users_info, &nkeys);
            for (i = 0; i < nkeys; i++) {
                MPI_Info_get_nthkey(users_info, i, key);
                MPI_Info_get_valuelen(users_info, key, &valuelen, &flag);
                if (flag) {
                    ADIOI_Info_get(users_info, key, valuelen, value, &flag);
                    if (flag)
                        ADIOI_Info_set(fd->info, key, value);
                }
            }
        }
    }

    /* let the generic ROMIO and MPI-I/O stuff happen... */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);
}
/* ADIOI_GEN_OpenColl() - collective open through the file-system driver
 *
 * fd          - file being opened; its fns table supplies the driver's
 *               ADIOI_xxx_Open / ADIOI_xxx_Close
 * rank        - calling process's rank, compared against
 *               fd->hints->ranklist[0]
 * access_mode - ADIO access-mode flags requested for the open
 * error_code  - out: result of the open
 *
 * Sequence:
 *  1. With ADIO_CREATE set, the process whose rank equals ranklist[0]
 *     opens and closes the file on MPI_COMM_SELF and broadcasts the result;
 *     everyone returns on error, otherwise CREATE/EXCL are cleared for the
 *     open that follows.
 *  2. With deferred open, processes for which fd->is_agg is false receive
 *     the stats broadcast, publish striping hints in fd->info and return
 *     without calling the driver's open.
 *  3. Everyone else opens the file (retrying once with the pre-upgrade mode
 *     on failure), takes part in the stats broadcast and sets fd->is_open.
 */
void ADIOI_GEN_OpenColl(ADIO_File fd, int rank, int access_mode, int *error_code)
{
    int orig_amode_excl, orig_amode_wronly;
    MPI_Comm tmp_comm;
    MPI_Datatype stats_type;    /* deferred open: some processes might not
                                   open the file, so we'll exchange some
                                   information with those non-aggregators */

    /* remember the caller's mode so it can be restored at the end */
    orig_amode_excl = access_mode;

    if (access_mode & ADIO_CREATE ){
        if(rank == fd->hints->ranklist[0]) {
            /* remove delete_on_close flag if set */
            if (access_mode & ADIO_DELETE_ON_CLOSE)
                fd->access_mode = access_mode ^ ADIO_DELETE_ON_CLOSE;
            else
                fd->access_mode = access_mode;

            /* open with the communicator temporarily swapped to
             * MPI_COMM_SELF, then put the real one back */
            tmp_comm = fd->comm;
            fd->comm = MPI_COMM_SELF;
            (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);
            fd->comm = tmp_comm;
            MPI_Bcast(error_code, 1, MPI_INT, \
                      fd->hints->ranklist[0], fd->comm);
            /* if no error, close the file and reopen normally below */
            if (*error_code == MPI_SUCCESS)
                (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);

            fd->access_mode = access_mode; /* back to original */
        }
        /* all other processes just receive the creator's result */
        else MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);

        if (*error_code != MPI_SUCCESS) {
            return;
        }
        else {
            /* turn off CREAT (and EXCL if set) for real multi-processor open */
            access_mode ^= ADIO_CREATE;
            if (access_mode & ADIO_EXCL)
                access_mode ^= ADIO_EXCL;
        }
    }
    fd->blksize = 1024*1024*4; /* this large default value should be good for
                                  most file systems.  any ROMIO driver is free
                                  to stat the file and find an optimal value */

    /* if we are doing deferred open, non-aggregators should return now */
    if (fd->hints->deferred_open ) {
        if (!(fd->is_agg)) {
            char value[MPI_MAX_INFO_VAL+1];
            /* we might have turned off EXCL for the aggregators.
             * restore access_mode so that non-aggregators get the right
             * value from get_amode */
            fd->access_mode = orig_amode_excl;
            /* In file-system specific open, a driver might collect some
             * information via stat().  Deferred open means not every process
             * participates in fs-specific open, but they all participate in
             * this open call.  Broadcast a bit of information in case
             * lower-level file system driver (e.g. 'bluegene') collected it
             * (not all do) */
            stats_type = make_stats_type(fd);
            MPI_Bcast(MPI_BOTTOM, 1, stats_type, fd->hints->ranklist[0], fd->comm);
            ADIOI_Assert(fd->blksize > 0);
            /* some file systems (e.g. lustre) will inform the user via the
             * info object about the file configuration.  deferred open,
             * though, skips that step for non-aggregators.  we do the
             * info-setting here */
            sprintf(value, "%d", fd->hints->striping_unit);
            ADIOI_Info_set(fd->info, "striping_unit", value);

            sprintf(value, "%d", fd->hints->striping_factor);
            ADIOI_Info_set(fd->info, "striping_factor", value);

            sprintf(value, "%d", fd->hints->start_iodevice);
            ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

            *error_code = MPI_SUCCESS;
            MPI_Type_free(&stats_type);
            return;
        }
    }

    /* For writing with data sieving, a read-modify-write is needed.  If
       the file is opened for write_only, the read will fail.  Therefore,
       if write_only, open the file as read_write, but record it as
       write_only in fd, so that get_amode returns the right answer. */

    /* observation from David Knaak: file systems that do not support data
     * sieving do not need to change the mode */

    orig_amode_wronly = access_mode;
    if ( (access_mode & ADIO_WRONLY) && ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) ) {
        access_mode = access_mode ^ ADIO_WRONLY;
        access_mode = access_mode | ADIO_RDWR;
    }
    fd->access_mode = access_mode;

    (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);

    /* if error, may be it was due to the change in amode above.
       therefore, reopen with access mode provided by the user. */
    fd->access_mode = orig_amode_wronly;
    if (*error_code != MPI_SUCCESS)
        (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);

    /* if we turned off EXCL earlier, then we should turn it back on */
    if (fd->access_mode != orig_amode_excl) fd->access_mode = orig_amode_excl;

    /* broadcast information to all processes in
     * communicator, not just those who participated in open */
    stats_type = make_stats_type(fd);
    MPI_Bcast(MPI_BOTTOM, 1, stats_type, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&stats_type);
    /* file domain code will get terribly confused in a hard-to-debug way if
     * gpfs blocksize not sensible */
    ADIOI_Assert( fd->blksize > 0);
    /* for deferred open: this process has opened the file (because if we are
     * not an aggregator and we are doing deferred open, we returned earlier).
     * NOTE(review): is_open is set without re-checking *error_code -- confirm
     * callers test the error code first. */
    fd->is_open = 1;
}
/* Process one PanFS hint inside ADIOI_PANFS_SetInfo().
 *
 * Looks `hint_key` up in users_info.  When `wanted` (an expression that may
 * read `flag`, evaluated after the lookup) is true, the text is parsed with
 * strtoul into `dest`, rank 0's value is broadcast over fd->comm, the job is
 * aborted if the local value differs from it, and the hint is copied into
 * fd->info.
 *
 * `hint_key` must be a string literal: it is pasted into the diagnostic
 * message.  The macro relies on the locals fd, users_info, value, flag and
 * tmp_val of the enclosing function.
 */
#define PANFS_SYNC_HINT(hint_key, wanted, dest)                                \
    do {                                                                       \
        ADIOI_Info_get(users_info, hint_key, MPI_MAX_INFO_VAL, value, &flag);  \
        if (wanted) {                                                          \
            (dest) = strtoul(value,NULL,10);                                   \
            tmp_val = (dest);                                                  \
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);                      \
            if (tmp_val != (dest)) {                                           \
                FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \""    \
                        hint_key "\" must be the same on all processes\n");    \
                MPI_Abort(MPI_COMM_WORLD, 1);                                  \
            }                                                                  \
            ADIOI_Info_set(fd->info, hint_key, value);                         \
        }                                                                      \
    } while (0)

/* ADIOI_PANFS_SetInfo() - install PanFS-specific hints, then generic ones
 *
 * When fd has no info object yet, one is created and the panfs_* hints found
 * in users_info are checked for agreement with rank 0 and copied into
 * fd->info.  ADIOI_GEN_SetInfo() then handles the generic hints; its error
 * code becomes *error_code.
 */
void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    static char myname[] = "ADIOI_PANFS_SETINFO";
    char* value;
    int flag, tmp_val = -1;
    int gen_error_code;
    unsigned long int concurrent_write = 0;
    unsigned long int layout_stripe_unit = 0;
    unsigned long int layout_parity_stripe_width = 0;
    unsigned long int layout_parity_stripe_depth = 0;
    unsigned long int layout_total_num_comps = 0;
    pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
    pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;

    *error_code = MPI_SUCCESS;

    if (fd->info == MPI_INFO_NULL) {
        /* This must be part of the open call. can set striping parameters
         * if necessary. */
        MPI_Info_create(&(fd->info));

        /* has user specified striping parameters
         * and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            PANFS_SYNC_HINT("panfs_concurrent_write", flag, concurrent_write);
            PANFS_SYNC_HINT("panfs_layout_type", flag, layout_type);
            PANFS_SYNC_HINT("panfs_layout_stripe_unit", flag, layout_stripe_unit);

            /* the parity-stripe hints only count for the RAID1/5 layout */
            PANFS_SYNC_HINT("panfs_layout_parity_stripe_width",
                            flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE),
                            layout_parity_stripe_width);
            PANFS_SYNC_HINT("panfs_layout_parity_stripe_depth",
                            flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE),
                            layout_parity_stripe_depth);

            PANFS_SYNC_HINT("panfs_layout_total_num_comps", flag, layout_total_num_comps);

            /* the visit policy counts for the RAID1/5 and RAID10 layouts */
            PANFS_SYNC_HINT("panfs_layout_visit_policy",
                            flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10),
                            layout_visit_policy);

            ADIOI_Free(value);
        }
    }

    ADIOI_GEN_SetInfo(fd, users_info, &gen_error_code);

    /* If this function is successful, use the error code returned from
     * ADIOI_GEN_SetInfo, otherwise use the error_code generated by this
     * function */
    if (*error_code == MPI_SUCCESS) {
        *error_code = gen_error_code;
    }
}

#undef PANFS_SYNC_HINT
/* ADIOI_GEN_SetInfo() - generic (file-system independent) hint processing
 *
 * fd         - file whose fd->info and fd->hints are updated
 * users_info - hints supplied by the user, or MPI_INFO_NULL
 * error_code - out: MPI_SUCCESS, or a "**nomem2" error when an allocation
 *              fails
 *
 * Three phases: (1) the first time through, fill fd->info and fd->hints with
 * defaults; (2) install the user's hints over them; (3) post-process hints
 * that depend on or conflict with one another.
 */
void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
   Initialize fd->info to default values.
   Initialize fd->hints to default values.
   Examine the info object passed by the user. If it contains values that
   ROMIO understands, override the default. */

    MPI_Info info;
    char *value;                        /* scratch buffer for info values */
    int flag, nprocs = 0, len;
    int ok_to_override_cb_nodes = 0;    /* set only when defaults are installed below */
    static char myname[] = "ADIOI_GEN_SETINFO";

    /* if we've already set up default hints and the user has not asked us to
     * process any hints (MPI_INFO_NULL), then we can short-circuit hint
     * processing.
     * NOTE(review): the test below looks at fd->info, not users_info --
     * confirm which one the sentence above is meant to describe. */
    if (fd->hints->initialized && fd->info == MPI_INFO_NULL) {
        *error_code = MPI_SUCCESS;
        return;
    }

    ad_get_env_vars();

    if (fd->info == MPI_INFO_NULL)
        MPI_Info_create(&(fd->info));
    info = fd->info;

    MPI_Comm_size(fd->comm, &nprocs);

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    if (value == NULL) {
        *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2", 0);
        return;
    }

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_CB_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use
         * collective buffering
         */
        ADIOI_Info_set(info, "romio_cb_read", "automatic");
        fd->hints->cb_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_cb_write", "automatic");
        fd->hints->cb_write = ADIOI_HINT_AUTO;

        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = nprocs;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms.
         * NOTE(review): the default is stored under "romio_cb_fr_types" while
         * the user's hint is read below as "romio_cb_fr_type" -- confirm the
         * differing key names are intended. */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_IND_WR_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use data
         * sieving
         */
        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;

        /* ADIO_Open sets up collective buffering arrays.  If we are in this
         * path from say set_file_view, then we don't want to adjust the
         * array: we'll get a segfault during collective i/o.  We only want to
         * look at the user's cb_nodes if it's open time */
        ok_to_override_cb_nodes = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size", &(fd->hints->cb_buffer_size), myname, error_code);

        /* aligning file realms to certain sizes (e.g. stripe sizes)
         * may benefit I/O performance */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_alignment", &(fd->hints->cb_fr_alignment), myname, error_code);

        /* for collective I/O, try to be smarter about when to do data sieving
         * using a specific threshold for the datatype size/extent
         * (percentage 0-100%) */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_ds_threshold", &(fd->hints->cb_ds_threshold), myname, error_code);

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_alltoall", &(fd->hints->cb_alltoall), myname, error_code);

        /* new hints for enabling/disabling coll. buffering on
         * reads/writes
         */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read", &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write", &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }

        /* enable/disable persistent file realms for collective I/O */
        /* may want to check for no_indep_rdwr hint as well */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_pfr", &(fd->hints->cb_pfr), myname, error_code);

        /* file realm assignment types ADIOI_FR_AAR(0),
         * ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
         * a regular fr size in bytes. probably not the best way... */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_cb_fr_type", &(fd->hints->cb_fr_type), myname, error_code);

        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw", &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* new hints for enabling/disabling data sieving on
         * reads/writes
         */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read", &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write", &(fd->hints->ds_write), myname, error_code);

        if (ok_to_override_cb_nodes) {
            /* MPI_File_open path sets up some data structures that don't
             * get resized in the MPI_File_set_view path, so ignore
             * cb_nodes in the set_view case */
            ADIOI_Info_check_and_install_int(fd, users_info, "cb_nodes", &(fd->hints->cb_nodes), myname, error_code);
            if ((fd->hints->cb_nodes <= 0) || (fd->hints->cb_nodes > nprocs)) {
                /* can't ask for more aggregators than mpi processes, though it
                 * might be interesting to think what such oversubscription
                 * might mean... someday */
                MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
                ADIOI_Info_set(info, "cb_nodes", value);
                fd->hints->cb_nodes = nprocs;
            }
        } /* if (ok_to_override_cb_nodes) */

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size", &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size", &(fd->hints->ind_rd_buffer_size), myname, error_code);

        if (fd->hints->cb_config_list == NULL) {
            /* only set cb_config_list if it isn't already set.  Note that
             * since we set it below, this ensures that the cb_config_list hint
             * will be set at file open time either by the user or to the
             * default */
            /* if it has been set already, we ignore it the second time.
             * otherwise we would get an error if someone used the same info
             * value with a cb_config_list value in it in a couple of calls,
             * which would be irritating. */
            ADIOI_Info_check_and_install_str(fd, users_info, "cb_config_list", &(fd->hints->cb_config_list), myname, error_code);

        }
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_min_fdomain_size", &(fd->hints->min_fdomain_size), myname, error_code);

        /* Now we use striping unit in common code so we should
         * process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit", &(fd->hints->striping_unit), myname, error_code);
    }

    /* Begin hint post-processing: some hints take precedence over or conflict
     * with others, or aren't supported by some file systems */

    /* handle cb_config_list default value here; avoids an extra
     * free/alloc and ensures it is always set
     */
    if (fd->hints->cb_config_list == NULL) {
        ADIOI_Info_set(info, "cb_config_list", ADIOI_CB_CONFIG_LIST_DFLT);
        len = (strlen(ADIOI_CB_CONFIG_LIST_DFLT) + 1) * sizeof(char);
        fd->hints->cb_config_list = ADIOI_Malloc(len);
        if (fd->hints->cb_config_list == NULL) {
            ADIOI_Free(value);
            *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2", 0);
            return;
        }
        ADIOI_Strncpy(fd->hints->cb_config_list, ADIOI_CB_CONFIG_LIST_DFLT, len);
    }
    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if (((fd->hints->cb_read != ADIOI_HINT_DISABLE) && (fd->hints->cb_write != ADIOI_HINT_DISABLE) && fd->hints->no_indep_rw)) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not
         * support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003
         * (only the info key is deleted above; fd->hints->ind_wr_buffer_size
         * is not modified here)
         */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    /* NOTE(review): this unconditionally replaces whatever the
     * check_and_install calls above stored in *error_code -- confirm that is
     * intended. */
    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_PVFS2_SetInfo - install PVFS2-specific hints, then the generic ones.
 *
 * Parameters:
 * fd         - file whose fd->info / fd->hints are updated
 * users_info - hints supplied by the caller, or MPI_INFO_NULL
 * error_code - set to MPI_SUCCESS on normal return; set by
 *              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME when a hint value on this
 *              rank differs from the value broadcast by rank 0
 *
 * The PVFS2 hints are only processed when fd->info is still MPI_INFO_NULL
 * (i.e. during the open call).  Each recognised key is compared against
 * rank 0's value with an MPI_Bcast on fd->comm.  On a mismatch the scratch
 * buffer is released before returning (the original code leaked it).
 *
 * Note that the error code produced by ADIOI_GEN_SetInfo is overwritten with
 * MPI_SUCCESS at the end of this routine.
 */
void ADIOI_PVFS2_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value;
    int flag, tmp_value;
    static char myname[] = "ADIOI_PVFS_SETINFO";

    if ((fd->info) == MPI_INFO_NULL) {
        /* part of the open call */
        MPI_Info_create(&(fd->info));
        ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", "0");
        fd->hints->fs_hints.pvfs2.debugmask = 0;

        ADIOI_Info_set(fd->info, "striping_factor", "0");
        fd->hints->striping_factor = 0;

        ADIOI_Info_set(fd->info, "striping_unit", "0");
        fd->hints->striping_unit = 0;

        /* disable the aggressive strided optimizations by default */
        ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", "disable");
        ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", "disable");
        fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
        fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;

        ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", "disable");
        ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", "disable");
        fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
        fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;

        ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", "disable");
        ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", "disable");
        fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
        fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;

        /* any user-provided hints? */
        if (users_info != MPI_INFO_NULL) {
            /* scratch buffer shared by all lookups below; every exit path
             * out of this block must release it */
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            /* pvfs2 debugging */
            ADIOI_Info_get(users_info, "romio_pvfs2_debugmask",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                tmp_value = fd->hints->fs_hints.pvfs2.debugmask =
                    PVFS_debug_eventlog_to_mask(value);

                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_value != fd->hints->fs_hints.pvfs2.debugmask) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "romio_pvfs2_debugmask",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */

                ADIOI_Info_set(fd->info, "romio_pvfs2_debugmask", value);
            }

            /* the striping factor */
            ADIOI_Info_get(users_info, "striping_factor",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                tmp_value = fd->hints->striping_factor = atoi(value);

                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_value != fd->hints->striping_factor) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "striping_factor",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */

                ADIOI_Info_set(fd->info, "striping_factor", value);
            }

            /* the striping unit */
            ADIOI_Info_get(users_info, "striping_unit",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                tmp_value = fd->hints->striping_unit = atoi(value);

                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_value != fd->hints->striping_unit) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "striping_unit",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */

                ADIOI_Info_set(fd->info, "striping_unit", value);
            }

            /* distribution name: looked up, but nothing is done with it */
            ADIOI_Info_get(users_info, "romio_pvfs2_distribution_name",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
            }

            /* POSIX read */
            ADIOI_Info_get(users_info, "romio_pvfs2_posix_read",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", value);
                    fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_posix_read", value);
                    fd->hints->fs_hints.pvfs2.posix_read = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.posix_read;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.posix_read) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "posix_read",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            /* POSIX write */
            ADIOI_Info_get(users_info, "romio_pvfs2_posix_write",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", value);
                    fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_posix_write", value);
                    fd->hints->fs_hints.pvfs2.posix_write = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.posix_write;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.posix_write) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "posix_write",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            /* Datatype read */
            ADIOI_Info_get(users_info, "romio_pvfs2_dtype_read",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", value);
                    fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_read", value);
                    fd->hints->fs_hints.pvfs2.dtype_read = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.dtype_read;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_read) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "dtype_read",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            /* Datatype write */
            ADIOI_Info_get(users_info, "romio_pvfs2_dtype_write",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", value);
                    fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_dtype_write", value);
                    fd->hints->fs_hints.pvfs2.dtype_write = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.dtype_write;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.dtype_write) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "dtype_write",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            /* Listio read */
            ADIOI_Info_get(users_info, "romio_pvfs2_listio_read",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", value);
                    fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_listio_read", value);
                    fd->hints->fs_hints.pvfs2.listio_read = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.listio_read;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.listio_read) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "listio_read",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            /* Listio write */
            ADIOI_Info_get(users_info, "romio_pvfs2_listio_write",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", value);
                    fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_ENABLE;
                }
                else if (!strcmp(value, "disable") ||
                         !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs2_listio_write", value);
                    fd->hints->fs_hints.pvfs2.listio_write = ADIOI_HINT_DISABLE;
                }
                tmp_value = fd->hints->fs_hints.pvfs2.listio_write;
                MPI_Bcast(&tmp_value, 1, MPI_INT, 0, fd->comm);
                if (tmp_value != fd->hints->fs_hints.pvfs2.listio_write) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "listio_write",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
            }

            ADIOI_Free(value);
        }
    }

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_BEEGFS_SetInfo - process BeeGFS striping hints, then generic hints.
 *
 * Parameters:
 * fd         - file whose hints are being set
 * users_info - hints supplied by the caller, or MPI_INFO_NULL
 * error_code - initialised to MPI_SUCCESS, then handed to
 *              ADIOI_GEN_SetInfo, whose result is what the caller sees
 *
 * When fd->info is still MPI_INFO_NULL the info object is created with
 * "striping_unit" and "striping_factor" set to "0".  If the user supplied
 * striping hints, all ranks compare them against rank 0's values (mismatch
 * aborts MPI_COMM_WORLD).  Rank 0 then issues the
 * BEEGFS_IOC_MKFILE_STRIPEHINTS ioctl on the file's parent directory when
 * ADIO_CREATE is requested and both values are non-zero.
 *
 * Changes from the previous version:
 *  - the ioctl and close are skipped when the parent directory could not be
 *    opened, instead of being issued on descriptor -1;
 *  - a file directly under "/" now uses "/" as its parent instead of the
 *    empty string;
 *  - errno is captured before printing so the EEXIST test sees the ioctl's
 *    value.
 */
void ADIOI_BEEGFS_SetInfo( ADIO_File fd, MPI_Info users_info, int *error_code )
{
    char *value, *pathname, *dname, *slash;
    const char *parent;
    int flag, stripe_val[2], numtargets = 0, chunksize = 0;
    struct BeegfsIoctl_MkFileWithStripeHints_Arg createFileArg;
    int err, myrank, fd_pdir, perm, old_mask, saved_errno;
    static char myname[] = "ADIOI_BEEGFS_SETINFO";

    /* set error code to success */
    *error_code = MPI_SUCCESS;

    value = ( char * )ADIOI_Malloc( ( MPI_MAX_INFO_VAL + 1 ) * sizeof( char ) );

    MPI_Comm_rank( fd->comm, &myrank );

    /* set hints */
    if( ( fd->info ) == MPI_INFO_NULL ) {
        MPI_Info_create( &( fd->info ) );
        ADIOI_Info_set( fd->info, "striping_unit", "0" );
        ADIOI_Info_set( fd->info, "striping_factor", "0" );

        /* set users infos */
        if( users_info != MPI_INFO_NULL ) {
            /* striping information */
            ADIOI_Info_get( users_info, "striping_unit", MPI_MAX_INFO_VAL,
                            value, &flag );
            if( flag )
                chunksize = atoi( value );

            ADIOI_Info_get( users_info, "striping_factor", MPI_MAX_INFO_VAL,
                            value, &flag );
            if( flag )
                numtargets = atoi( value );

            /* check stripe info consistency: non-root ranks receive rank 0's
             * values and compare them with their own */
            if( myrank == 0 ) {
                stripe_val[0] = numtargets;
                stripe_val[1] = chunksize;
            }
            MPI_Bcast( stripe_val, 2, MPI_INT, 0, fd->comm );

            if( stripe_val[0] != numtargets || stripe_val[1] != chunksize ) {
                FPRINTF( stderr, "ADIOI_BEEGFS_SetInfo: All keys"
                         "-striping_factor:striping_unit "
                         "need to be identical across all processes\n" );
                MPI_Abort( MPI_COMM_WORLD, 1 );
            }

            /* if user has specified striping info, process 0 tries to set it */
            if( myrank == 0 && ( fd->access_mode & ADIO_CREATE ) &&
                numtargets && chunksize ) {
                /* open the parent dir to get/set striping info */
                pathname = ADIOI_Strdup( fd->filename );
                dname = strrchr( pathname, '/' );
                if( dname == NULL ) {
                    /* current dir relative path */
                    parent = ".";
                } else if( dname == pathname ) {
                    /* file lives directly under the root directory */
                    parent = "/";
                } else {
                    *dname = '\0'; /* replace / with nul-character */
                    parent = pathname;
                }

                fd_pdir = open( parent, O_RDONLY );
                if( fd_pdir == -1 ) {
                    FPRINTF( stderr, "Error opening %s: %s\n", parent,
                             strerror( errno ) );
                }
                /* parent may point into pathname, so free only after use */
                ADIOI_Free( pathname );

                if( fd_pdir != -1 ) {
                    if( fd->perm == ADIO_PERM_NULL ) {
                        old_mask = umask( 022 );
                        umask( old_mask );
                        perm = old_mask ^ 0666;
                    } else
                        perm = fd->perm;

                    /* the ioctl takes the file name relative to the parent */
                    slash = strrchr( fd->filename, '/' );
                    if( slash != NULL )
                        slash += 1;
                    else
                        slash = fd->filename;

                    createFileArg.filename = slash;
                    createFileArg.mode = perm;
                    createFileArg.numtargets = numtargets;
                    createFileArg.chunksize = chunksize;

                    /* create the hint file */
                    err = ioctl( fd_pdir, BEEGFS_IOC_MKFILE_STRIPEHINTS,
                                 &createFileArg );
                    if( err ) {
                        saved_errno = errno;
                        FPRINTF( stderr, "BEEGFS_IOC_MKFILE_STRIPEHINTS: %s. ",
                                 strerror( saved_errno ) );
                        if( saved_errno == EEXIST ) {
                            /* ignore user striping and use current file info */
                            FPRINTF( stderr,
                                     "[rank:%d] Failure to set stripe info for %s!\n",
                                     myrank, fd->filename );
                        }
                    }

                    /* close the parent dir file descriptor */
                    close( fd_pdir );
                }
            } /* End of striping parameters validation */
        }
        MPI_Barrier( fd->comm );
    }

    /* set rest of the MPI hints (including E10 hints) */
    ADIOI_GEN_SetInfo( fd, users_info, error_code );

    ADIOI_Free( value );
}
/*
 * ADIOI_XFS_SetInfo - set the XFS direct-I/O hints, then the generic hints.
 *
 * Parameters:
 * fd         - file whose info object and direct_read/direct_write flags
 *              are updated
 * users_info - hints supplied by the caller, or MPI_INFO_NULL
 * error_code - always MPI_SUCCESS on return
 *
 * On the first call in the process, the chunk-size globals are seeded from
 * MPIO_DIRECT_READ_CHUNK_SIZE and MPIO_DIRECT_WRITE_CHUNK_SIZE: an unset
 * variable yields 0, a non-positive value is reported on stderr and ignored.
 */
void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    static char xfs_initialized = 0;
    char *value;
    char *env_setting;
    int flag;

    if (fd->info == MPI_INFO_NULL) {
        MPI_Info_create(&(fd->info));
    }

    /* direct I/O is off unless requested below */
    ADIOI_Info_set(fd->info, "direct_read", "false");
    ADIOI_Info_set(fd->info, "direct_write", "false");
    fd->direct_write = 0;
    fd->direct_read = fd->direct_write;

    if (!xfs_initialized) {
        xfs_initialized = 1;

        env_setting = getenv("MPIO_DIRECT_READ_CHUNK_SIZE");
        if (env_setting == NULL) {
            xfs_direct_read_chunk_size = 0;
        }
        else {
            int chunk = atoi(env_setting);

            if (chunk > 0) {
                xfs_direct_read_chunk_size = chunk;
            }
            else {
                fprintf(stderr,
                        "MPI: Ignoring an invalid setting for MPIO_DIRECT_READ_CHUNK_SIZE.\n"
                        " It must be set to a positive integer value.\n");
            }
        }

        env_setting = getenv("MPIO_DIRECT_WRITE_CHUNK_SIZE");
        if (env_setting == NULL) {
            xfs_direct_write_chunk_size = 0;
        }
        else {
            int chunk = atoi(env_setting);

            if (chunk > 0) {
                xfs_direct_write_chunk_size = chunk;
            }
            else {
                fprintf(stderr,
                        "MPI: Ignoring an invalid setting for MPIO_DIRECT_WRITE_CHUNK_SIZE.\n"
                        " It must be set to a positive integer value.\n");
            }
        }
    }

    /* copy the process-wide chunk sizes into hints not yet initialized */
    if (!fd->hints->initialized) {
        fd->hints->fs_hints.xfs.read_chunk_sz = xfs_direct_read_chunk_size;
        fd->hints->fs_hints.xfs.write_chunk_sz = xfs_direct_write_chunk_size;
    }

    /* did the user ask for "direct_read" and/or "direct_write"? */
    if (users_info != MPI_INFO_NULL) {
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

        ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && strcmp(value, "true") == 0) {
            ADIOI_Info_set(fd->info, "direct_read", "true");
            fd->direct_read = 1;
        }

        ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && strcmp(value, "true") == 0) {
            ADIOI_Info_set(fd->info, "direct_write", "true");
            fd->direct_write = 1;
        }

        ADIOI_Free(value);
    }

    /* collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    /* the ADIOI_Direct_* globals take precedence over the info hints */
    if (ADIOI_Direct_read) {
        fd->direct_read = 1;
    }
    if (ADIOI_Direct_write) {
        fd->direct_write = 1;
    }

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_PFS_Open - open fd->filename and record the result in fd->fd_sys.
 *
 * Parameters:
 * fd         - file to open; fd_sys, fd_direct, info and (for ADIO_APPEND)
 *              fp_ind / fp_sys_posn are updated
 * error_code - MPI_SUCCESS when the descriptor is valid, otherwise an
 *              MPI_ERR_IO code carrying strerror(errno)
 *
 * _gopen is used when fd->comm has as many processes as MPI_COMM_WORLD,
 * plain open otherwise.  After a successful open the "pfs_svr_buf" request
 * stored in fd->info is applied and the striping attributes reported by
 * F_GETSATTR are published in fd->info.
 */
void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
{
    static char myname[] = "ADIOI_PFS_OPEN";
    struct sattr attr;
    char *scratch;
    int perm, amode, old_mask;
    int world_size, comm_size;
    int rc, found;

    /* permissions: derive from the umask unless given explicitly */
    if (fd->perm != ADIO_PERM_NULL) {
        perm = fd->perm;
    }
    else {
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }

    /* translate ADIO access flags into open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE) amode |= O_CREAT;
    if (fd->access_mode & ADIO_RDONLY) amode |= O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY) amode |= O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)   amode |= O_RDWR;
    if (fd->access_mode & ADIO_EXCL)   amode |= O_EXCL;

    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_size(fd->comm, &comm_size);

    if (world_size != comm_size) {
        fd->fd_sys = open(fd->filename, amode, perm);
    }
    else {
        fd->fd_sys = _gopen(fd->filename, amode, M_ASYNC, perm);
    }
    fd->fd_direct = -1;

    if (fd->fd_sys == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", strerror(errno));
        return;
    }

    scratch = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

    /* ADIOI_PFS_SetInfo recorded any request for server buffering in
     * fd->info; apply it now that a valid descriptor exists, and record
     * "false" if the request could not be honoured */
    ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL, scratch, &found);
    if (found && strcmp(scratch, "true") == 0) {
        rc = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
        if (rc) {
            ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
        }
    }

    /* publish the file's striping attributes in the info object */
    rc = fcntl(fd->fd_sys, F_GETSATTR, &attr);
    if (rc == 0) {
        MPL_snprintf(scratch, MPI_MAX_INFO_VAL+1, "%d", attr.s_sunitsize);
        ADIOI_Info_set(fd->info, "striping_unit", scratch);

        MPL_snprintf(scratch, MPI_MAX_INFO_VAL+1, "%d", attr.s_sfactor);
        ADIOI_Info_set(fd->info, "striping_factor", scratch);

        MPL_snprintf(scratch, MPI_MAX_INFO_VAL+1, "%d", attr.s_start_sdir);
        ADIOI_Info_set(fd->info, "start_iodevice", scratch);
    }
    ADIOI_Free(scratch);

    /* append mode: both file pointers start at end of file */
    if (fd->access_mode & ADIO_APPEND) {
        fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
        fd->fp_ind = fd->fp_sys_posn;
    }

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_LUSTRE_SetInfo - process Lustre-specific hints, then generic hints.
 *
 * Parameters:
 * fd         - file whose info object, hints and direct_read/direct_write
 *              flags are updated
 * users_info - hints supplied by the caller, or MPI_INFO_NULL
 * error_code - MPI_SUCCESS on normal return; set by
 *              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME when a romio_lustre_* hint
 *              on this rank differs from rank 0's value
 *
 * When fd->info is MPI_INFO_NULL (the open call) the info object is created
 * with defaults, the user's striping hints are compared across fd->comm
 * (a mismatch aborts MPI_COMM_WORLD), and rank 0 tries to apply them with
 * LL_IOC_LOV_SETSTRIPE on a file opened with O_LOV_DELAY_CREATE.
 *
 * Fix: the consistency broadcast for "romio_lustre_ds_in_coll" used a count
 * of 2 on a single int, reading/writing past tmp_val; it now broadcasts
 * exactly one int.
 */
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value;
    int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
    struct lov_user_md lum = { 0 };
    int err, myrank, fd_sys, perm, amode, old_mask;
    int int_val, tmp_val;
    static char myname[] = "ADIOI_LUSTRE_SETINFO";

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    if ( (fd->info) == MPI_INFO_NULL) {
        /* This must be part of the open call. can set striping parameters
           if necessary. */
        MPI_Info_create(&(fd->info));

        ADIOI_Info_set(fd->info, "direct_read", "false");
        ADIOI_Info_set(fd->info, "direct_write", "false");
        fd->direct_read = fd->direct_write = 0;
        /* initialize lustre hints */
        ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
        fd->hints->fs_hints.lustre.co_ratio = 1;
        ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
        fd->hints->fs_hints.lustre.coll_threshold = 0;
        ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
        fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;

        /* has user specified striping or server buffering parameters
           and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
            /* striping information */
            ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag)
                str_unit=atoi(value);

            ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag)
                str_factor=atoi(value);

            ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
                           MPI_MAX_INFO_VAL, value, &flag);
            if (flag)
                start_iodev=atoi(value);

            /* direct read and write */
            ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                ADIOI_Info_set(fd->info, "direct_read", "true");
                fd->direct_read = 1;
            }
            ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                ADIOI_Info_set(fd->info, "direct_write", "true");
                fd->direct_write = 1;
            }
        }

        /* set striping information with ioctl */
        MPI_Comm_rank(fd->comm, &myrank);
        if (myrank == 0) {
            stripe_val[0] = str_factor;
            stripe_val[1] = str_unit;
            stripe_val[2] = start_iodev;
        }
        MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);

        if (stripe_val[0] != str_factor
            || stripe_val[1] != str_unit
            || stripe_val[2] != start_iodev) {
            FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
                    "-striping_factor:striping_unit:start_iodevice "
                    "need to be identical across all processes\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
            /* if user has specified striping info, process 0 tries to set it */
            if (!myrank) {
                if (fd->perm == ADIO_PERM_NULL) {
                    old_mask = umask(022);
                    umask(old_mask);
                    perm = old_mask ^ 0666;
                }
                else perm = fd->perm;

                amode = 0;
                if (fd->access_mode & ADIO_CREATE)
                    amode = amode | O_CREAT;
                if (fd->access_mode & ADIO_RDONLY)
                    amode = amode | O_RDONLY;
                if (fd->access_mode & ADIO_WRONLY)
                    amode = amode | O_WRONLY;
                if (fd->access_mode & ADIO_RDWR)
                    amode = amode | O_RDWR;
                if (fd->access_mode & ADIO_EXCL)
                    amode = amode | O_EXCL;

                /* we need to create file so ensure this is set */
                amode = amode | O_LOV_DELAY_CREATE | O_CREAT;

                fd_sys = open(fd->filename, amode, perm);
                if (fd_sys == -1) {
                    if (errno != EEXIST)
                        fprintf(stderr,
                                "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
                } else {
                    lum.lmm_magic = LOV_USER_MAGIC;
                    lum.lmm_pattern = 0;
                    lum.lmm_stripe_size = str_unit;
                    lum.lmm_stripe_count = str_factor;
                    lum.lmm_stripe_offset = start_iodev;

                    err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
                    if (err == -1 && errno != EEXIST) {
                        fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
                    }
                    close(fd_sys);
                }
            } /* End of striping parameters validation */
        }
        MPI_Barrier(fd->comm);
    }

    /* get other hint */
    if (users_info != MPI_INFO_NULL) {
        /* CO: IO Clients/OST,
         * to keep the load balancing between clients and OSTs */
        ADIOI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && (int_val = atoi(value)) > 0) {
            tmp_val = int_val;
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            if (tmp_val != int_val) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_lustre_co_ratio",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", value);
            fd->hints->fs_hints.lustre.co_ratio = atoi(value);
        }

        /* coll_threshold:
         * if the req size is bigger than this, collective IO may not be
         * performed. */
        ADIOI_Info_get(users_info, "romio_lustre_coll_threshold",
                       MPI_MAX_INFO_VAL, value, &flag);
        if (flag && (int_val = atoi(value)) > 0) {
            tmp_val = int_val;
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            if (tmp_val != int_val) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_lustre_coll_threshold",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
            fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
        }

        /* ds_in_coll: disable data sieving in collective IO */
        ADIOI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && (!strcmp(value, "disable") ||
                     !strcmp(value, "DISABLE"))) {
            tmp_val = int_val = 2;
            /* tmp_val is a single int: the count must be 1 */
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            if (tmp_val != int_val) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_lustre_ds_in_coll",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
            fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
        }
    }

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    /* the ADIOI_Direct_* globals force direct I/O regardless of the hints */
    if (ADIOI_Direct_read) fd->direct_read = 1;
    if (ADIOI_Direct_write) fd->direct_write = 1;

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_LUSTRE_Open - open fd->filename, record its striping, and optionally
 * open a second O_DIRECT descriptor.
 *
 * Parameters:
 * fd         - file to open; fd_sys, fd_direct, hints, info and (for
 *              ADIO_APPEND) fp_ind / fp_sys_posn are updated
 * error_code - MPI_SUCCESS, or an error class chosen from the errno of the
 *              failed open
 *
 * If the O_DIRECT open fails, direct I/O is switched off (with a perror
 * message) rather than failing the whole open.
 *
 * Changes from the previous version:
 *  - errno of the primary open is saved immediately, so the O_DIRECT open
 *    and perror below cannot change which error is reported;
 *  - strlen() is cast to int to match the "%d" in the error format;
 *  - the ADIO_APPEND seek is done once instead of twice.
 */
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen;
    int open_errno = 0;
    struct lov_user_md *lum = NULL;
    char *value;

#if defined(MPICH2) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif

    if (fd->perm == ADIO_PERM_NULL) {
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

    amode_direct = amode | O_DIRECT;

    fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
    if (fd->fd_sys == -1) {
        /* remember why the primary open failed before anything else can
         * overwrite errno */
        open_errno = errno;
    }
    else {
        int err;

        /* get file striping information and set it in info */
        /* odd malloc here because lov_user_md contains some fixed data and
         * then a list of 'lmm_objects' representing stripe */
        lumlen = sizeof(struct lov_user_md) +
            MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
        /* furthermore, Pascal Deveze reports that, even though we pass a
         * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
         * struct are uninitialized, the call can give an error.  calloc in
         * case there are other members that must be initialized and in case
         * lov_user_md struct changes in future */
        lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);
        lum->lmm_magic = LOV_USER_MAGIC;
        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
        if (!err) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            fd->hints->striping_unit = lum->lmm_stripe_size;
            sprintf(value, "%d", lum->lmm_stripe_size);
            ADIOI_Info_set(fd->info, "striping_unit", value);

            fd->hints->striping_factor = lum->lmm_stripe_count;
            sprintf(value, "%d", lum->lmm_stripe_count);
            ADIOI_Info_set(fd->info, "striping_factor", value);

            fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset;
            sprintf(value, "%d", lum->lmm_stripe_offset);
            ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

            ADIOI_Free(value);
        }
        ADIOI_Free(lum);

        /* append mode: both file pointers start at end of file */
        if (fd->access_mode & ADIO_APPEND)
            fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
    }

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
        fd->fd_direct = open(fd->filename, amode_direct, perm);
        if (fd->fd_direct != -1) {
            fd->d_mem = fd->d_miniosz = (1<<12);
        } else {
            /* fall back to buffered I/O instead of failing the open */
            perror("cannot open file with O_Direct");
            fd->direct_write = fd->direct_read = 0;
        }
    }

    /* --BEGIN ERROR HANDLING-- */
    /* The direct flags are cleared above when the O_DIRECT open fails, so
     * this branch is only taken when the primary open failed and open_errno
     * holds its errno. */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
                             (fd->direct_write || fd->direct_read))) {
        if (open_errno == ENAMETOOLONG)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               (int) strlen(fd->filename));
        else if (open_errno == ENOENT)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        else if (open_errno == ENOTDIR || open_errno == ELOOP)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        else if (open_errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        }
        else if (open_errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        }
        else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(open_errno));
        }
    }
    /* --END ERROR HANDLING-- */
    else *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_PFS_SetInfo - process PFS striping / server-buffering hints, then
 * the generic hints.
 *
 * Parameters:
 * fd         - file whose info object is updated
 * users_info - hints supplied by the caller, or MPI_INFO_NULL
 * error_code - MPI_SUCCESS on normal return; set by
 *              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME when a striping hint on
 *              this rank differs from rank 0's value
 *
 * Two cases:
 *  - fd->info is MPI_INFO_NULL (the open call): striping hints are checked
 *    for consistency and rank 0 tries to apply them with F_SETSATTR;
 *    "pfs_svr_buf" is recorded for ADIOI_PFS_Open to act on.
 *  - otherwise the file is already open: only "pfs_svr_buf" can change, and
 *    it is applied to fd->fd_sys immediately.
 *
 * Changes from the previous version:
 *  - myname is now declared (it was used by the error macro but missing);
 *  - the scratch buffer is freed on the "not same" error returns;
 *  - fcntl/close are skipped when rank 0's open fails;
 *  - an absent "pfs_svr_buf" entry in fd->info is treated as different from
 *    the requested value instead of comparing an unset buffer.
 */
void ADIOI_PFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value, *value_in_fd;
    int flag, tmp_val, str_factor=-1, str_unit=-1, start_iodev=-1;
    struct sattr attr;
    int err, myrank, fd_sys, perm, amode, old_mask;
    static char myname[] = "ADIOI_PFS_SETINFO";

    if ( (fd->info) == MPI_INFO_NULL) {
        /* This must be part of the open call. can set striping parameters
           if necessary. */
        MPI_Info_create(&(fd->info));

        /* has user specified striping or server buffering parameters
           and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag) {
                str_factor=atoi(value);
                tmp_val = str_factor;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_val != str_factor) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "striping_factor",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */
            }

            ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag) {
                str_unit=atoi(value);
                tmp_val = str_unit;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_val != str_unit) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "striping_unit",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */
            }

            ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag) {
                start_iodev=atoi(value);
                tmp_val = start_iodev;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                /* --BEGIN ERROR HANDLING-- */
                if (tmp_val != start_iodev) {
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                       "start_iodevice",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                }
                /* --END ERROR HANDLING-- */
            }

            /* if user has specified striping info, process 0 tries to set it */
            if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
                MPI_Comm_rank(fd->comm, &myrank);
                if (!myrank) {
                    if (fd->perm == ADIO_PERM_NULL) {
                        old_mask = umask(022);
                        umask(old_mask);
                        perm = old_mask ^ 0666;
                    }
                    else perm = fd->perm;

                    amode = 0;
                    if (fd->access_mode & ADIO_CREATE)
                        amode = amode | O_CREAT;
                    if (fd->access_mode & ADIO_RDONLY)
                        amode = amode | O_RDONLY;
                    if (fd->access_mode & ADIO_WRONLY)
                        amode = amode | O_WRONLY;
                    if (fd->access_mode & ADIO_RDWR)
                        amode = amode | O_RDWR;
                    if (fd->access_mode & ADIO_EXCL)
                        amode = amode | O_EXCL;

                    fd_sys = open(fd->filename, amode, perm);
                    /* setting striping is best effort: without a valid
                     * descriptor there is nothing to query or close */
                    if (fd_sys != -1) {
                        err = fcntl(fd_sys, F_GETSATTR, &attr);

                        if (!err) {
                            if (str_unit > 0) attr.s_sunitsize = str_unit;
                            if ((start_iodev >= 0) &&
                                (start_iodev < attr.s_sfactor))
                                attr.s_start_sdir = start_iodev;
                            if ((str_factor > 0) &&
                                (str_factor < attr.s_sfactor))
                                attr.s_sfactor = str_factor;

                            err = fcntl(fd_sys, F_SETSATTR, &attr);
                        }

                        close(fd_sys);
                    }
                }

                MPI_Barrier(fd->comm);
            }

            /* Has user asked for pfs server buffering to be turned on?
               If so, mark it as true in fd->info and turn it on in
               ADIOI_PFS_Open after the file is opened */
            ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag && (!strcmp(value, "true")))
                ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
            else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");

            ADIOI_Free(value);
        }
        else ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");

        /* set the values for collective I/O and data sieving parameters */
        ADIOI_GEN_SetInfo(fd, users_info, error_code);
    }
    else {
        /* The file has been opened previously and fd->fd_sys is a valid
           file descriptor. cannot set striping parameters now. */

        /* set the values for collective I/O and data sieving parameters */
        ADIOI_GEN_SetInfo(fd, users_info, error_code);

        /* has user specified value for pfs_svr_buf? */
        if (users_info != MPI_INFO_NULL) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            ADIOI_Info_get(users_info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
                           value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "false"))) {
                value_in_fd = (char *)
                    ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
                ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
                               value_in_fd, &flag);
                /* only touch the descriptor when the setting changes; a
                 * missing entry counts as a change */
                if (!flag || strcmp(value, value_in_fd)) {
                    if (!strcmp(value, "true")) {
                        err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
                        if (!err)
                            ADIOI_Info_set(fd->info, "pfs_svr_buf", "true");
                    }
                    else {
                        err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, FALSE);
                        if (!err)
                            ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
                    }
                }
                ADIOI_Free(value_in_fd);
            }
            ADIOI_Free(value);
        }
    }

    *error_code = MPI_SUCCESS;
}
MPI_File ADIO_Open(MPI_Comm orig_comm, MPI_Comm comm, const char *filename, int file_system, ADIOI_Fns *ops, int access_mode, ADIO_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, MPI_Info info, int perm, int *error_code) { MPI_File mpi_fh; ADIO_File fd; int err, rank, procs; static char myname[] = "ADIO_OPEN"; int max_error_code; MPI_Info dupinfo; int syshints_processed, can_skip; char *p; *error_code = MPI_SUCCESS; /* obtain MPI_File handle */ mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD)); if (mpi_fh == MPI_FILE_NULL) { fd = MPI_FILE_NULL; *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd = MPIO_File_resolve(mpi_fh); fd->cookie = ADIOI_FILE_COOKIE; fd->fp_ind = disp; fd->fp_sys_posn = 0; fd->comm = comm; /* dup'ed in MPI_File_open */ fd->filename = ADIOI_Strdup(filename); fd->file_system = file_system; fd->fs_ptr = NULL; fd->fns = ops; fd->disp = disp; fd->split_coll_count = 0; fd->shared_fp_fd = ADIO_FILE_NULL; fd->atomicity = 0; fd->etype = etype; /* MPI_BYTE by default */ fd->filetype = filetype; /* MPI_BYTE by default */ fd->etype_size = 1; /* default etype is MPI_BYTE */ fd->file_realm_st_offs = NULL; fd->file_realm_types = NULL; fd->perm = perm; fd->async_count = 0; fd->fortran_handle = -1; fd->err_handler = ADIOI_DFLT_ERR_HANDLER; fd->io_buf_window = MPI_WIN_NULL; fd->io_buf_put_amounts_window = MPI_WIN_NULL; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &procs); /* create and initialize info object */ fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct)); if (fd->hints == NULL) { *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd->hints->cb_config_list = NULL; fd->hints->ranklist = NULL; fd->hints->initialized = 0; fd->info = MPI_INFO_NULL; /* move system-wide hint processing *back* into open, but this time the * hintfile reader will do a 
scalable read-and-broadcast. The global * ADIOI_syshints will get initialized at first open. subsequent open * calls will just use result from first open. * * We have two goals here: * 1: avoid processing the hintfile multiple times * 2: have all processes participate in hintfile processing (so we can read-and-broadcast) * * a code might do an "initialize from 0", so we can only skip hint * processing once everyone has particpiated in hint processing */ if (ADIOI_syshints == MPI_INFO_NULL) syshints_processed = 0; else syshints_processed = 1; MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm); if (!can_skip) { if (ADIOI_syshints == MPI_INFO_NULL) MPI_Info_create(&ADIOI_syshints); ADIOI_process_system_hints(fd, ADIOI_syshints); } ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo); ADIO_SetInfo(fd, dupinfo, &err); if (dupinfo != MPI_INFO_NULL) { *error_code = MPI_Info_free(&dupinfo); if (*error_code != MPI_SUCCESS) goto fn_exit; } ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname); /* Instead of repeatedly allocating this buffer in collective read/write, * allocating up-front might make memory management on small platforms * (e.g. Blue Gene) more efficent */ fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size); /* deferred open: * we can only do this optimization if 'fd->hints->deferred_open' is set * (which means the user hinted 'no_indep_rw' and collective buffering). * Furthermore, we only do this if our collective read/write routines use * our generic function, and not an fs-specific routine (we can defer opens * only if we use our aggreagation code). 
*/ if (fd->hints->deferred_open && !(uses_generic_read(fd) \ && uses_generic_write(fd))) { fd->hints->deferred_open = 0; } if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN)) /* disable deferred open on these fs so that scalable broadcast * will always use the propper communicator */ fd->hints->deferred_open = 0; /* on BlueGene, the cb_config_list is built when hints are processed. No * one else does that right now */ if (fd->hints->ranklist == NULL) { build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code); if (*error_code != MPI_SUCCESS) goto fn_exit; } fd->is_open = 0; fd->my_cb_nodes_index = -2; fd->is_agg = is_aggregator(rank, fd); /* deferred open used to split the communicator to create an "aggregator * communicator", but we only used it as a way to indicate that deferred * open happened. fd->is_open and fd->is_agg are sufficient */ /* actual opens start here */ /* generic open: one process opens to create the file, all others open */ /* nfs open: everybody opens or else you'll end up with "file not found" * due to stupid nfs consistency semantics */ /* scalable open: one process opens and broadcasts results to everyone */ ADIOI_OpenColl(fd, rank, access_mode, error_code); /* for debugging, it can be helpful to see the hints selected. Some file * systes set up the hints in the open call (e.g. 
lustre) */ p = getenv("ROMIO_PRINT_HINTS"); if (rank == 0 && p != NULL ) { ADIOI_Info_print_keyvals(fd->info); } fn_exit: MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm); if (max_error_code != MPI_SUCCESS) { /* If the file was successfully opened, close it */ if (*error_code == MPI_SUCCESS) { /* in the deferred open case, only those who have actually opened the file should close it */ if (fd->hints->deferred_open) { if (fd->is_agg) { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } else { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } ADIOI_Free(fd->filename); ADIOI_Free(fd->hints->ranklist); ADIOI_Free(fd->hints->cb_config_list); ADIOI_Free(fd->hints); if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info)); ADIOI_Free(fd->io_buf); ADIOI_Free(fd); fd = ADIO_FILE_NULL; if (*error_code == MPI_SUCCESS) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**oremote_fail", 0); } } return fd; }
/*
 * ADIOI_BGL_SetInfo - establish the hint set used for a file on BlueGene/L.
 *
 * If fd->info is MPI_INFO_NULL a new info object is created.  The first time
 * through (fd->hints->initialized == 0) both fd->info and fd->hints are
 * filled with default values.  Afterwards the info object passed by the user
 * is examined and every value ROMIO understands overrides the default.
 *
 * Parameters:
 *   fd         - file being configured.  fd->hints is allocated at file open
 *                time, so it is neither allocated nor checked here.
 *   users_info - hints supplied by the user, or MPI_INFO_NULL.
 *   error_code - MPI_SUCCESS on normal completion.  For the hints that are
 *                cross-checked (cb_buffer_size, romio_cb_read,
 *                romio_cb_write, romio_no_indep_rw) the local value is
 *                compared with the one broadcast from rank 0 of fd->comm; on
 *                a mismatch the code is produced by
 *                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME and the function returns
 *                without processing the remaining hints.
 *
 * The scratch buffer used to read info values is released on every exit
 * path, including the mismatch paths.
 */
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, intval, tmp_val, nprocs = 0;
    static char myname[] = "ADIOI_BGL_SETINFO";

    /* set whenever the aggregator rank list has to be (re)generated */
    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL)
        MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    AD_BGL_assert((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);

        /* collective buffering is enabled for both reads and writes */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        if (fd->hints->cb_config_list != NULL)
            ADIOI_Free(fd->hints->cb_config_list);
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O: the info
         * object advertises the communicator size, the hint itself stays at
         * -1 until a user value is installed below */
        MPI_Comm_size(fd->comm, &nprocs);
        ADIOI_Snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* bgl is not implementing file realms (ADIOI_IOStridedColl),
         * so the related hints are initialized to their disabled values. */
        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);

        if (fd->file_system == ADIO_UFS) {
            /* default for ufs/pvfs is to disable data sieving */
            ADIOI_Info_set(info, "romio_ds_read", "disable");
            fd->hints->ds_read = ADIOI_HINT_DISABLE;
            ADIOI_Info_set(info, "romio_ds_write", "disable");
            fd->hints->ds_write = ADIOI_HINT_DISABLE;
        } else {
            /* default is to let romio automatically decide when to use data
             * sieving
             */
            ADIOI_Info_set(info, "romio_ds_read", "automatic");
            fd->hints->ds_read = ADIOI_HINT_AUTO;
            ADIOI_Info_set(info, "romio_ds_write", "automatic");
            fd->hints->ds_write = ADIOI_HINT_AUTO;
        }

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            tmp_val = intval;

            /* compare the local value with the one rank 0 holds */
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "cb_buffer_size", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "cb_buffer_size", value);
            fd->hints->cb_buffer_size = intval;
        }

#if 0
        /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
        /* aligning file realms to certain sizes (e.g. stripe sizes)
         * may benefit I/O performance */
        ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_alignment", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
            fd->hints->cb_fr_alignment = intval;
        }

        /* for collective I/O, try to be smarter about when to do data sieving
         * using a specific threshold for the datatype size/extent
         * (percentage 0-100%) */
        ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_ds_threshold", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
            fd->hints->cb_ds_threshold = intval;
        }

        ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_DISABLE;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_alltoall;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_alltoall) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_alltoall", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }
#endif

        /* new hints for enabling/disabling coll. buffering on
         * reads/writes
         */
        ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                /* romio_cb_read overrides no_indep_rw */
                ADIOI_Info_set(info, "romio_cb_read", value);
                ADIOI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_read = ADIOI_HINT_DISABLE;
                /* no_indep_rw is a 0/1 flag everywhere else in this
                 * function; store 0 so it agrees with the "false" just
                 * written to the info object */
                fd->hints->no_indep_rw = 0;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_read;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_read) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_read", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }

        ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                /* romio_cb_write overrides no_indep_rw, too */
                ADIOI_Info_set(info, "romio_cb_write", value);
                ADIOI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_write = ADIOI_HINT_DISABLE;
                /* 0/1 flag, kept consistent with the "false" set above */
                fd->hints->no_indep_rw = 0;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_write;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_write) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_write", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }

#if 0
        /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
        /* enable/disable persistent file realms for collective I/O */
        /* may want to check for no_indep_rdwr hint as well */
        ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_pfr;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_pfr) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_pfr", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }

        /* file realm assignment types ADIOI_FR_AAR(0),
         * ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
         * a regular fr size in bytes. probably not the best way... */
        ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) >= -2)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_fr_type", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_fr_type", value);
            fd->hints->cb_fr_type = intval;
        }
#endif

        /* new hint for specifying no indep. read/write will be performed */
        ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
                /* if 'no_indep_rw' set, also hint that we will do
                 * collective buffering: if we aren't doing independent io,
                 * then we have to do collective */
                ADIOI_Info_set(info, "romio_no_indep_rw", value);
                ADIOI_Info_set(info, "romio_cb_write", "enable");
                ADIOI_Info_set(info, "romio_cb_read", "enable");
                fd->hints->no_indep_rw = 1;
                fd->hints->cb_read = 1;
                fd->hints->cb_write = 1;
                tmp_val = 1;
            } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
                ADIOI_Info_set(info, "romio_no_indep_rw", value);
                fd->hints->no_indep_rw = 0;
                tmp_val = 0;
            } else {
                /* unrecognized value: the default is above */
                tmp_val = 0;
            }

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->no_indep_rw) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_no_indep_rw", error_code);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }

        /* new hints for enabling/disabling data sieving on
         * reads/writes
         */
        ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_DISABLE;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }

        ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_ENABLE;
            } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_DISABLE;
            } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }

        ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "ind_wr_buffer_size", value);
            fd->hints->ind_wr_buffer_size = intval;
        }

        ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "ind_rd_buffer_size", value);
            fd->hints->ind_rd_buffer_size = intval;
        }

        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }

        /* Now we use striping unit in common code so we should
         * process hints for it. */
        ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "striping_unit", value);
            fd->hints->striping_unit = intval;
        }

        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            /* a new aggregator count requires a new rank list below */
            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
    }

    /* associate CB aggregators to certain CNs in every involved PSET */
    if (did_anything) {
        ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
    }

    /* ignore defered open hints and do not enable it for bluegene: need all
     * processors in the open path so we can stat-and-broadcast the blocksize
     */
    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
    fd->hints->no_indep_rw = 0;
    fd->hints->deferred_open = 0;

    /* BobC commented this out, but since hint processing runs on both bgl and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not
         * support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003
         */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
    return;

fn_fail:
    /* error_code was already handed to MPIO_ERR_CREATE_CODE_INFO_NOT_SAME at
     * the failing check; only the scratch buffer is left to release */
    ADIOI_Free(value);
}
/*
 * ADIOI_PANFS_Open - open (and, with ADIO_CREATE, first create) a PanFS file.
 *
 * Parameters:
 *   fd         - file being opened.  Reads fd->perm, fd->access_mode,
 *                fd->filename and fd->info; writes fd->fd_sys, fd->fd_direct,
 *                the panfs_layout_* keys of fd->info and, for ADIO_APPEND,
 *                fd->fp_ind / fd->fp_sys_posn.
 *   error_code - MPI_SUCCESS when the final open() succeeds, otherwise an
 *                error code derived from errno.
 *
 * Steps:
 *   1. Pick the permission bits (fd->perm, or 0666 filtered by the umask).
 *   2. With ADIO_CREATE: read the panfs_layout_* hints, validate them and
 *      create the file, either through the PAN_FS_CLIENT_LAYOUT_CREATE_FILE
 *      ioctl on the parent directory (RAID0, RAID1_5_PARITY_STRIPE, RAID10)
 *      or through a plain open(O_CREAT).  Invalid hints and I/O errors in
 *      this phase print a message and call MPI_Abort.
 *   3. Translate the ADIO access mode to open() flags, adding
 *      O_CONCURRENT_WRITE when the panfs_concurrent_write hint is 1.
 *   4. Open the file, query its layout and publish it in fd->info.
 */
void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
{
    char *value;
    int perm, old_mask, amode, flag;
    static char myname[] = "ADIOI_PANFS_OPEN";

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() can only be read by setting it: set, then restore */
        old_mask = umask(022);
        umask(old_mask);
        perm = ~old_mask & 0666;
    } else {
        perm = fd->perm;
    }

    amode = 0;
    if (fd->access_mode & ADIO_CREATE) {
        pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
        unsigned long int layout_stripe_unit = 0;
        unsigned long int layout_parity_stripe_width = 0;
        unsigned long int layout_parity_stripe_depth = 0;
        unsigned long int layout_total_num_comps = 0;
        pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;

        *error_code = MPI_SUCCESS;

        /* collect the layout hints; a hint that is absent keeps its
         * initializer above */
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
        ADIOI_Info_get(fd->info, "panfs_layout_type", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_type = strtoul(value, NULL, 10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_stripe_unit = strtoul(value, NULL, 10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_total_num_comps = strtoul(value, NULL, 10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_parity_stripe_width = strtoul(value, NULL, 10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_parity_stripe_depth = strtoul(value, NULL, 10);
        }
        ADIOI_Info_get(fd->info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            layout_visit_policy = strtoul(value, NULL, 10);
        }
        ADIOI_Free(value);

        amode = amode | O_CREAT;

        /* Check for valid set of hints */
        if ((layout_type < PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT) ||
            (layout_type > PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
            FPRINTF(stderr, "%s: panfs_layout_type is not a valid value: %u.\n", myname, layout_type);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) &&
            ((layout_stripe_unit == 0) || (layout_total_num_comps == 0))) {
            if (layout_stripe_unit == 0) {
                FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
            }
            if (layout_total_num_comps == 0) {
                FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID0 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
            }
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
            if ((layout_stripe_unit == 0) ||
                (layout_parity_stripe_width == 0) ||
                (layout_parity_stripe_depth == 0) ||
                (layout_total_num_comps == 0)) {
                if (layout_stripe_unit == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if (layout_total_num_comps == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if (layout_parity_stripe_width == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_parity_stripe_width hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if (layout_parity_stripe_depth == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_parity_stripe_depth hint which is necessary to specify a valid RAID5 parity stripe layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
                (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET)) {
                FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname, layout_visit_policy);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
        if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
            if ((layout_stripe_unit == 0) || (layout_total_num_comps == 0)) {
                if (layout_stripe_unit == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_stripe_unit hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                if (layout_total_num_comps == 0) {
                    FPRINTF(stderr, "%s: MPI_Info does not contain the panfs_layout_total_num_comps hint which is necessary to specify a valid RAID10 layout to the PAN_FS_CLIENT_LAYOUT_CREATE_FILE ioctl.\n", myname);
                }
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            if ((layout_visit_policy < PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN) ||
                (layout_visit_policy > PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN_WITH_HASHED_OFFSET)) {
                FPRINTF(stderr, "%s: panfs_layout_visit_policy is not a valid value: %u.\n", myname, layout_visit_policy);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }

        /* Create the file via ioctl() or open(). ADIOI_PANFS_Open's caller
         * already optimizes performance by only calling this function with
         * ADIO_CREATE on rank 0.  Therefore, we don't need to worry about
         * implementing that optimization here. */
        if ((layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) ||
            (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) ||
            (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
            pan_fs_client_layout_create_args_t file_create_args;
            int fd_dir;
            char *slash;
            struct stat stat_buf;
            int err;
            char *path;

            /* Check that the file does not exist before
             * trying to create it.  The ioctl itself should
             * be able to handle this condition.  Currently,
             * the ioctl will return successfully if the file
             * has been previously created.  Filed bug 33862
             * to track the problem.
             */
            err = stat(fd->filename, &stat_buf);
            if ((err == -1) && (errno != ENOENT)) {
                FPRINTF(stderr, "%s: Unexpected I/O Error calling stat() on PanFS file: %s.\n", myname, strerror(errno));
                MPI_Abort(MPI_COMM_WORLD, 1);
            } else if (err == 0) {
                FPRINTF(stderr, "%s: Cannot create PanFS file with ioctl when file already exists.\n", myname);
                MPI_Abort(MPI_COMM_WORLD, 1);
            } else {
                /* (err == -1) && (errno == ENOENT) */
                /* File does not exist */

                /* derive the parent directory: "." when the name has no
                 * slash, "/" when the only slash is the leading one,
                 * otherwise everything before the last slash */
                path = ADIOI_Strdup(fd->filename);
                slash = strrchr(path, '/');
                if (!slash) {
                    ADIOI_Strncpy(path, ".", 2);
                } else {
                    if (slash == path)
                        *(path + 1) = '\0';
                    else
                        *slash = '\0';
                }

                /* create PanFS object */
                bzero(&file_create_args, sizeof(pan_fs_client_layout_create_args_t));
                /* open directory */
                fd_dir = open(path, O_RDONLY);
                if (fd_dir < 0) {
                    FPRINTF(stderr, "%s: I/O Error opening parent directory to create PanFS file using ioctl: %s.\n", myname, strerror(errno));
                    MPI_Abort(MPI_COMM_WORLD, 1);
                } else {
                    char *file_name_ptr = fd->filename;

                    slash = strrchr(fd->filename, '/');
                    if (slash) {
                        file_name_ptr = slash + 1;
                    }
                    /* create file in the directory */
                    file_create_args.mode = perm;
                    file_create_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
                    file_create_args.flags = PAN_FS_CLIENT_LAYOUT_CREATE_F__NONE;

                    /* The copy must be limited by the destination, not by
                     * the length of the source path.  filename is written in
                     * place right after bzero(), i.e. it is storage embedded
                     * in the argument structure, so sizeof gives its
                     * capacity.  Refuse names that would not fit together
                     * with their terminating NUL. */
                    if (strlen(file_name_ptr) >= sizeof(file_create_args.filename)) {
                        FPRINTF(stderr, "%s: File name is too long to create PanFS file using ioctl.\n", myname);
                        MPI_Abort(MPI_COMM_WORLD, 1);
                    }
                    ADIOI_Strncpy(file_create_args.filename, file_name_ptr,
                                  sizeof(file_create_args.filename));

                    file_create_args.layout.agg_type = layout_type;
                    file_create_args.layout.layout_is_valid = 1;
                    if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE) {
                        file_create_args.layout.u.raid1_5_parity_stripe.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_width = layout_parity_stripe_width;
                        file_create_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth = layout_parity_stripe_depth;
                        file_create_args.layout.u.raid1_5_parity_stripe.stripe_unit = layout_stripe_unit;
                        file_create_args.layout.u.raid1_5_parity_stripe.layout_visit_policy = layout_visit_policy;
                    } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID0) {
                        file_create_args.layout.u.raid0.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid0.stripe_unit = layout_stripe_unit;
                    } else if (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10) {
                        file_create_args.layout.u.raid10.total_num_comps = layout_total_num_comps;
                        file_create_args.layout.u.raid10.stripe_unit = layout_stripe_unit;
                        file_create_args.layout.u.raid10.layout_visit_policy = layout_visit_policy;
                    }
                    err = ioctl(fd_dir, PAN_FS_CLIENT_LAYOUT_CREATE_FILE, &file_create_args);
                    if (err < 0) {
                        FPRINTF(stderr, "%s: I/O Error doing ioctl on parent directory to create PanFS file using ioctl: %s.\n", myname, strerror(errno));
                        MPI_Abort(MPI_COMM_WORLD, 1);
                    }
                    err = close(fd_dir);
                }
                ADIOI_Free(path);
            }
        } else {
            /* default layout: an ordinary create is enough; the descriptor
             * is closed again because the real open happens below */
            int create_fd = open(fd->filename, amode, perm);

            if (create_fd != -1) {
                close(create_fd);
            } else {
                FPRINTF(stderr, "%s: I/O Error creating PanFS file using open: %s.\n", myname, strerror(errno));
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
    }

    /* translate the ADIO access mode into open() flags */
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

    /* concurrent write mode is requested with panfs_concurrent_write == 1 */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Info_get(fd->info, "panfs_concurrent_write", MPI_MAX_INFO_VAL, value, &flag);
    if (flag) {
        unsigned long int concurrent_write = strtoul(value, NULL, 10);

        if (concurrent_write == 1) {
            amode = amode | O_CONCURRENT_WRITE;
        }
    }
    ADIOI_Free(value);

    fd->fd_sys = open(fd->filename, amode, perm);
    fd->fd_direct = -1;

    if (fd->fd_sys != -1) {
        /* publish the layout the file really has in fd->info */
        int rc;
        char temp_buffer[TEMP_BUFFER_SIZE];
        pan_fs_client_layout_query_args_t file_query_args;

        bzero(&file_query_args, sizeof(pan_fs_client_layout_query_args_t));
        file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
        rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
        if (rc < 0) {
            /* Error - set layout type to unknown */
            ADIOI_Info_set(fd->info, "panfs_layout_type", "PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
        } else {
            ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.agg_type);
            ADIOI_Info_set(fd->info, "panfs_layout_type", temp_buffer);
            if (file_query_args.layout.layout_is_valid == 1) {
                switch (file_query_args.layout.agg_type) {
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID0:
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid0.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid0.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        break;
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE:
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid1_5_parity_stripe.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_width);
                        ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid1_5_parity_stripe.parity_stripe_depth);
                        ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid1_5_parity_stripe.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid1_5_parity_stripe.layout_visit_policy);
                        ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
                        break;
                    case PAN_FS_CLIENT_LAYOUT_TYPE__RAID10:
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid10.stripe_unit);
                        ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid10.total_num_comps);
                        ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", temp_buffer);
                        ADIOI_Snprintf(temp_buffer, TEMP_BUFFER_SIZE, "%u", file_query_args.layout.u.raid10.layout_visit_policy);
                        ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
                        break;
                    default:
                        break;
                }
            }
        }
    }

    /* append mode: start both file pointers at the current end of file */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if (fd->fd_sys == -1) {
        /* errno still belongs to the failed open(): nothing between the
         * open() and this point runs when fd_sys is -1 */
        if (errno == ENAMETOOLONG) {
            /* the message format consumes an int, so the size_t returned by
             * strlen() is converted explicitly */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               (int) strlen(fd->filename));
        } else if (errno == ENOENT) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        } else if (errno == ENOTDIR || errno == ELOOP) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        } else if (errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        } else if (errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        } else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
    } else {
        *error_code = MPI_SUCCESS;
    }
}
/*
 * ADIOI_GPFS_SetInfo - populate fd->info and fd->hints for a GPFS file.
 *
 * Creates fd->info if it is MPI_INFO_NULL.  The first time this runs for a
 * file (fd->hints->initialized is 0) every hint receives its default value;
 * afterwards, recognized keys found in users_info override the current
 * values.
 *
 * Parameters:
 *   fd         - file whose info object and hints are updated.  fd->hints is
 *                allocated at file open time, so it is neither allocated nor
 *                checked here.
 *   users_info - hints supplied by the user, or MPI_INFO_NULL.
 *   error_code - set to MPI_SUCCESS before returning.  The hint-install
 *                helpers are handed this pointer too, but whatever they
 *                store in it is overwritten at the end of this routine.
 */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, intval, nprocs = 0;
    static char myname[] = "ADIOI_GPFS_SETINFO";
    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL)
        MPI_Info_create(&(fd->info));
    info = fd->info;

    /* scratch buffer used to read and format info values */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Assert((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized */
    if (!fd->hints->initialized) {

        ad_get_env_vars();
        ad_gpfs_get_env_vars();
        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

        /* collective buffering is enabled by default for reads and writes */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        if (fd->hints->cb_config_list != NULL)
            ADIOI_Free(fd->hints->cb_config_list);
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O: the info
         * object reports the communicator size while the cached hint is
         * left at -1 */
        MPI_Comm_size(fd->comm, &nprocs);
        MPL_snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* gpfs is not implementing file realms (ADIOI_IOStridedColl), so
         * the persistent-file-realm hints are initialized to disabled */

        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);

        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
                                         &(fd->hints->cb_buffer_size), myname, error_code);

        /* hints for enabling/disabling coll. buffering on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
                                             &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw.  no_indep_rw is a plain
             * boolean (it is tested with "== 1" and for truth below), so
             * store 0 rather than a tri-state ADIOI_HINT_* constant. */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
                                             &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw (boolean, see above) */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = 0;
        }

        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
                                          &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* hints for enabling/disabling data sieving on reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
                                             &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
                                             &(fd->hints->ds_write), myname, error_code);

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
                                         &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
                                         &(fd->hints->ind_rd_buffer_size), myname, error_code);

        /* only strictly positive values of romio_min_fdomain_size are used */
        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }

        /* Now we use striping unit in common code so we should
         * process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                                         &(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
        memset(value, 0, MPI_MAX_INFO_VAL + 1);
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            /* a new aggregator count forces the rank list to be rebuilt */
            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
        ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
        ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ((fd->hints->cb_read != ADIOI_HINT_DISABLE)
        && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
        && fd->hints->no_indep_rw) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not
         * support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: only the info key is removed; the cached
         * fd->hints->ind_wr_buffer_size is left alone because it is used
         * for other cases as well. -- Rob Ross, 04/22/2003 */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_LUSTRE_Open - open fd->filename on Lustre and record its striping.
 *
 * Translates fd->access_mode into open(2) flags, opens the file, and, when
 * the file is being created and striping hints are present in fd->info,
 * lets the first aggregator (or the sole process of MPI_COMM_SELF) push
 * those hints with LL_IOC_LOV_SETSTRIPE.  The layout is then read back with
 * LL_IOC_LOV_GETSTRIPE and stored in fd->hints and fd->info.  If direct I/O
 * was requested, a second descriptor is opened with O_DIRECT; failure to do
 * so only turns direct I/O off.
 *
 * Parameters:
 *   fd         - file being opened; fd_sys, fd_direct, fp_ind, fp_sys_posn
 *                and the striping hints are written here.
 *   error_code - MPI_SUCCESS, or an error code built from the errno of the
 *                failing open() call.
 */
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen, myrank, flag, set_layout = 0, err;
    int saved_errno = 0;        /* errno of the open() that failed, if any */
    struct lov_user_md *lum = NULL;
    char *value;
    ADIO_Offset str_factor = -1, str_unit = 0, start_iodev = -1;
    size_t value_sz = (MPI_MAX_INFO_VAL + 1) * sizeof(char);

#if defined(MPICH) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif

    MPI_Comm_rank(fd->comm, &myrank);

    if (fd->perm == ADIO_PERM_NULL) {
        /* no permissions given: derive them from the process umask */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    } else
        perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

    amode_direct = amode | O_DIRECT;

    /* odd length here because lov_user_md contains some fixed data and
     * then a list of 'lmm_objects' representing stripe */
    lumlen = sizeof(struct lov_user_md) + MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
    lum = (struct lov_user_md *) ADIOI_Calloc(1, lumlen);

    value = (char *) ADIOI_Malloc(value_sz);
    /* we already validated in LUSTRE_SetInfo that these are going to be the same */
    if (fd->info != MPI_INFO_NULL) {
        /* striping information */
        ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag);
        if (flag)
            str_unit = atoll(value);

        ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag);
        if (flag)
            str_factor = atoll(value);

        ADIOI_Info_get(fd->info, "romio_lustre_start_iodevice", MPI_MAX_INFO_VAL, value, &flag);
        if (flag)
            start_iodev = atoll(value);
    }
    if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0))
        set_layout = 1;

    /* if hints were set, we need to delay creation of any lustre objects.
     * However, if we open the file with O_LOV_DELAY_CREATE and don't call the
     * follow-up ioctl, subsequent writes will fail */
    if (myrank == 0 && set_layout)
        amode = amode | O_LOV_DELAY_CREATE;

    fd->fd_sys = open(fd->filename, amode, perm);
    if (fd->fd_sys == -1) {
        /* capture errno now: the cleanup calls at fn_exit may modify it
         * before the error code is built */
        saved_errno = errno;
        goto fn_exit;
    }

    /* we can only set these hints on new files */
    /* It was strange and buggy to open the file in the hint path.  Instead,
     * we'll apply the file tunings at open time */
    if ((amode & O_CREAT) && set_layout) {
        /* if user has specified striping info, first aggregator tries to set
         * it */
        if (myrank == fd->hints->ranklist[0] || fd->comm == MPI_COMM_SELF) {
            lum->lmm_magic = LOV_USER_MAGIC;
            lum->lmm_pattern = 0;
            /* crude check for overflow of lustre internal datatypes.
             * Silently cap to large value if user provides a value
             * larger than lustre supports */
            if (str_unit > UINT_MAX)
                lum->lmm_stripe_size = UINT_MAX;
            else
                lum->lmm_stripe_size = str_unit;

            if (str_factor > USHRT_MAX)
                lum->lmm_stripe_count = USHRT_MAX;
            else
                lum->lmm_stripe_count = str_factor;

            if (start_iodev > USHRT_MAX)
                lum->lmm_stripe_offset = USHRT_MAX;
            else
                lum->lmm_stripe_offset = start_iodev;
            err = ioctl(fd->fd_sys, LL_IOC_LOV_SETSTRIPE, lum);
            if (err == -1 && errno != EEXIST) {
                fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
                /* not a fatal error, but user might care to know */
            }
        }       /* End of striping parameters validation */
    }

    /* Pascal Deveze reports that, even though we pass a
     * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
     * struct are uninitialized, the call can give an error.  zero it out in
     * case there are other members that must be initialized and in case
     * lov_user_md struct changes in future */
    memset(lum, 0, lumlen);
    lum->lmm_magic = LOV_USER_MAGIC;
    err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) lum);
    if (!err) {
        fd->hints->striping_unit = lum->lmm_stripe_size;
        /* the stripe size may be as large as UINT_MAX (see the cap above),
         * so format it as unsigned to keep the info string non-negative */
        MPL_snprintf(value, value_sz, "%u", (unsigned) lum->lmm_stripe_size);
        ADIOI_Info_set(fd->info, "striping_unit", value);

        fd->hints->striping_factor = lum->lmm_stripe_count;
        MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_count);
        ADIOI_Info_set(fd->info, "striping_factor", value);

        fd->hints->start_iodevice = lum->lmm_stripe_offset;
        MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_offset);
        ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
    }

    if (fd->access_mode & ADIO_APPEND)
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
        fd->fd_direct = open(fd->filename, amode_direct, perm);
        if (fd->fd_direct != -1) {
            fd->d_mem = fd->d_miniosz = (1 << 12);
        } else {
            saved_errno = errno;
            perror("cannot open file with O_Direct");
            /* fall back to buffered I/O instead of failing the open */
            fd->direct_write = fd->direct_read = 0;
        }
    }

  fn_exit:
    ADIOI_Free(lum);
    ADIOI_Free(value);
    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && (fd->direct_write || fd->direct_read))) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, saved_errno);
    }
    /* --END ERROR HANDLING-- */
    else
        *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_LUSTRE_SetInfo - process Lustre-specific hints, then generic ones.
 *
 * When fd->info does not exist yet, it is created, the Lustre defaults are
 * installed, the user's striping and direct-I/O hints are copied in, and
 * the three striping values are compared against those of rank 0.  If they
 * differ on this process an error code is produced and the routine returns
 * early.  On every call the remaining Lustre hints are then read from
 * users_info and ADIOI_GEN_SetInfo handles the generic hints.
 *
 * Parameters:
 *   fd         - file whose info object and hints are updated.
 *   users_info - hints supplied by the user, or MPI_INFO_NULL.
 *   error_code - error from the consistency check on early return,
 *                otherwise MPI_SUCCESS.
 */
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *hint_buf;
    int found;
    int hints_agree;
    ADIO_Offset root_stripe[3];
    ADIO_Offset str_factor = -1, str_unit = 0, start_iodev = -1;
    int rank;
    static char myname[] = "ADIOI_LUSTRE_SETINFO";

#ifdef HAVE_LUSTRE_LOCKAHEAD
    /* lock ahead defaults: off for reads and writes, 500 extents, no flags */
    fd->hints->fs_hints.lustre.lock_ahead_read = 0;
    fd->hints->fs_hints.lustre.lock_ahead_write = 0;
    fd->hints->fs_hints.lustre.lock_ahead_num_extents = 500;
    fd->hints->fs_hints.lustre.lock_ahead_flags = 0;
#endif

    hint_buf = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));

    if ((fd->info) == MPI_INFO_NULL) {
        /* No info object yet, so this is part of the open call and the
         * striping parameters may still be chosen. */
        MPI_Info_create(&(fd->info));

        /* direct I/O is off unless the user asks for it */
        ADIOI_Info_set(fd->info, "direct_read", "false");
        ADIOI_Info_set(fd->info, "direct_write", "false");
        fd->direct_read = fd->direct_write = 0;

        /* defaults for the lustre-specific hints */
        ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
        fd->hints->fs_hints.lustre.co_ratio = 1;
        ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
        fd->hints->fs_hints.lustre.coll_threshold = 0;
        ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
        fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;

        /* pick up the user's striping and direct-I/O requests; whether the
         * striping values agree across processes is checked further down */
        if (users_info != MPI_INFO_NULL) {
            /* stripe size */
            ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, hint_buf, &found);
            if (found) {
                ADIOI_Info_set(fd->info, "striping_unit", hint_buf);
                str_unit = atoll(hint_buf);
            }

            /* stripe count */
            ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, hint_buf, &found);
            if (found) {
                ADIOI_Info_set(fd->info, "striping_factor", hint_buf);
                str_factor = atoll(hint_buf);
            }

            /* first I/O device */
            ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
                           MPI_MAX_INFO_VAL, hint_buf, &found);
            if (found) {
                ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", hint_buf);
                start_iodev = atoll(hint_buf);
            }

            /* direct read: only "true" or "TRUE" switches it on */
            ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, hint_buf, &found);
            if (found && (strcmp(hint_buf, "true") == 0 || strcmp(hint_buf, "TRUE") == 0)) {
                ADIOI_Info_set(fd->info, "direct_read", "true");
                fd->direct_read = 1;
            }

            /* direct write: same rule as direct read */
            ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, hint_buf, &found);
            if (found && (strcmp(hint_buf, "true") == 0 || strcmp(hint_buf, "TRUE") == 0)) {
                ADIOI_Info_set(fd->info, "direct_write", "true");
                fd->direct_write = 1;
            }
#ifdef HAVE_LUSTRE_LOCKAHEAD
            /* lock ahead on/off switches */
            ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_cb_lock_ahead_write",
                                             &(fd->hints->fs_hints.lustre.lock_ahead_write),
                                             myname, error_code);
            ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_cb_lock_ahead_read",
                                             &(fd->hints->fs_hints.lustre.lock_ahead_read),
                                             myname, error_code);

            /* the extent count and flags only matter when lock ahead is in
             * use for reads or writes */
            if (fd->hints->fs_hints.lustre.lock_ahead_read ||
                fd->hints->fs_hints.lustre.lock_ahead_write) {
                /* user's number of extents */
                ADIOI_Info_check_and_install_int(fd, users_info,
                                                 "romio_lustre_cb_lock_ahead_num_extents",
                                                 &(fd->hints->fs_hints.
                                                   lustre.lock_ahead_num_extents),
                                                 myname, error_code);

                /* publish the effective extent count (default or explicit)
                 * in fd->info */
                MPL_snprintf(hint_buf, MPI_MAX_INFO_VAL + 1, "%d",
                             fd->hints->fs_hints.lustre.lock_ahead_num_extents);
                ADIOI_Info_set(fd->info, "romio_lustre_cb_lock_ahead_num_extents", hint_buf);

                /* user's flags */
                ADIOI_Info_check_and_install_int(fd, users_info,
                                                 "romio_lustre_cb_lock_ahead_flags",
                                                 &(fd->hints->fs_hints.lustre.lock_ahead_flags),
                                                 myname, error_code);
            }
#endif
        }

        /* rank 0 publishes its striping values to everybody */
        MPI_Comm_rank(fd->comm, &rank);
        if (rank == 0) {
            root_stripe[0] = str_factor;
            root_stripe[1] = str_unit;
            root_stripe[2] = start_iodev;
        }
        MPI_Bcast(root_stripe, 3, MPI_OFFSET, 0, fd->comm);

        /* The file is not opened during hint processing; the open routines
         * deal with that (and with the EXCL flag).  Here we only verify
         * that this process holds the same values as rank 0. */
        hints_agree = (root_stripe[0] == str_factor)
            && (root_stripe[1] == str_unit)
            && (root_stripe[2] == start_iodev);
        if (!hints_agree) {
            MPIO_ERR_CREATE_CODE_INFO_NOT_SAME("ADIOI_LUSTRE_SetInfo",
                                               "str_factor or str_unit or start_iodev",
                                               error_code);
            ADIOI_Free(hint_buf);
            return;
        }
    }

    /* remaining lustre hints, processed on every call */
    if (users_info != MPI_INFO_NULL) {
        /* CO: IO clients per OST, to balance load between clients and OSTs */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_co_ratio",
                                         &(fd->hints->fs_hints.lustre.co_ratio),
                                         myname, error_code);

        /* coll_threshold: if the req size is bigger than this, collective
         * IO may not be performed */
        ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_coll_threshold",
                                         &(fd->hints->fs_hints.lustre.coll_threshold),
                                         myname, error_code);

        /* ds_in_coll: data sieving inside collective IO */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_lustre_ds_in_coll",
                                             &(fd->hints->fs_hints.lustre.ds_in_coll),
                                             myname, error_code);
    }

    /* collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    /* the generic hints might step on striping_unit, so install it again */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                                         NULL, myname, error_code);
    }

    /* the ADIOI_Direct_* switches force direct I/O regardless of the hints */
    if (ADIOI_Direct_read)
        fd->direct_read = 1;
    if (ADIOI_Direct_write)
        fd->direct_write = 1;

    ADIOI_Free(hint_buf);

    *error_code = MPI_SUCCESS;
}
/*
 * ADIOI_PVFS_SetInfo - process PVFS-specific hints, then the generic ones.
 *
 * When fd->info does not exist yet, it is created, list I/O is disabled by
 * default, and the user's striping_factor, striping_unit, start_iodevice,
 * romio_pvfs_listio_read and romio_pvfs_listio_write hints are read.  Each
 * hint that is present is compared with rank 0's value through a broadcast;
 * on a mismatch an error code is stored and the routine returns early.
 * ADIOI_GEN_SetInfo is called afterwards for the generic hints.
 *
 * NOTE(review): each broadcast is executed only by processes whose
 * users_info contains the corresponding key; presumably callers guarantee
 * that all processes pass the same set of keys - confirm.
 *
 * Parameters:
 *   fd         - file whose info object and hints are updated.
 *   users_info - hints supplied by the user, or MPI_INFO_NULL.
 *   error_code - error from a consistency check on early return, otherwise
 *                MPI_SUCCESS.
 */
void ADIOI_PVFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value;
    int flag, tmp_val, str_factor = -1, str_unit = -1, start_iodev = -1;
    static char myname[] = "ADIOI_PVFS_SETINFO";

    if ((fd->info) == MPI_INFO_NULL) {
        /* This must be part of the open call. can set striping parameters
         * if necessary. */
        MPI_Info_create(&(fd->info));
        ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", "disable");
        ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", "disable");
        fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
        fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;

        /* has user specified any pvfs-specific hints (striping params, listio)
         * and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
            /* scratch buffer; it must be released on every exit path below,
             * including the early error returns */
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));

            ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                str_factor = atoi(value);
                tmp_val = str_factor;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != str_factor) {
                    /* --BEGIN ERROR HANDLING-- */
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "striping_factor", error_code);
                    ADIOI_Free(value);
                    return;
                    /* --END ERROR HANDLING-- */
                } else
                    ADIOI_Info_set(fd->info, "striping_factor", value);
            }

            ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                str_unit = atoi(value);
                tmp_val = str_unit;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != str_unit) {
                    /* --BEGIN ERROR HANDLING-- */
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "striping_unit", error_code);
                    ADIOI_Free(value);
                    return;
                    /* --END ERROR HANDLING-- */
                } else
                    ADIOI_Info_set(fd->info, "striping_unit", value);
            }

            ADIOI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                start_iodev = atoi(value);
                tmp_val = start_iodev;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != start_iodev) {
                    /* --BEGIN ERROR HANDLING-- */
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "start_iodevice", error_code);
                    ADIOI_Free(value);
                    return;
                    /* --END ERROR HANDLING-- */
                } else
                    ADIOI_Info_set(fd->info, "start_iodevice", value);
            }

            /* list I/O on reads: unrecognized values leave the hint as is */
            ADIOI_Info_get(users_info, "romio_pvfs_listio_read", MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
                    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_ENABLE;
                } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
                    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_DISABLE;
                } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_read", value);
                    fd->hints->fs_hints.pvfs.listio_read = ADIOI_HINT_AUTO;
                }
                tmp_val = fd->hints->fs_hints.pvfs.listio_read;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != fd->hints->fs_hints.pvfs.listio_read) {
                    /* --BEGIN ERROR HANDLING-- */
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_pvfs_listio_read",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                    /* --END ERROR HANDLING-- */
                }
            }

            /* list I/O on writes: same handling as for reads */
            ADIOI_Info_get(users_info, "romio_pvfs_listio_write", MPI_MAX_INFO_VAL, value, &flag);
            if (flag) {
                if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
                    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_ENABLE;
                } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
                    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_DISABLE;
                } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) {
                    ADIOI_Info_set(fd->info, "romio_pvfs_listio_write", value);
                    fd->hints->fs_hints.pvfs.listio_write = ADIOI_HINT_AUTO;
                }
                tmp_val = fd->hints->fs_hints.pvfs.listio_write;
                MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
                if (tmp_val != fd->hints->fs_hints.pvfs.listio_write) {
                    /* --BEGIN ERROR HANDLING-- */
                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_pvfs_listio_write",
                                                       error_code);
                    ADIOI_Free(value);
                    return;
                    /* --END ERROR HANDLING-- */
                }
            }

            ADIOI_Free(value);
        }
    }

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    *error_code = MPI_SUCCESS;
}