/**
 * Open an element of a posix dataset.
 *
 * When the dataset uses the basic file mode the element's backing file is
 * opened immediately; if that fails the element object is released and the
 * error is returned. On success the posix implementations of the element
 * operations are installed on the element.
 *
 * @param[in] dataset  dataset the element belongs to
 * @param[in] element  element being opened
 *
 * @returns HIO_SUCCESS on success, otherwise the error code reported by
 *          builtin_posix_module_element_open_basic()
 */
static int builtin_posix_module_element_open (hio_dataset_t dataset, hio_element_t element) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context (&dataset->ds_object);
  const char *action;

  if (HIO_FILE_MODE_BASIC == dataset->ds_fmode) {
    int open_rc = builtin_posix_module_element_open_basic (posix_module, posix_dataset, element);

    if (HIO_SUCCESS != open_rc) {
      /* the element is unusable without its backing file */
      hioi_object_release (&element->e_object);
      return open_rc;
    }
  }

  action = (HIO_FLAG_WRITE & dataset->ds_flags) ? "created" : "opened";
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix: %s element %p (identifier %s) for dataset %s",
            action, element, hioi_object_identifier(element), hioi_object_identifier(dataset));

  /* install the posix element operations */
  element->e_close = builtin_posix_module_element_close;
  element->e_complete = builtin_posix_module_element_complete;
  element->e_flush = builtin_posix_module_element_flush;
  element->e_read_strided_nb = builtin_posix_module_element_read_strided_nb;
  element->e_write_strided_nb = builtin_posix_module_element_write_strided_nb;

  return HIO_SUCCESS;
}
/**
 * Translate an element offset into a positioned block file (strided mode).
 *
 * The element is divided into blocks of ds_bs bytes. Block n lives in file
 * (n % ds_fcount) at block slot (n / ds_fcount). The requested size is
 * clipped so that the access does not cross the end of the block that
 * contains @p offset. The matching file is opened if it is not already
 * cached in the dataset's open-file table and is then positioned at the
 * translated offset.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; reduced if the request crosses
 *                              a block boundary
 * @param[out]    file_out      file positioned at the translated offset
 *
 * @returns HIO_SUCCESS on success, HIO_ERR_OUT_OF_RESOURCE if the file path
 *          could not be allocated, or the error from builtin_posix_open_file()
 */
static int builtin_posix_element_translate_strided (builtin_posix_module_t *posix_module, hio_element_t element, uint64_t offset,
                                                    size_t *size, hio_file_t **file_out) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  size_t block_id, block_base, block_bound, block_offset, file_id, file_block;
  hio_context_t context = hioi_object_context (&element->e_object);
  hio_file_t *file;
  int32_t file_index;
  int rc;

  block_id = offset / posix_dataset->ds_bs;
  file_id = block_id % posix_dataset->ds_fcount;
  file_block = block_id / posix_dataset->ds_fcount;
  block_base = block_id * posix_dataset->ds_bs;
  block_bound = block_base + posix_dataset->ds_bs;
  block_offset = file_block * posix_dataset->ds_bs + offset - block_base;

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "builtin_posix_element_translate_strided: element: %s, offset: %"
            PRIu64 ", file_id: %lu, file_block: %lu, block_offset: %lu, block_size: %" PRIu64,
            hioi_object_identifier(element), offset, (unsigned long) file_id, (unsigned long) file_block,
            (unsigned long) block_offset, (uint64_t) posix_dataset->ds_bs);

  if (offset + *size > block_bound) {
    /* never cross a block boundary in a single access */
    *size = block_bound - offset;
  }

  /* the file id selects the slot in the open-file table */
  file_index = file_id % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + file_index;

  if (file_id != file->f_bid || file->f_element != element) {
    char *path;

    /* the path is only needed when the file has to be (re)opened */
    rc = asprintf (&path, "%s/data/%s_block.%08lu", posix_dataset->base_path, hioi_object_identifier(element),
                   (unsigned long) file_id);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    if (file->f_bid >= 0) {
      POSIX_TRACE_CALL(posix_dataset, hioi_file_close (file), "file_close", file->f_bid, 0);
    }
    file->f_bid = -1;

    file->f_element = element;

    POSIX_TRACE_CALL(posix_dataset, rc = builtin_posix_open_file (posix_module, posix_dataset, path, file),
                     "file_open", file_id, 0);
    /* the path is not needed once the open attempt has finished */
    free (path);
    if (HIO_SUCCESS != rc) {
      return rc;
    }

    file->f_bid = file_id;
  }

  POSIX_TRACE_CALL(posix_dataset, hioi_file_seek (file, block_offset, SEEK_SET), "file_seek", file->f_bid,
                   block_offset);

  *file_out = file;

  return HIO_SUCCESS;
}
/**
 * Write strided data to an element.
 *
 * The write itself is performed synchronously while holding the dataset's
 * mutex. If the caller supplied a request pointer, a request that is already
 * marked complete is allocated and returned through it.
 *
 * @param[in]  element  element to write to
 * @param[out] request  optional location for the completed request
 * @param[in]  offset   offset within the element
 * @param[in]  ptr      data to write
 * @param[in]  count    number of blocks
 * @param[in]  size     size of each block
 * @param[in]  stride   stride between blocks in memory
 *
 * @returns the dataset status after the write, or HIO_ERR_OUT_OF_RESOURCE
 *          if the request could not be allocated
 */
static int builtin_posix_module_element_write_strided_nb (hio_element_t element, hio_request_t *request, off_t offset,
                                                          const void *ptr, size_t count, size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) posix_dataset->base.ds_module;
  hio_context_t context = hioi_object_context (&element->e_object);
  hio_request_t completed;
  ssize_t nwritten;

  pthread_mutex_lock (&posix_dataset->lock);
  nwritten = builtin_posix_module_element_write_strided_internal (posix_module, element, offset, ptr,
                                                                  count, size, stride);
  pthread_mutex_unlock (&posix_dataset->lock);

  if (!request) {
    /* caller is not interested in a request object */
    return posix_dataset->base.ds_status;
  }

  completed = hioi_request_alloc (context);
  if (NULL == completed) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  *request = completed;
  completed->req_transferred = nwritten;
  completed->req_complete = true;
  completed->req_status = posix_dataset->base.ds_status;

  return posix_dataset->base.ds_status;
}
/**
 * Push an error caused by an MPI failure onto an error stack.
 *
 * The formatted message is extended with ": <MPI error string>". When
 * @p object is NULL the item is pushed onto the global error stack,
 * otherwise onto the error stack of the object's context. If memory cannot
 * be allocated, or the formatted message is empty, nothing is pushed.
 *
 * @param[in] mpirc   MPI error code
 * @param[in] object  object associated with the error (may be NULL)
 * @param[in] format  printf-style format of the error message
 */
void hioi_err_push_mpi (int mpirc, hio_object_t object, char *format, ...) {
  hio_context_t context = object ? hioi_object_context (object) : NULL;
  hio_error_stack_item_t *new_item;
  char mpi_error[MPI_MAX_ERROR_STRING] = "Unknown error";
  int resultlen = MPI_MAX_ERROR_STRING;
  size_t string_size;
  va_list vargs;
  char *temp;
  int rc;

  va_start (vargs, format);
  rc = vasprintf (&temp, format, vargs);
  va_end (vargs);

  if (0 > rc) {
    /* couldn't allocate error string */
    return;
  }

  if (0 == rc) {
    /* empty message: nothing is pushed, but the empty string was allocated */
    free (temp);
    return;
  }

  /* ignore the error code for this. on failure the default text is kept */
  (void) MPI_Error_string (mpirc, mpi_error, &resultlen);
  mpi_error[MPI_MAX_ERROR_STRING - 1] = '\0';

  new_item = calloc (1, sizeof (*new_item));
  if (NULL == new_item) {
    /* not much can be done here. we are just plain OOM. */
    free (temp);
    return;
  }

  new_item->hrc = hioi_err_mpi(mpirc);

  /* room for the message, ": ", the mpi error string, and the terminator */
  string_size = strlen (temp) + strlen (mpi_error) + 3;
  new_item->error_string = malloc (string_size);
  if (NULL == new_item->error_string) {
    free (new_item);
    free (temp);
    return;
  }

  /* append the mpi error to the hio error string */
  snprintf (new_item->error_string, string_size, "%s: %s", temp, mpi_error);

  /* done with this now */
  free (temp);

  /* push the error message onto the stack */
  if (NULL == context) {
    pthread_mutex_lock (&hio_error_stack_mutex);
    new_item->next = hio_error_stack_head;
    hio_error_stack_head = new_item;
    pthread_mutex_unlock (&hio_error_stack_mutex);
  } else {
    hioi_object_lock (&context->c_object);
    new_item->next = (hio_error_stack_item_t *) context->c_estack;
    context->c_estack = (void *) new_item;
    hioi_object_unlock (&context->c_object);
  }
}
/**
 * Find the context an object belongs to.
 *
 * Follows the parent links until an object without a parent is reached.
 * That root object is expected to be a context (checked with an assert).
 *
 * @param[in] object  object to start from (must not be NULL)
 *
 * @returns the context at the root of the object's parent chain
 */
hio_context_t hioi_object_context (hio_object_t object) {
  hio_object_t root = object;

  while (NULL != root->parent) {
    root = root->parent;
  }

  /* all objects have a context at the root */
  assert (HIO_OBJECT_TYPE_CONTEXT == root->type);

  return (hio_context_t) root;
}
/**
 * Process a list of internal read/write requests for a posix dataset.
 *
 * The requests are executed in order while the dataset object is locked.
 * Processing stops at the first request that reports a negative status or
 * when a user request cannot be allocated. For every request that
 * transferred data and carries a user request pointer, a completed request
 * is allocated and stored there.
 *
 * @param[in] dataset    dataset the requests belong to
 * @param[in] reqs       array of internal requests
 * @param[in] req_count  number of entries in @p reqs
 *
 * @returns HIO_SUCCESS, the negative status of the failing request, or
 *          HIO_ERR_OUT_OF_RESOURCE
 */
static int builtin_posix_module_process_reqs (hio_dataset_t dataset, hio_internal_request_t **reqs, int req_count) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context (&dataset->ds_object);
  uint64_t start, stop;
  int rc = HIO_SUCCESS;

  start = hioi_gettime ();

  hioi_object_lock (&dataset->ds_object);

  for (int i = 0 ; i < req_count ; ++i) {
    hio_internal_request_t *req = reqs[i];
    hio_request_t user_request;

    if (HIO_REQUEST_TYPE_READ != req->ir_type) {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_write_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                             req->ir_data.w, req->ir_count, req->ir_size,
                                                                                             req->ir_stride),
                       "element_write", req->ir_offset, req->ir_count * req->ir_size);
    } else {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_read_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                            req->ir_data.r, req->ir_count, req->ir_size,
                                                                                            req->ir_stride),
                       "element_read", req->ir_offset, req->ir_count * req->ir_size);
    }

    if (req->ir_status < 0) {
      /* a negative status is an error code. stop processing */
      rc = (int) req->ir_status;
      break;
    }

    if (!req->ir_urequest || 0 == req->ir_status) {
      /* no user request wanted or nothing was transferred */
      continue;
    }

    user_request = hioi_request_alloc (context);
    if (NULL == user_request) {
      rc = HIO_ERR_OUT_OF_RESOURCE;
      break;
    }

    req->ir_urequest[0] = user_request;
    user_request->req_transferred = req->ir_status;
    user_request->req_complete = true;
    user_request->req_status = HIO_SUCCESS;
  }

  hioi_object_unlock (&dataset->ds_object);

  stop = hioi_gettime ();

  builtin_posix_trace (posix_dataset, "process_requests", req_count, 0, start, stop);

  return rc;
}
/**
 * Initialize the posix specific parts of a dataset.
 *
 * Builds the dataset's base path
 * (<data root>/<context identifier>.hio/<dataset identifier>/<dataset id>)
 * and marks every entry of the open-file table as unused.
 *
 * @param[in]     module         module the dataset is opened on
 * @param[in,out] posix_dataset  dataset to initialize
 *
 * @returns HIO_SUCCESS on success or HIO_ERR_OUT_OF_RESOURCE if the base
 *          path could not be allocated
 */
static int builtin_posix_module_dataset_init (struct hio_module_t *module, builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  int rc;

  rc = asprintf (&posix_dataset->base_path, "%s/%s.hio/%s/%lu", module->data_root, hioi_object_identifier(context),
                 hioi_object_identifier (posix_dataset), (unsigned long) posix_dataset->base.ds_id);
  if (0 > rc) {
    /* an assert would vanish under NDEBUG. the pointer value is unspecified after
     * a failed asprintf so reset it before reporting the error */
    posix_dataset->base_path = NULL;
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    posix_dataset->files[i].f_bid = -1;
    posix_dataset->files[i].f_file.f_hndl = NULL;
  }

  return HIO_SUCCESS;
}
/**
 * Search a key-value list for a value matching a variable and apply it.
 *
 * @param[in] list    key-value list to search
 * @param[in] object  object the variable is associated with
 * @param[in] var     variable to set
 *
 * @returns the result of hioi_config_set_value_internal() for the first
 *          matching entry or HIO_SUCCESS if no entry matches
 *
 * An entry matches when its object type is HIO_OBJECT_TYPE_ANY or equal to
 * the object's type, its object identifier is NULL or equal to the object's
 * identifier, and its key equals the variable name.
 *
 * This function currently does a linear search of the list. In the future
 * this should be replaced with a hash table or similar structure.
 */
static int hioi_config_set_from_kv_list (hio_config_kv_list_t *list, hio_object_t object, hio_var_t *var) {
  hio_context_t context = hioi_object_context (object);

  for (int i = 0 ; i < list->kv_list_count ; ++i) {
    hio_config_kv_t *entry = &list->kv_list[i];

    if (HIO_OBJECT_TYPE_ANY != entry->object_type && object->type != entry->object_type) {
      /* entry is for a different type of object */
      continue;
    }

    if (NULL != entry->object_identifier && 0 != strcmp (object->identifier, entry->object_identifier)) {
      /* entry is for a different object */
      continue;
    }

    if (0 != strcmp (var->var_name, entry->key)) {
      /* entry is for a different variable */
      continue;
    }

    hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "Setting value for %s to %s from file",
              var->var_name, entry->value);

    return hioi_config_set_value_internal (context, var, entry->value);
  }

  return HIO_SUCCESS;
}
/**
 * Push an error message onto an error stack.
 *
 * When @p object is NULL the item is pushed onto the global error stack,
 * otherwise onto the error stack of the object's context (and the message is
 * also logged on that context). If memory cannot be allocated, or the
 * formatted message is empty, nothing is pushed.
 *
 * @param[in] hrc     hio error code
 * @param[in] object  object associated with the error (may be NULL)
 * @param[in] format  printf-style format of the error message
 */
void hioi_err_push (int hrc, hio_object_t object, char *format, ...) {
  hio_context_t context = object ? hioi_object_context (object) : NULL;
  hio_error_stack_item_t *new_item;
  va_list vargs;
  int rc;

  new_item = calloc (1, sizeof (hio_error_stack_item_t));
  if (NULL == new_item) {
    /* not much can be done here. we are just plain OOM. */
    return;
  }

  va_start (vargs, format);
  rc = vasprintf (&new_item->error_string, format, vargs);
  va_end (vargs);

  if (0 >= rc) {
    if (0 == rc) {
      /* an empty string was still allocated. a failed call (rc < 0) leaves the
       * pointer unspecified so it must not be freed in that case */
      free (new_item->error_string);
    }

    /* couldn't allocate error string */
    free (new_item);
    return;
  }

  if (context) {
    hioi_log (context, HIO_VERBOSE_ERROR, "%s", new_item->error_string);
  }

  new_item->hrc = hrc;

  /* push the error message onto the stack */
  if (NULL == context) {
    pthread_mutex_lock (&hio_error_stack_mutex);
    new_item->next = hio_error_stack_head;
    hio_error_stack_head = new_item;
    pthread_mutex_unlock (&hio_error_stack_mutex);
  } else {
    hioi_object_lock (&context->c_object);
    new_item->next = (hio_error_stack_item_t *) context->c_estack;
    context->c_estack = (void *) new_item;
    hioi_object_unlock (&context->c_object);
  }
}
/**
 * Read strided data from an element.
 *
 * The read is performed synchronously while the dataset object is locked.
 * If the caller supplied a request pointer, a request that is already marked
 * complete is allocated and returned through it. The request carries the
 * same status that is returned by this function.
 *
 * @param[in]  element  element to read from
 * @param[out] request  optional location for the completed request
 * @param[in]  offset   offset within the element
 * @param[out] ptr      destination buffer
 * @param[in]  count    number of blocks
 * @param[in]  size     size of each block
 * @param[in]  stride   stride between blocks in memory
 *
 * @returns HIO_SUCCESS, HIO_ERR_PERM if the dataset was not opened for
 *          reading, the error returned by the internal read, or
 *          HIO_ERR_OUT_OF_RESOURCE if the request could not be allocated
 */
static int builtin_posix_module_element_read_strided_nb (hio_element_t element, hio_request_t *request, off_t offset,
                                                         void *ptr, size_t count, size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) posix_dataset->base.ds_module;
  hio_context_t context = hioi_object_context (&element->e_object);
  ssize_t bytes_read;
  hio_request_t new_request;
  int rc = HIO_SUCCESS;

  if (!(posix_dataset->base.ds_flags & HIO_FLAG_READ)) {
    return HIO_ERR_PERM;
  }

  if (stride == 0) {
    /* contiguous in memory: treat as a single block */
    size *= count;
    count = 1;
  }

  hioi_object_lock (&posix_dataset->base.ds_object);
  bytes_read = builtin_posix_module_element_read_strided_internal (posix_module, element, offset, ptr,
                                                                   count, size, stride);
  hioi_object_unlock (&posix_dataset->base.ds_object);

  if (0 > bytes_read) {
    /* negative values are error codes */
    rc = (int) bytes_read;
  }

  /* see if a request was requested */
  if (request) {
    new_request = hioi_request_alloc (context);
    if (NULL == new_request) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    *request = new_request;
    new_request->req_transferred = bytes_read;
    new_request->req_complete = true;
    /* record the outcome on the request as the write path and process_reqs do */
    new_request->req_status = rc;
  }

  return rc;
}
/**
 * Set the value of a configuration variable on an object.
 *
 * The value is always recorded in the object's key-value store. If the
 * variable is already registered on the object it is also updated
 * immediately, unless it is read-only.
 *
 * @param[in] object    object to configure
 * @param[in] variable  name of the variable
 * @param[in] value     new value as a string
 *
 * @returns HIO_SUCCESS, HIO_ERR_BAD_PARAM if any argument is NULL,
 *          HIO_ERR_PERM if the variable is read-only, or the result of
 *          hioi_config_set_value_internal()
 */
int hio_config_set_value (hio_object_t object, const char *variable, const char *value) {
  int status = HIO_SUCCESS;
  int var_index;

  if (NULL == object || NULL == variable || NULL == value) {
    return HIO_ERR_BAD_PARAM;
  }

  hioi_object_lock (object);

  /* go ahead and push this value into the object's key-value store. if the
   * configuration parameter has not yet been registered it will be read from
   * this key-value store after the file store is checked. */
  hioi_config_list_kv_push (&object->config_set, hioi_object_identifier (object),
                            object->type, variable, value);

  var_index = hioi_var_lookup (&object->configuration, variable);
  if (0 <= var_index) {
    hio_var_t *target = &object->configuration.vars[var_index];

    if (HIO_VAR_FLAG_READONLY & target->var_flags) {
      hioi_err_push (HIO_ERR_PERM, object, "could not set read-only parameter: %s", variable);
      status = HIO_ERR_PERM;
    } else {
      status = hioi_config_set_value_internal (hioi_object_context(object), target, value);
    }
  }
  /* otherwise the variable does not exist (yet). nothing more to do */

  hioi_object_unlock (object);

  return status;
}
/**
 * Initialize the posix specific parts of a dataset.
 *
 * Builds the dataset's base path
 * (<data root>/<context identifier>.hio/<dataset identifier>/<dataset id>),
 * marks every entry of the open-file table as unused, and registers the
 * posix specific configuration variables (dataset_file_mode and, for
 * non-basic file modes, dataset_block_size).
 *
 * @param[in]     module         module the dataset is opened on
 * @param[in,out] posix_dataset  dataset to initialize
 *
 * @returns HIO_SUCCESS on success or HIO_ERR_OUT_OF_RESOURCE if the base
 *          path could not be allocated
 */
static int builtin_posix_module_dataset_init (struct hio_module_t *module, builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  int rc;

  rc = asprintf (&posix_dataset->base_path, "%s/%s.hio/%s/%lu", module->data_root, hioi_object_identifier(context),
                 hioi_object_identifier (posix_dataset), (unsigned long) posix_dataset->base.ds_id);
  if (0 > rc) {
    /* an assert would vanish under NDEBUG. the pointer value is unspecified after
     * a failed asprintf so reset it before reporting the error */
    posix_dataset->base_path = NULL;
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  /* initialize posix dataset specific data */
  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    posix_dataset->files[i].f_bid = -1;
    posix_dataset->files[i].f_hndl = NULL;
    posix_dataset->files[i].f_fd = -1;
  }

  /* default to strided output mode */
  posix_dataset->ds_fmode = HIO_FILE_MODE_STRIDED;
  hioi_config_add (context, &posix_dataset->base.ds_object, &posix_dataset->ds_fmode,
                   "dataset_file_mode", HIO_CONFIG_TYPE_INT32, &hioi_dataset_file_modes,
                   "Modes for writing dataset files. Valid values: (0: basic, 1: file_per_node, 2: strided)", 0);

  if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode && HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    /* strided mode only applies to shared datasets */
    posix_dataset->ds_fmode = HIO_FILE_MODE_BASIC;
  }

  if (HIO_FILE_MODE_BASIC != posix_dataset->ds_fmode) {
    /* 8 MiB default block size */
    posix_dataset->ds_bs = 1ul << 23;
    hioi_config_add (context, &posix_dataset->base.ds_object, &posix_dataset->ds_bs,
                     "dataset_block_size", HIO_CONFIG_TYPE_INT64, NULL,
                     "Block size to use when writing in optimized mode (default: 8M)", 0);
  }

  return HIO_SUCCESS;
}
/**
 * Write strided data to an element (caller must hold the dataset lock).
 *
 * The data is written as @p count blocks of @p size bytes. The blocks are
 * contiguous in the element starting at @p offset; after each block the
 * source pointer is advanced by an additional @p stride bytes. A stride of
 * 0 collapses the request into a single block of count * size bytes. Each
 * block may be split further by builtin_posix_element_translate().
 *
 * @param[in] posix_module  posix module instance
 * @param[in] element       element to write to
 * @param[in] offset        starting offset within the element
 * @param[in] ptr           data to write
 * @param[in] count         number of blocks
 * @param[in] size          size of each block
 * @param[in] stride        extra bytes to skip in memory after each block
 *
 * @returns the number of bytes written, or an hio error code when the
 *          translation failed or no data could be written. In the error
 *          case the code is also stored in the dataset's status.
 */
static ssize_t builtin_posix_module_element_write_strided_internal (builtin_posix_module_t *posix_module, hio_element_t element,
                                                                    off_t offset, const void *ptr, size_t count, size_t size,
                                                                    size_t stride) {
  hio_dataset_t dataset = hioi_element_dataset (element);
  /* the loop below advances offset so remember where the write started */
  const off_t base_offset = offset;
  size_t bytes_written = 0, ret;
  hio_file_t *file = NULL;
  uint64_t stop, start;
  int rc = HIO_SUCCESS;

  assert (dataset->ds_flags & HIO_FLAG_WRITE);

  if (!(count * size)) {
    return 0;
  }

  if (0 == stride) {
    size *= count;
    count = 1;
  }

  start = hioi_gettime ();

  errno = 0;

  for (size_t i = 0 ; i < count ; ++i) {
    size_t req = size, actual;

    do {
      actual = req;

      rc = builtin_posix_element_translate (posix_module, element, offset, &actual, &file, false);
      if (HIO_SUCCESS != rc) {
        /* file is not valid when the translation fails. do not touch it */
        break;
      }

      assert (file);

      ret = fwrite (ptr, 1, actual, file->f_hndl);
      if (ret > 0) {
        bytes_written += ret;
        file->f_offset += ret;
      }

      if (ret < actual) {
        /* short write */
        break;
      }

      req -= actual;
      offset += actual;
      ptr = (const char *) ptr + actual;
    } while (req);

    if (HIO_SUCCESS != rc || req) {
      break;
    }

    ptr = (const char *) ptr + stride;
  }

  if (0 == bytes_written || HIO_SUCCESS != rc) {
    if (HIO_SUCCESS == rc) {
      /* the translation succeeded but nothing was written. report the write
       * error. a translation error is kept as-is */
      rc = hioi_err_errno (errno);
    }

    dataset->ds_status = rc;
    return rc;
  }

  /* the element now extends at least to the end of the data just written */
  if (base_offset + bytes_written > element->e_size) {
    element->e_size = base_offset + bytes_written;
  }

  stop = hioi_gettime ();

  dataset->ds_stat.s_wtime += stop - start;
  dataset->ds_stat.s_bwritten += bytes_written;

  hioi_log (hioi_object_context (&element->e_object), HIO_VERBOSE_DEBUG_LOW, "posix: finished write. bytes written: "
            "%lu, time: %llu usec", (unsigned long) bytes_written, (unsigned long long) (stop - start));

  return bytes_written;
}
/**
 * Translate an element offset into a positioned data file (optimized mode).
 *
 * If the offset is already described by one of the element's segments, the
 * file holding it is used. Otherwise, when writing, space is reserved in
 * this process's data file and a new segment describing the range is added
 * to the element. When reading, an unknown offset is an error.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; may be modified by the
 *                              translation/reservation helpers
 * @param[out]    file_out      file positioned at the translated offset
 * @param[in]     reading       true if the translation is for a read
 *
 * @returns HIO_SUCCESS on success, the translation error when reading an
 *          unknown offset, HIO_ERR_OUT_OF_RESOURCE if a path could not be
 *          allocated, or the error from builtin_posix_open_file()
 */
static int builtin_posix_element_translate_opt (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                size_t *size, hio_file_t **file_out, bool reading) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  hio_context_t context = hioi_object_context (&element->e_object);
  builtin_posix_file_t *file;
  uint64_t file_offset;
  int file_index, internal_index;
  char *path;
  int rc;

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "translating element %s offset %ld size %lu",
            hioi_object_identifier (&element->e_object), (long) offset, (unsigned long) *size);

  rc = hioi_element_translate_offset (element, offset, &file_index, &file_offset, size);
  if (HIO_SUCCESS != rc) {
    if (reading) {
      hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset not found");
      /* not found */
      return rc;
    }

    if (hioi_context_using_mpi (context)) {
      rc = asprintf (&path, "%s/data.%x", posix_dataset->base_path, posix_dataset->base.ds_shared_control->s_master);
    } else {
      rc = asprintf (&path, "%s/data", posix_dataset->base_path);
    }

    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    file_offset = builtin_posix_reserve (posix_dataset, size);
    /* the file is registered under its name relative to the dataset's base path */
    file_index = hioi_dataset_add_file (&posix_dataset->base, strrchr (path, '/') + 1);
    hioi_element_add_segment (element, file_index, file_offset, offset, *size);
  } else {
    hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset found in file @ index %d, offset %lu, size %lu",
              file_index, (unsigned long) file_offset, (unsigned long) *size);

    rc = asprintf (&path, "%s/%s", posix_dataset->base_path, posix_dataset->base.ds_flist[file_index].f_name);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }
  }

  /* the file index selects the slot in the open-file table */
  internal_index = file_index % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + internal_index;

  /* f_bid records which file currently occupies the slot so it has to be compared
   * against the file index, not the slot index. otherwise two files that map to
   * the same slot could be mistaken for each other */
  if (file_index != file->f_bid) {
    if (NULL != file->f_file.f_hndl) {
      fclose (file->f_file.f_hndl);
      file->f_file.f_hndl = NULL;
      file->f_bid = -1;
    }

    rc = builtin_posix_open_file (posix_module, posix_dataset, path, &file->f_file);
    if (HIO_SUCCESS != rc) {
      free (path);
      return rc;
    }

    file->f_bid = file_index;
  }

  free (path);

  if (file_offset != file->f_file.f_offset) {
    fseek (file->f_file.f_hndl, file_offset, SEEK_SET);
    file->f_file.f_offset = file_offset;
  }

  *file_out = &file->f_file;

  return HIO_SUCCESS;
}
/**
 * Translate an element offset into a positioned block file (old optimized
 * mode).
 *
 * The element is divided into blocks of ds_bs bytes and each block is stored
 * in its own file named <element identifier>_block.<block id> under the
 * dataset's base path. The requested size is clipped so the access does not
 * cross the end of the block. When the dataset is open for writing, the
 * block file is registered with the dataset and a segment describing the
 * range is added to the element.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; reduced if the request crosses
 *                              a block boundary
 * @param[out]    file_out      file positioned at the translated offset
 *
 * @returns HIO_SUCCESS on success, HIO_ERR_OUT_OF_RESOURCE if a path could
 *          not be allocated, or the error from builtin_posix_open_file()
 */
static int builtin_posix_element_translate_opt_old (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                    size_t *size, hio_file_t **file_out) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  hio_context_t context = hioi_object_context (&element->e_object);
  const bool writing = !!(HIO_FLAG_WRITE & posix_dataset->base.ds_flags);
  size_t block_id, block_base, block_bound, block_offset;
  builtin_posix_file_t *file;
  int dataset_file_index = -1;
  int32_t file_index;
  char *name, *path;
  int rc;

  block_id = offset / posix_dataset->base.ds_bs;
  block_base = block_id * posix_dataset->base.ds_bs;
  block_bound = block_base + posix_dataset->base.ds_bs;
  block_offset = offset - block_base;

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "builtin_posix_element_translate: element: %s, offset: %lu, block_id: %lu, "
            "block_offset: %lu, block_size: %lu", hioi_object_identifier(element), (unsigned long) offset,
            (unsigned long) block_id, (unsigned long) block_offset, (unsigned long) posix_dataset->base.ds_bs);

  if (offset + *size > block_bound) {
    /* never cross a block boundary in a single access */
    *size = block_bound - offset;
  }

  rc = asprintf (&name, "%s_block.%lu", hioi_object_identifier(element), (unsigned long) block_id);
  if (0 > rc) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  if (writing) {
    dataset_file_index = hioi_dataset_add_file (&posix_dataset->base, name);
  }

  rc = asprintf (&path, "%s/%s", posix_dataset->base_path, name);
  free (name);
  if (0 > rc) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  /* use crc as a hash to pick a file index to use */
  file_index = hioi_crc32 ((uint8_t *) path, strlen (path)) % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + file_index;

  if (block_id != file->f_bid || file->f_element != element) {
    if (file->f_file.f_hndl != NULL) {
      fclose (file->f_file.f_hndl);
      file->f_file.f_hndl = NULL;
      file->f_bid = -1;
    }

    file->f_element = element;

    rc = builtin_posix_open_file (posix_module, posix_dataset, path, &file->f_file);
    if (HIO_SUCCESS != rc) {
      free (path);
      return rc;
    }

    file->f_bid = block_id;
  }

  /* the path is only needed to select the slot and open the file */
  free (path);

  if (block_offset != file->f_file.f_offset) {
    fseek (file->f_file.f_hndl, block_offset, SEEK_SET);
    file->f_file.f_offset = block_offset;
  }

  if (writing) {
    hioi_element_add_segment (element, dataset_file_index, block_offset, offset, *size);
  }

  *file_out = &file->f_file;

  return HIO_SUCCESS;
}
static int builtin_posix_element_translate_opt (builtin_posix_module_t *posix_module, hio_element_t element, uint64_t offset, size_t *size, hio_file_t **file_out, bool reading) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element); hio_context_t context = hioi_object_context (&element->e_object); hio_file_t *file; uint64_t file_offset; int file_index = 0; char *path; int rc; hioi_log (context, HIO_VERBOSE_DEBUG_MED, "translating element %s offset %" PRIu64 " size %lu", hioi_object_identifier (&element->e_object), offset, *size); POSIX_TRACE_CALL(posix_dataset, rc = hioi_element_translate_offset (element, offset, &file_index, &file_offset, size), "translate_offset", offset, *size); #if HIO_MPI_HAVE(3) if (HIO_SUCCESS != rc && reading) { POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_map_translate_offset (element, offset, &file_index, &file_offset, size), "map_translate_offset", offset, *size); } #endif if (HIO_SUCCESS != rc) { if (reading) { hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset %" PRIu64 " not found", offset); /* not found */ return rc; } file_offset = builtin_posix_reserve (posix_dataset, size); if (hioi_context_using_mpi (context)) { file_index = posix_dataset->base.ds_shared_control->s_master; } else { file_index = 0; } rc = asprintf (&path, "%s/data/data.%x", posix_dataset->base_path, file_index); if (0 > rc) { return HIO_ERR_OUT_OF_RESOURCE; } hioi_element_add_segment (element, file_index, file_offset, offset, *size); } else { hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset found in file @ rank %d, offset %" PRIu64 ", size %lu", file_index, file_offset, *size); rc = asprintf (&path, "%s/data/data.%x", posix_dataset->base_path, file_index); if (0 > rc) { return HIO_ERR_OUT_OF_RESOURCE; } if (access (path, R_OK)) { free (path); rc = asprintf (&path, "%s/data.%x", posix_dataset->base_path, file_index); if (0 > rc) { return HIO_ERR_OUT_OF_RESOURCE; } } } /* use crc as a hash to pick a 
file index to use */ int internal_index = file_index % HIO_POSIX_MAX_OPEN_FILES; file = posix_dataset->files + internal_index; if (file_index != file->f_bid) { if (file->f_bid >= 0) { POSIX_TRACE_CALL(posix_dataset, hioi_file_close (file), "file_close", file->f_bid, 0); } file->f_bid = -1; POSIX_TRACE_CALL(posix_dataset, rc = builtin_posix_open_file (posix_module, posix_dataset, path, file), "file_open", file_index, 0); if (HIO_SUCCESS != rc) { free (path); return rc; } file->f_bid = file_index; } free (path); POSIX_TRACE_CALL(posix_dataset, hioi_file_seek (file, file_offset, SEEK_SET), "file_seek", file->f_bid, file_offset); *file_out = file; return HIO_SUCCESS; }
/**
 * Close a posix dataset.
 *
 * Closes all cached data files. For datasets opened for writing the
 * manifest is gathered and written by rank 0 to manifest.json (with a .bz2
 * suffix when bzip is in use) under the dataset's base path. When MPI is in
 * use all ranks then agree on the minimum return code. Finally the
 * dataset's base path and lock are released.
 *
 * @param[in] dataset  dataset to close
 *
 * @returns HIO_SUCCESS on success or an hio error code
 */
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  hio_module_t *module = dataset->ds_module;
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;

  start = hioi_gettime ();

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    if (posix_dataset->files[i].f_file.f_hndl != NULL) {
      fclose (posix_dataset->files[i].f_file.f_hndl);
      posix_dataset->files[i].f_file.f_hndl = NULL;
    }
  }

  if (dataset->ds_flags & HIO_FLAG_WRITE) {
    rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, dataset->ds_use_bzip);
    if (HIO_SUCCESS != rc) {
      dataset->ds_status = rc;
    }

    if (0 == context->c_rank) {
      char *path;

      rc = asprintf (&path, "%s/manifest.json%s", posix_dataset->base_path, dataset->ds_use_bzip ? ".bz2" : "");
      if (0 > rc) {
        rc = HIO_ERR_OUT_OF_RESOURCE;
      } else {
        int saved_errno = 0;
        int fd;

        fd = open (path, O_CREAT | O_WRONLY, posix_module->access_mode);
        if (0 > fd) {
          saved_errno = errno;
        } else {
          const unsigned char *cursor = manifest;
          size_t remaining = (NULL != manifest) ? manifest_size : 0;

          /* write() may transfer less than requested. keep going until done */
          while (remaining > 0) {
            ssize_t written = write (fd, cursor, remaining);

            if (0 > written) {
              if (EINTR == errno) {
                continue;
              }

              saved_errno = errno;
              break;
            }

            if (0 == written) {
              /* no progress. avoid spinning forever */
              saved_errno = EIO;
              break;
            }

            cursor += written;
            remaining -= (size_t) written;
          }

          if (0 != close (fd) && 0 == saved_errno) {
            saved_errno = errno;
          }
        }

        rc = hioi_err_errno (saved_errno);
        free (path);

        if (HIO_SUCCESS != rc) {
          hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest");
        }
      }

      /* release the manifest on every path, including a failed asprintf */
      free (manifest);
      manifest = NULL;
    }
  }

#if HIO_USE_MPI
  /* ensure all ranks have closed the dataset before continuing */
  if (hioi_context_using_mpi (context)) {
    MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm);
  }
#endif

  free (posix_dataset->base_path);

  pthread_mutex_destroy (&posix_dataset->lock);

  stop = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_close: successfully closed posix dataset %s:%llu on data root %s. "
            "close time %lu usec", hioi_object_identifier(dataset), (unsigned long long) dataset->ds_id, module->data_root,
            (unsigned long) (stop - start));

  return rc;
}
/**
 * Open a dataset on a posix data root.
 *
 * Initializes the posix specific dataset state, queries the filesystem and
 * registers the striping related configuration variables when striping is
 * supported. Rank 0 then either loads the existing manifest or creates the
 * dataset directories and serializes a fresh manifest. The result is shared
 * with all processes through hioi_dataset_scatter(). On success the posix
 * dataset operations are installed on the dataset.
 *
 * @param[in]     module   module the dataset is opened on
 * @param[in,out] dataset  dataset to open
 *
 * @returns HIO_SUCCESS on success or an hio error code
 */
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  hio_fs_attr_t *fs_attr;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  char *path = NULL;

  start = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x",
            hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context),
            dataset->ds_flags, dataset->ds_mode);

  rc = builtin_posix_module_dataset_init (module, posix_dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  fs_attr = &posix_dataset->base.ds_fsattr;

  rc = hioi_fs_query (context, module->data_root, fs_attr);
  if (HIO_SUCCESS != rc) {
    hioi_err_push (rc, &context->c_object, "posix: error querying the filesystem");
    /* the base path was allocated by the init call above */
    free (posix_dataset->base_path);
    posix_dataset->base_path = NULL;
    return rc;
  }

  if (fs_attr->fs_flags & HIO_FS_SUPPORTS_STRIPING) {
    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_scount,
                     "stripe_count", HIO_CONFIG_TYPE_UINT32, NULL, "Stripe count for all dataset "
                     "data files", 0);

    if (fs_attr->fs_scount > fs_attr->fs_smax_count) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe count %u exceeds the available resources. "
                "adjusting to maximum %u", fs_attr->fs_scount, fs_attr->fs_smax_count);
      fs_attr->fs_scount = fs_attr->fs_smax_count;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_ssize,
                     "stripe_size", HIO_CONFIG_TYPE_UINT64, NULL, "Stripe size for all dataset "
                     "data files", 0);

    /* ensure the stripe size is a multiple of the stripe unit. a stripe unit of
     * zero would divide by zero so the size is left untouched in that case */
    if (0 != fs_attr->fs_sunit) {
      fs_attr->fs_ssize = fs_attr->fs_sunit * ((fs_attr->fs_ssize + fs_attr->fs_sunit - 1) / fs_attr->fs_sunit);
    }

    if (fs_attr->fs_ssize > fs_attr->fs_smax_size) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe size %" PRIu64 " exceeds the maximum %"
                PRIu64 ". ", fs_attr->fs_ssize, fs_attr->fs_smax_size);
      fs_attr->fs_ssize = fs_attr->fs_smax_size;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_raid_level,
                     "raid_level", HIO_CONFIG_TYPE_UINT64, NULL, "RAID level for dataset "
                     "data files. Keep in mind that some filesystems only support 1/2 RAID "
                     "levels", 0);

    if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
      fs_attr->fs_scount = 1;
      fs_attr->fs_ssize = dataset->ds_bs;
      fs_attr->fs_use_group_locking = true;
    }
  }

  /* only rank 0 touches the filesystem here. any failure is recorded in rc and
   * handed to hioi_dataset_scatter below rather than returned directly */
  do {
    if (0 != context->c_rank) {
      break;
    }

    if (dataset->ds_flags & HIO_FLAG_TRUNC) {
      /* blow away the existing dataset */
      (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset), dataset->ds_id);

      /* ensure we take the create path later */
      dataset->ds_flags |= HIO_FLAG_CREAT;
    }

    if (!(dataset->ds_flags & HIO_FLAG_CREAT)) {
      /* load manifest. the manifest data will be shared with other processes in hioi_dataset_scatter */
      rc = asprintf (&path, "%s/manifest.json.bz2", posix_dataset->base_path);
      if (0 > rc) {
        rc = HIO_ERR_OUT_OF_RESOURCE;
        break;
      }

      if (access (path, F_OK)) {
        /* no compressed manifest. look for an uncompressed one */
        free (path);
        rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
        if (0 > rc) {
          rc = HIO_ERR_OUT_OF_RESOURCE;
          break;
        }

        if (access (path, F_OK)) {
          hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest");
          free (path);
          rc = HIO_ERR_NOT_FOUND;
          break;
        }
      }

      rc = hioi_manifest_read (path, &manifest, &manifest_size);
      free (path);
    } else {
      rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset);
      if (HIO_SUCCESS != rc) {
        break;
      }

      rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, true);
    }
  } while (0);

  /* share dataset information with all processes in the communication domain */
  rc = hioi_dataset_scatter (dataset, manifest, manifest_size, rc);
  /* the local copy of the manifest is not needed whether or not the scatter succeeded */
  free (manifest);
  if (HIO_SUCCESS != rc) {
    free (posix_dataset->base_path);
    return rc;
  }

  if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
    if (HIO_SET_ELEMENT_UNIQUE == dataset->ds_mode || 2 > context->c_size || NULL == dataset->ds_shared_control) {
      posix_dataset->base.ds_fmode = HIO_FILE_MODE_BASIC;
      /* NTH: no optimized mode for N->N yet */
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this "
                "dataset mode. falling back to basic file mode");
    }
  }

  dataset->ds_module = module;
  dataset->ds_close = builtin_posix_module_dataset_close;
  dataset->ds_element_open = builtin_posix_module_element_open;
  dataset->ds_process_reqs = builtin_posix_module_process_reqs;

  pthread_mutex_init (&posix_dataset->lock, NULL);

  /* record the open time */
  gettimeofday (&dataset->ds_otime, NULL);

  stop = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset %s:%llu on data root %s. "
            "open time %lu usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened",
            hioi_object_identifier(dataset), (unsigned long long) dataset->ds_id, module->data_root,
            (unsigned long) (stop - start));

  return HIO_SUCCESS;
}
/**
 * Distribute the ids of a dataset's manifest files over the node leaders.
 *
 * Rank 0 scans the dataset's base path for entries matching
 * "manifest.<hex id>.json", sorts the ids, and pads the list with -1 up to a
 * multiple of the node count. The list is then scattered evenly over the
 * node leader communicator. Processes that are not node leaders return
 * immediately with an empty list.
 *
 * @param[in]  posix_dataset  dataset to scan
 * @param[out] manifest_ids   newly allocated array with this process's share
 *                            of the ids (NULL if there are none). To be
 *                            released by the caller with free().
 * @param[out] count          number of entries in @p manifest_ids
 *
 * @returns HIO_SUCCESS on success or an hio error code. @p count is 0 and
 *          @p manifest_ids is NULL on error.
 */
static int builtin_posix_module_dataset_manifest_list (builtin_posix_module_dataset_t *posix_dataset, int **manifest_ids,
                                                       size_t *count) {
  hio_context_t context = hioi_object_context (&posix_dataset->base.ds_object);
  int num_manifest_ids = 0, manifest_id_index = 0;
  unsigned int manifest_id;
  int rc = HIO_SUCCESS;
  int *tmp = NULL;
  struct dirent *dp;
  DIR *dir = NULL;

  *manifest_ids = NULL;
  *count = 0;

  rc = hioi_context_generate_leader_list (context);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  if (0 != context->c_shared_rank) {
    /* only node leaders take part in the distribution */
    return HIO_SUCCESS;
  }

  /* errors on rank 0 are stored in num_manifest_ids (as a negative value) so they
   * reach the other leaders through the broadcast below */
  do {
    if (0 != context->c_rank) {
      break;
    }

    dir = opendir (posix_dataset->base_path);
    if (NULL == dir) {
      num_manifest_ids = hioi_err_errno (errno);
      break;
    }

    /* sscanf returns EOF (not 0) when the name ends before the first conversion,
     * so only a return value of 1 identifies a manifest file */
    while (NULL != (dp = readdir (dir))) {
      if (dp->d_name[0] != '.' && 1 == sscanf (dp->d_name, "manifest.%x.json", &manifest_id)) {
        ++num_manifest_ids;
      }
    }

    if (0 == num_manifest_ids) {
      break;
    }

    /* round up to a multiple of the number nodes */
    num_manifest_ids = context->c_node_count * ((num_manifest_ids + context->c_node_count - 1) / context->c_node_count);

    tmp = malloc (num_manifest_ids * sizeof (*tmp));
    if (NULL == tmp) {
      num_manifest_ids = HIO_ERR_OUT_OF_RESOURCE;
      break;
    }

    /* unused (padding) entries are set to -1 */
    memset (tmp, 0xff, sizeof (*tmp) * num_manifest_ids);

    rewinddir (dir);

    /* the bound protects against files appearing between the two passes */
    while (manifest_id_index < num_manifest_ids && NULL != (dp = readdir (dir))) {
      if ('.' == dp->d_name[0] || 1 != sscanf (dp->d_name, "manifest.%x.json", &manifest_id)) {
        continue;
      }

      tmp[manifest_id_index++] = (int) manifest_id;
    }

    /* put manifest files in numerical order */
    qsort (tmp, manifest_id_index, sizeof (*tmp), manifest_index_compare);
  } while (0);

  if (NULL != dir) {
    closedir (dir);
  }

  if (num_manifest_ids > 0) {
    /* number of ids handed to each node leader */
    num_manifest_ids /= context->c_node_count;
  }

  MPI_Bcast (&num_manifest_ids, 1, MPI_INT, 0, context->c_node_leader_comm);

  if (0 < num_manifest_ids) {
    *manifest_ids = malloc (num_manifest_ids * sizeof (**manifest_ids));
    /* every leader has to enter the scatter so there is no clean way to back out here */
    assert (NULL != *manifest_ids);

    MPI_Scatter (tmp, num_manifest_ids, MPI_INT, *manifest_ids, num_manifest_ids, MPI_INT, 0,
                 context->c_node_leader_comm);
  }

  free (tmp);

  if (0 > num_manifest_ids) {
    /* error code from rank 0. count stays 0 */
    return num_manifest_ids;
  }

  *count = (size_t) num_manifest_ids;

  return HIO_SUCCESS;
}
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset; hio_context_t context = hioi_object_context ((hio_object_t) dataset); hio_module_t *module = dataset->ds_module; unsigned char *manifest = NULL; uint64_t start, stop; int rc = HIO_SUCCESS; size_t manifest_size; start = hioi_gettime (); for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) { if (posix_dataset->files[i].f_bid >= 0) { POSIX_TRACE_CALL(posix_dataset, hioi_file_close (posix_dataset->files + i), "file_close", posix_dataset->files[i].f_bid, 0); } } #if HIO_MPI_HAVE(3) /* release the shared state if it was allocated */ (void) hioi_dataset_shared_fini (dataset); /* release the dataset map if one was allocated */ (void) hioi_dataset_map_release (dataset); #endif if (dataset->ds_flags & HIO_FLAG_WRITE) { char *path; /* write manifest header */ POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, false, true), "gather_manifest", 0, 0); if (HIO_SUCCESS != rc) { dataset->ds_status = rc; } if (0 == context->c_rank) { rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path); if (0 > rc) { /* out of memory. 
not much we can do now */ return hioi_err_errno (errno); } rc = hioi_manifest_save (dataset, manifest, manifest_size, path); free (manifest); free (path); if (HIO_SUCCESS != rc) { hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest"); } } #if HIO_MPI_HAVE(3) if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { /* optimized mode requires a data manifest to describe how the data landed on the filesystem */ POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest_comm (dataset, context->c_shared_comm, &manifest, &manifest_size, posix_dataset->ds_use_bzip, false), "gather_manifest", 0, 0); if (HIO_SUCCESS != rc) { dataset->ds_status = rc; } if (NULL != manifest) { rc = asprintf (&path, "%s/manifest.%x.json%s", posix_dataset->base_path, context->c_rank, posix_dataset->ds_use_bzip ? ".bz2" : ""); if (0 > rc) { return hioi_err_errno (errno); } rc = hioi_manifest_save (dataset, manifest, manifest_size, path); free (manifest); free (path); if (HIO_SUCCESS != rc) { hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest"); } } } #endif } #if HIO_MPI_HAVE(1) /* ensure all ranks have closed the dataset before continuing */ if (hioi_context_using_mpi (context)) { MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm); } #endif free (posix_dataset->base_path); stop = hioi_gettime (); builtin_posix_trace (posix_dataset, "close", 0, 0, start, stop); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully closed posix dataset " "%s:%" PRIu64 " on data root %s. close time %" PRIu64 " usec", hioi_object_identifier(dataset), dataset->ds_id, module->data_root, stop - start); builtin_posix_trace (posix_dataset, "trace_end", 0, 0, 0, 0); if (posix_dataset->ds_trace_fh) { fclose (posix_dataset->ds_trace_fh); } return rc; }
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset; builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module; hio_context_t context = hioi_object_context ((hio_object_t) dataset); unsigned char *manifest = NULL; size_t manifest_size = 0; uint64_t start, stop; int rc = HIO_SUCCESS; char *path = NULL; start = hioi_gettime (); hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x", hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context), dataset->ds_flags, dataset->ds_mode); rc = builtin_posix_module_dataset_init (module, posix_dataset); if (HIO_SUCCESS != rc) { return rc; } rc = builtin_posix_module_setup_striping (context, module, dataset); if (HIO_SUCCESS != rc) { return rc; } if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode) { hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_fcount, "dataset_file_count", HIO_CONFIG_TYPE_UINT64, NULL, "Number of files to use " "in strided file mode", 0); } else if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { posix_dataset->ds_use_bzip = true; hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_use_bzip, "dataset_use_bzip", HIO_CONFIG_TYPE_BOOL, NULL, "Use bzip2 compression for dataset manifests", 0); } if (dataset->ds_flags & HIO_FLAG_TRUNC) { /* blow away the existing dataset */ if (0 == context->c_rank) { (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset), dataset->ds_id); } } if (!(dataset->ds_flags & HIO_FLAG_CREAT)) { if (0 == context->c_rank) { /* load manifest. 
the manifest data will be shared with other processes in hioi_dataset_scatter */ rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path); assert (0 < rc); if (access (path, F_OK)) { /* this should never happen on a valid dataset */ free (path); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest %s", path); rc = HIO_ERR_NOT_FOUND; } else { rc = HIO_SUCCESS; } } /* read the manifest if it exists */ if (HIO_SUCCESS == rc) { hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: loading manifest header from %s...", path); rc = hioi_manifest_read (path, &manifest, &manifest_size); free (path); path = NULL; } } else if (0 == context->c_rank) { rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset); if (HIO_SUCCESS == rc) { /* serialize the manifest to send to remote ranks */ rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, false, false); } } #if HIO_MPI_HAVE(1) /* share dataset header will all processes in the communication domain */ rc = hioi_dataset_scatter_comm (dataset, context->c_comm, manifest, manifest_size, rc); #endif free (manifest); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } if (context->c_enable_tracing) { char *path; rc = asprintf (&path, "%s/trace/trace.%d", posix_dataset->base_path, context->c_rank); if (rc > 0) { posix_dataset->ds_trace_fh = fopen (path, "a"); free (path); } builtin_posix_trace (posix_dataset, "trace_begin", 0, 0, 0, 0); } #if HIO_MPI_HAVE(3) if (!(dataset->ds_flags & HIO_FLAG_CREAT) && HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { rc = bultin_posix_scatter_data (posix_dataset); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } } /* if possible set up a shared memory window for this dataset */ POSIX_TRACE_CALL(posix_dataset, hioi_dataset_shared_init (dataset, 1), "shared_init", 0, 0); if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { if (2 > context->c_size || NULL == 
dataset->ds_shared_control) { /* no point in using optimized mode in this case */ posix_dataset->ds_fmode = HIO_FILE_MODE_BASIC; hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this " "dataset mode. falling back to basic file mode, path: %s", posix_dataset->base_path); } else if (HIO_SET_ELEMENT_SHARED == dataset->ds_mode) { POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_generate_map (dataset), "generate_map", 0, 0); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } } } /* NTH: if requested more code is needed to load an optimized dataset with an older MPI */ #endif /* HIO_MPI_HAVE(3) */ dataset->ds_module = module; dataset->ds_close = builtin_posix_module_dataset_close; dataset->ds_element_open = builtin_posix_module_element_open; dataset->ds_process_reqs = builtin_posix_module_process_reqs; /* record the open time */ gettimeofday (&dataset->ds_otime, NULL); stop = hioi_gettime (); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset " "%s:%" PRIu64 " on data root %s. open time %" PRIu64 " usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened", hioi_object_identifier(dataset), dataset->ds_id, module->data_root, stop - start); builtin_posix_trace (posix_dataset, "open", 0, 0, start, stop); return HIO_SUCCESS; }
/**
 * Load the optimized-mode data manifests and distribute their contents.
 *
 * For HIO_SET_ELEMENT_UNIQUE datasets each rank looks only for the manifest
 * named after its own rank. Otherwise the ids to read come from
 * builtin_posix_module_dataset_manifest_list(). Each manifest is looked up
 * first as "manifest.<id>.json.bz2" and then as "manifest.<id>.json"; a
 * missing file is not an error. Everything that was read is merged and handed
 * to hioi_dataset_scatter_unique() or hioi_dataset_scatter_comm() (over
 * c_shared_comm) together with the current status.
 *
 * An error while listing or reading manifests does not return early. It is
 * passed into the scatter call, as the read-failure path has always done, so
 * that every rank that enters the scatter is matched by its peers.
 *
 * @param[in,out] posix_dataset  dataset being opened
 *
 * @returns the status reported by the scatter call.
 */
static int bultin_posix_scatter_data (builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  size_t manifest_size = 0, manifest_id_count = 0;
  unsigned char *manifest = NULL;
  int rc = HIO_SUCCESS;
  int *manifest_ids = NULL;
  char *path = NULL;

  if (HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    /* only read the manifest this rank wrote */
    manifest_ids = malloc (sizeof (*manifest_ids));
    assert (NULL != manifest_ids);
    manifest_ids[0] = context->c_rank;
    manifest_id_count = 1;
  } else {
    rc = builtin_posix_module_dataset_manifest_list (posix_dataset, &manifest_ids, &manifest_id_count);
    if (HIO_SUCCESS != rc) {
      /* nothing can be read, but the scatter below must still be entered:
       * ranks that are not node leaders always get HIO_SUCCESS from the list
       * call and will be waiting there. */
      manifest_id_count = 0;
    }
  }

  for (size_t i = 0 ; i < manifest_id_count ; ++i) {
    unsigned char *tmp = NULL;
    size_t tmp_size = 0;
    int path_len;

    if (-1 == manifest_ids[i]) {
      /* nothing more to do */
      break;
    }

    hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: reading manifest data from id %x\n",
              manifest_ids[i]);

    /* when writing the manifest in optimized mode each IO manager writes its own manifest. try
     * to open the manifest. if a manifest does not exist then it is likely this rank did not
     * write a manifest. IO managers will distribute the manifest data to the appropriate ranks
     * in hioi_dataset_scatter(). */
    /* the asprintf result lives in its own variable so the byte count can
     * never end up in rc and be mistaken for a status code */
    path_len = asprintf (&path, "%s/manifest.%x.json.bz2", posix_dataset->base_path, manifest_ids[i]);
    assert (0 < path_len);

    if (access (path, F_OK)) {
      free (path);

      /* Check for a non-bzip'd manifest file. */
      path_len = asprintf (&path, "%s/manifest.%x.json", posix_dataset->base_path, manifest_ids[i]);
      assert (0 < path_len);

      if (access (path, F_OK)) {
        /* no manifest found. this might be a non-optimized file format or this rank may not be an
         * IO master rank. */
        free (path);
        path = NULL;
      }
    }

    (void) path_len;

    if (NULL == path) {
      continue;
    }

    /* read the manifest and fold it into the merged result */
    rc = hioi_manifest_read (path, &tmp, &tmp_size);
    if (HIO_SUCCESS == rc) {
      rc = hioi_manifest_merge_data2 (&manifest, &manifest_size, tmp, tmp_size);
      free (tmp);
    }

    free (path);
    path = NULL;

    if (HIO_SUCCESS != rc) {
      break;
    }
  }

  /* share dataset information with all processes on this node */
  if (HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    rc = hioi_dataset_scatter_unique (&posix_dataset->base, manifest, manifest_size, rc);
  } else {
    rc = hioi_dataset_scatter_comm (&posix_dataset->base, context->c_shared_comm, manifest, manifest_size, rc);
  }

  free (manifest_ids);
  free (manifest);

  return rc;
}