/**
 * Run a batch of internal read/write requests against a posix dataset.
 *
 * The dataset object lock is held for the whole batch. Each request is
 * dispatched to the strided read or write helper (any type other than
 * HIO_REQUEST_TYPE_READ is handled as a write) and the helper's return value
 * is stored in the request's ir_status. When a request carries a user request
 * slot and transferred at least one byte, a new, already-complete user
 * request is allocated and stored in that slot.
 *
 * Processing stops at the first request whose status is negative or whose
 * user request could not be allocated; later requests are left untouched.
 *
 * @param dataset    dataset the requests belong to
 * @param reqs       array of request pointers
 * @param req_count  number of entries in reqs
 *
 * @returns HIO_SUCCESS if every request was processed, the negative status of
 *          the failing request, or HIO_ERR_OUT_OF_RESOURCE if a user request
 *          could not be allocated.
 */
static int builtin_posix_module_process_reqs (hio_dataset_t dataset, hio_internal_request_t **reqs, int req_count) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context (&dataset->ds_object);
  int rc = HIO_SUCCESS;
  uint64_t start, stop;

  start = hioi_gettime ();

  hioi_object_lock (&dataset->ds_object);

  for (int i = 0 ; i < req_count ; ++i) {
    hio_internal_request_t *req = reqs[i];
    hio_request_t new_request;

    if (HIO_REQUEST_TYPE_READ != req->ir_type) {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_write_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                             req->ir_data.w, req->ir_count, req->ir_size,
                                                                                             req->ir_stride),
                       "element_write", req->ir_offset, req->ir_count * req->ir_size);
    } else {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_read_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                            req->ir_data.r, req->ir_count, req->ir_size,
                                                                                            req->ir_stride),
                       "element_read", req->ir_offset, req->ir_count * req->ir_size);
    }

    /* a negative status is an error code: report it and stop the batch */
    if (req->ir_status < 0) {
      rc = (int) req->ir_status;
      break;
    }

    /* nothing to hand back when there is no user request slot or no data moved */
    if (!req->ir_urequest || 0 == req->ir_status) {
      continue;
    }

    new_request = hioi_request_alloc (context);
    if (NULL == new_request) {
      rc = HIO_ERR_OUT_OF_RESOURCE;
      break;
    }

    /* the transfer already happened so the user request is born complete */
    req->ir_urequest[0] = new_request;
    new_request->req_transferred = req->ir_status;
    new_request->req_complete = true;
    new_request->req_status = HIO_SUCCESS;
  }

  hioi_object_unlock (&dataset->ds_object);

  stop = hioi_gettime ();

  builtin_posix_trace (posix_dataset, "process_requests", req_count, 0, start, stop);

  return rc;
}
/**
 * Read count blocks of size bytes from an element into a (possibly strided)
 * buffer using stdio.
 *
 * Element offsets are consumed contiguously starting at offset. After each
 * complete block the destination pointer is advanced by an additional stride
 * bytes. Each block may be split into several pieces, one per region returned
 * by builtin_posix_element_translate(). Reading stops at the first translate
 * failure or short read.
 *
 * @param posix_module  posix module instance
 * @param element       element to read from
 * @param offset        starting offset within the element
 * @param ptr           destination buffer
 * @param count         number of blocks
 * @param size          size of each block in bytes
 * @param stride        extra bytes to skip in the destination after each block
 *
 * @returns the number of bytes read on success, 0 if count or size is 0, the
 *          translate error code if translation failed, or
 *          hioi_err_errno (errno) if nothing could be read.
 */
static ssize_t builtin_posix_module_element_read_strided_internal (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                                   void *ptr, size_t count, size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  size_t bytes_read = 0, ret;
  hio_file_t *file;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;

  /* test the operands individually: count * size could wrap around to zero */
  if (0 == count || 0 == size) {
    return 0;
  }

  errno = 0;

  start = hioi_gettime ();

  for (size_t i = 0 ; i < count ; ++i) {
    size_t req = size, actual;

    do {
      actual = req;

      /* translate may shrink actual to what fits in the backing file region */
      rc = builtin_posix_element_translate (posix_module, element, offset, &actual, &file, true);
      if (HIO_SUCCESS != rc) {
        break;
      }

      ret = fread (ptr, 1, actual, file->f_hndl);
      if (ret > 0) {
        bytes_read += ret;
        file->f_offset += ret;
      }

      if (ret < actual) {
        /* short read */
        break;
      }

      req -= actual;
      offset += actual;
      ptr = (void *) ((char *) ptr + actual);
    } while (req);

    /* req is non-zero only if the block was cut short */
    if (req || HIO_SUCCESS != rc) {
      break;
    }

    ptr = (void *) ((char *) ptr + stride);
  }

  if (0 == bytes_read || HIO_SUCCESS != rc) {
    /* only fall back to errno when translate did not already report an
     * error; otherwise the real failure code would be lost */
    if (0 == bytes_read && HIO_SUCCESS == rc) {
      rc = hioi_err_errno (errno);
    }

    return rc;
  }

  stop = hioi_gettime ();

  posix_dataset->base.ds_stat.s_rtime += stop - start;
  posix_dataset->base.ds_stat.s_bread += bytes_read;

  return bytes_read;
}
/**
 * Write count blocks of size bytes from a (possibly strided) buffer into an
 * element using stdio.
 *
 * Element offsets are consumed contiguously starting at offset. After each
 * complete block the source pointer is advanced by an additional stride
 * bytes. A stride of 0 means the source is contiguous, so the request is
 * collapsed into a single block. Each block may be split into several pieces,
 * one per region returned by builtin_posix_element_translate(). Writing stops
 * at the first translate failure or short write.
 *
 * On failure the error is also stored in the dataset's ds_status. On success
 * the element size is grown to cover the written range and the dataset write
 * statistics are updated.
 *
 * @param posix_module  posix module instance
 * @param element       element to write to
 * @param offset        starting offset within the element
 * @param ptr           source buffer
 * @param count         number of blocks
 * @param size          size of each block in bytes
 * @param stride        extra bytes to skip in the source after each block
 *
 * @returns the number of bytes written on success, 0 if count or size is 0,
 *          the translate error code if translation failed, or
 *          hioi_err_errno (errno) if nothing could be written.
 */
static ssize_t builtin_posix_module_element_write_strided_internal (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                                    const void *ptr, size_t count, size_t size, size_t stride) {
  hio_dataset_t dataset = hioi_element_dataset (element);
  /* offset is advanced while writing; remember where the write began */
  const off_t first_offset = offset;
  size_t bytes_written = 0, ret;
  hio_file_t *file = NULL;
  uint64_t stop, start;
  int rc = HIO_SUCCESS;

  assert (dataset->ds_flags & HIO_FLAG_WRITE);

  /* test the operands individually: count * size could wrap around to zero */
  if (0 == count || 0 == size) {
    return 0;
  }

  if (0 == stride) {
    /* contiguous source: treat the whole request as one block */
    size *= count;
    count = 1;
  }

  start = hioi_gettime ();

  errno = 0;

  for (size_t i = 0 ; i < count ; ++i) {
    size_t req = size, actual;

    do {
      actual = req;

      rc = builtin_posix_element_translate (posix_module, element, offset, &actual, &file, false);
      if (HIO_SUCCESS != rc) {
        break;
      }

      /* file is only meaningful once translate has succeeded */
      assert (file);

      ret = fwrite (ptr, 1, actual, file->f_hndl);
      if (ret > 0) {
        bytes_written += ret;
        file->f_offset += ret;
      }

      if (ret < actual) {
        /* short write */
        break;
      }

      req -= actual;
      offset += actual;
      ptr = (const void *) ((const char *) ptr + actual);
    } while (req);

    /* req is non-zero only if the block was cut short */
    if (HIO_SUCCESS != rc || req) {
      break;
    }

    ptr = (const void *) ((const char *) ptr + stride);
  }

  if (0 == bytes_written || HIO_SUCCESS != rc) {
    /* only fall back to errno when translate did not already report an
     * error; otherwise the real failure code would be lost */
    if (0 == bytes_written && HIO_SUCCESS == rc) {
      rc = hioi_err_errno (errno);
    }

    dataset->ds_status = rc;
    return rc;
  }

  /* the written range ends at the starting offset plus the bytes written.
   * using the advanced offset here would count the written bytes twice */
  if (first_offset + bytes_written > element->e_size) {
    element->e_size = first_offset + bytes_written;
  }

  stop = hioi_gettime ();

  dataset->ds_stat.s_wtime += stop - start;
  dataset->ds_stat.s_bwritten += bytes_written;

  /* cast so the arguments match the %lu / %llu conversions exactly */
  hioi_log (hioi_object_context (&element->e_object), HIO_VERBOSE_DEBUG_LOW, "posix: finished write. bytes written: "
            "%lu, time: %llu usec", (unsigned long) bytes_written, (unsigned long long) (stop - start));

  return bytes_written;
}
/**
 * Close a posix dataset.
 *
 * Closes every open data file handle. If the dataset was opened for writing
 * the manifest is gathered and rank 0 writes it to "manifest.json" (with a
 * ".bz2" suffix when ds_use_bzip is set) under the dataset's base path. When
 * MPI is in use the result code is reduced (MPI_MIN) across the context's
 * communicator so all ranks return the same value. Finally the base path is
 * freed and the dataset lock is destroyed.
 *
 * @param dataset  dataset to close
 *
 * @returns HIO_SUCCESS on success or an hio error code. A manifest gather
 *          failure is additionally recorded in the dataset's ds_status.
 */
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  hio_module_t *module = dataset->ds_module;
  /* initialized so a failed gather can never leave garbage to write or free */
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;

  start = hioi_gettime ();

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    if (posix_dataset->files[i].f_file.f_hndl != NULL) {
      fclose (posix_dataset->files[i].f_file.f_hndl);
      posix_dataset->files[i].f_file.f_hndl = NULL;
    }
  }

  if (dataset->ds_flags & HIO_FLAG_WRITE) {
    rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, dataset->ds_use_bzip);
    if (HIO_SUCCESS != rc) {
      dataset->ds_status = rc;
    }

    if (0 == context->c_rank) {
      char *path = NULL;

      rc = asprintf (&path, "%s/manifest.json%s", posix_dataset->base_path,
                     dataset->ds_use_bzip ? ".bz2" : "");
      if (0 < rc) {
        /* errno is captured at the point of failure; later calls (close,
         * free) are allowed to modify it */
        int saved_errno = 0;
        int fd;

        /* truncate so a shorter manifest never keeps the tail of an older one */
        fd = open (path, O_CREAT | O_WRONLY | O_TRUNC, posix_module->access_mode);
        if (0 <= fd) {
          const unsigned char *cursor = manifest;
          size_t remaining = manifest_size;

          /* write() may transfer fewer bytes than requested; loop until done */
          while (remaining > 0) {
            ssize_t written = write (fd, cursor, remaining);

            if (written < 0) {
              if (EINTR == errno) {
                continue;
              }

              saved_errno = errno;
              break;
            }

            if (0 == written) {
              /* no progress and no error code: report an I/O error */
              saved_errno = EIO;
              break;
            }

            cursor += written;
            remaining -= (size_t) written;
          }

          if (0 != close (fd) && 0 == saved_errno) {
            saved_errno = errno;
          }
        } else {
          saved_errno = errno;
        }

        rc = hioi_err_errno (saved_errno);

        free (path);

        if (HIO_SUCCESS != rc) {
          hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest");
        }
      } else {
        rc = HIO_ERR_OUT_OF_RESOURCE;
      }

      /* released on every rank-0 path, including asprintf failure */
      free (manifest);
      manifest = NULL;
    }
  }

#if HIO_USE_MPI
  /* ensure all ranks have closed the dataset before continuing */
  if (hioi_context_using_mpi (context)) {
    MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm);
  }
#endif

  free (posix_dataset->base_path);

  pthread_mutex_destroy (&posix_dataset->lock);

  stop = hioi_gettime ();

  /* cast so the arguments match the %llu / %lu conversions exactly */
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully closed posix dataset %s:%llu on data root %s. "
            "close time %lu usec", hioi_object_identifier(dataset), (unsigned long long) dataset->ds_id, module->data_root,
            (unsigned long) (stop - start));

  return rc;
}
/**
 * Open (or create) a posix dataset.
 *
 * Initializes the posix dataset, queries the filesystem under the module's
 * data root and, when the filesystem supports striping, registers the
 * stripe_count, stripe_size and raid_level configuration variables and clamps
 * the stripe settings to the limits reported by the query.
 *
 * Rank 0 then either loads the existing top-level manifest (trying
 * "manifest.json.bz2" first, then "manifest.json") or, when creating,
 * builds the dataset directories and serializes a fresh manifest. With
 * HIO_FLAG_TRUNC rank 0 first unlinks the existing dataset and forces the
 * create path. The manifest and rank 0's status are shared with the other
 * processes through hioi_dataset_scatter().
 *
 * @param module   posix module the dataset is opened on
 * @param dataset  dataset to open
 *
 * @returns HIO_SUCCESS on success or an hio error code.
 */
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  hio_fs_attr_t *fs_attr;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  char *path = NULL;

  start = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x",
            hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context),
            dataset->ds_flags, dataset->ds_mode);

  rc = builtin_posix_module_dataset_init (module, posix_dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  fs_attr = &posix_dataset->base.ds_fsattr;

  rc = hioi_fs_query (context, module->data_root, fs_attr);
  if (HIO_SUCCESS != rc) {
    hioi_err_push (rc, &context->c_object, "posix: error querying the filesystem");
    return rc;
  }

  if (fs_attr->fs_flags & HIO_FS_SUPPORTS_STRIPING) {
    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_scount,
                     "stripe_count", HIO_CONFIG_TYPE_UINT32, NULL, "Stripe count for all dataset "
                     "data files", 0);

    if (fs_attr->fs_scount > fs_attr->fs_smax_count) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe count %u exceeds the available resources. "
                "adjusting to maximum %u", fs_attr->fs_scount, fs_attr->fs_smax_count);
      fs_attr->fs_scount = fs_attr->fs_smax_count;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_ssize,
                     "stripe_size", HIO_CONFIG_TYPE_UINT64, NULL, "Stripe size for all dataset "
                     "data files", 0);

    /* ensure the stripe size is a multiple of the stripe unit */
    /* NOTE(review): divides by fs_sunit; assumes hioi_fs_query never reports 0 — confirm */
    fs_attr->fs_ssize = fs_attr->fs_sunit * ((fs_attr->fs_ssize + fs_attr->fs_sunit - 1) / fs_attr->fs_sunit);

    if (fs_attr->fs_ssize > fs_attr->fs_smax_size) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe size %" PRIu64 " exceeds the maximum %"
                PRIu64 ". ", fs_attr->fs_ssize, fs_attr->fs_smax_size);
      fs_attr->fs_ssize = fs_attr->fs_smax_size;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_raid_level,
                     "raid_level", HIO_CONFIG_TYPE_UINT64, NULL, "RAID level for dataset "
                     "data files. Keep in mind that some filesystems only support 1/2 RAID "
                     "levels", 0);

    if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
      /* optimized mode overrides the striping settings */
      fs_attr->fs_scount = 1;
      fs_attr->fs_ssize = dataset->ds_bs;
      fs_attr->fs_use_group_locking = true;
    }
  }

  /* only rank 0 touches the manifest; every exit from this block falls
   * through to the scatter below so all processes see rank 0's status */
  do {
    if (0 != context->c_rank) {
      break;
    }

    if (dataset->ds_flags & HIO_FLAG_TRUNC) {
      /* blow away the existing dataset */
      (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset), dataset->ds_id);

      /* ensure we take the create path later */
      dataset->ds_flags |= HIO_FLAG_CREAT;
    }

    if (!(dataset->ds_flags & HIO_FLAG_CREAT)) {
      /* load manifest. the manifest data will be shared with other processes in hioi_dataset_scatter */
      rc = asprintf (&path, "%s/manifest.json.bz2", posix_dataset->base_path);
      if (0 > rc) {
        /* the contents of path are undefined after a failed asprintf */
        path = NULL;
        rc = HIO_ERR_OUT_OF_RESOURCE;
        break;
      }

      if (access (path, F_OK)) {
        /* no compressed manifest. try the uncompressed name */
        free (path);
        path = NULL;

        rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
        if (0 > rc) {
          path = NULL;
          rc = HIO_ERR_OUT_OF_RESOURCE;
          break;
        }

        if (access (path, F_OK)) {
          hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest");
          free (path);
          path = NULL;
          rc = HIO_ERR_NOT_FOUND;
          break;
        }
      }

      rc = hioi_manifest_read (path, &manifest, &manifest_size);
      free (path);
      path = NULL;
    } else {
      rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset);
      if (HIO_SUCCESS != rc) {
        break;
      }

      rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, true);
    }
  } while (0);

  /* share dataset information will all processes in the communication domain */
  rc = hioi_dataset_scatter (dataset, manifest, manifest_size, rc);

  /* the serialized manifest is no longer needed whether or not the scatter worked */
  free (manifest);
  manifest = NULL;

  if (HIO_SUCCESS != rc) {
    free (posix_dataset->base_path);
    return rc;
  }

  if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
    if (HIO_SET_ELEMENT_UNIQUE == dataset->ds_mode || 2 > context->c_size || NULL == dataset->ds_shared_control) {
      posix_dataset->base.ds_fmode = HIO_FILE_MODE_BASIC;
      /* NTH: no optimized mode for N->N yet */
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this "
                "dataset mode. falling back to basic file mode");
    }
  }

  dataset->ds_module = module;
  dataset->ds_close = builtin_posix_module_dataset_close;
  dataset->ds_element_open = builtin_posix_module_element_open;
  dataset->ds_process_reqs = builtin_posix_module_process_reqs;

  pthread_mutex_init (&posix_dataset->lock, NULL);

  /* record the open time */
  gettimeofday (&dataset->ds_otime, NULL);

  stop = hioi_gettime ();

  /* cast so the arguments match the %llu / %lu conversions exactly */
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset %s:%llu on data root %s. "
            "open time %lu usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened", hioi_object_identifier(dataset),
            (unsigned long long) dataset->ds_id, module->data_root, (unsigned long) (stop - start));

  return HIO_SUCCESS;
}
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset; hio_context_t context = hioi_object_context ((hio_object_t) dataset); hio_module_t *module = dataset->ds_module; unsigned char *manifest = NULL; uint64_t start, stop; int rc = HIO_SUCCESS; size_t manifest_size; start = hioi_gettime (); for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) { if (posix_dataset->files[i].f_bid >= 0) { POSIX_TRACE_CALL(posix_dataset, hioi_file_close (posix_dataset->files + i), "file_close", posix_dataset->files[i].f_bid, 0); } } #if HIO_MPI_HAVE(3) /* release the shared state if it was allocated */ (void) hioi_dataset_shared_fini (dataset); /* release the dataset map if one was allocated */ (void) hioi_dataset_map_release (dataset); #endif if (dataset->ds_flags & HIO_FLAG_WRITE) { char *path; /* write manifest header */ POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, false, true), "gather_manifest", 0, 0); if (HIO_SUCCESS != rc) { dataset->ds_status = rc; } if (0 == context->c_rank) { rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path); if (0 > rc) { /* out of memory. 
not much we can do now */ return hioi_err_errno (errno); } rc = hioi_manifest_save (dataset, manifest, manifest_size, path); free (manifest); free (path); if (HIO_SUCCESS != rc) { hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest"); } } #if HIO_MPI_HAVE(3) if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { /* optimized mode requires a data manifest to describe how the data landed on the filesystem */ POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest_comm (dataset, context->c_shared_comm, &manifest, &manifest_size, posix_dataset->ds_use_bzip, false), "gather_manifest", 0, 0); if (HIO_SUCCESS != rc) { dataset->ds_status = rc; } if (NULL != manifest) { rc = asprintf (&path, "%s/manifest.%x.json%s", posix_dataset->base_path, context->c_rank, posix_dataset->ds_use_bzip ? ".bz2" : ""); if (0 > rc) { return hioi_err_errno (errno); } rc = hioi_manifest_save (dataset, manifest, manifest_size, path); free (manifest); free (path); if (HIO_SUCCESS != rc) { hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest"); } } } #endif } #if HIO_MPI_HAVE(1) /* ensure all ranks have closed the dataset before continuing */ if (hioi_context_using_mpi (context)) { MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm); } #endif free (posix_dataset->base_path); stop = hioi_gettime (); builtin_posix_trace (posix_dataset, "close", 0, 0, start, stop); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully closed posix dataset " "%s:%" PRIu64 " on data root %s. close time %" PRIu64 " usec", hioi_object_identifier(dataset), dataset->ds_id, module->data_root, stop - start); builtin_posix_trace (posix_dataset, "trace_end", 0, 0, 0, 0); if (posix_dataset->ds_trace_fh) { fclose (posix_dataset->ds_trace_fh); } return rc; }
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset; builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module; hio_context_t context = hioi_object_context ((hio_object_t) dataset); unsigned char *manifest = NULL; size_t manifest_size = 0; uint64_t start, stop; int rc = HIO_SUCCESS; char *path = NULL; start = hioi_gettime (); hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x", hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context), dataset->ds_flags, dataset->ds_mode); rc = builtin_posix_module_dataset_init (module, posix_dataset); if (HIO_SUCCESS != rc) { return rc; } rc = builtin_posix_module_setup_striping (context, module, dataset); if (HIO_SUCCESS != rc) { return rc; } if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode) { hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_fcount, "dataset_file_count", HIO_CONFIG_TYPE_UINT64, NULL, "Number of files to use " "in strided file mode", 0); } else if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { posix_dataset->ds_use_bzip = true; hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_use_bzip, "dataset_use_bzip", HIO_CONFIG_TYPE_BOOL, NULL, "Use bzip2 compression for dataset manifests", 0); } if (dataset->ds_flags & HIO_FLAG_TRUNC) { /* blow away the existing dataset */ if (0 == context->c_rank) { (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset), dataset->ds_id); } } if (!(dataset->ds_flags & HIO_FLAG_CREAT)) { if (0 == context->c_rank) { /* load manifest. 
the manifest data will be shared with other processes in hioi_dataset_scatter */ rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path); assert (0 < rc); if (access (path, F_OK)) { /* this should never happen on a valid dataset */ free (path); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest %s", path); rc = HIO_ERR_NOT_FOUND; } else { rc = HIO_SUCCESS; } } /* read the manifest if it exists */ if (HIO_SUCCESS == rc) { hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: loading manifest header from %s...", path); rc = hioi_manifest_read (path, &manifest, &manifest_size); free (path); path = NULL; } } else if (0 == context->c_rank) { rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset); if (HIO_SUCCESS == rc) { /* serialize the manifest to send to remote ranks */ rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, false, false); } } #if HIO_MPI_HAVE(1) /* share dataset header will all processes in the communication domain */ rc = hioi_dataset_scatter_comm (dataset, context->c_comm, manifest, manifest_size, rc); #endif free (manifest); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } if (context->c_enable_tracing) { char *path; rc = asprintf (&path, "%s/trace/trace.%d", posix_dataset->base_path, context->c_rank); if (rc > 0) { posix_dataset->ds_trace_fh = fopen (path, "a"); free (path); } builtin_posix_trace (posix_dataset, "trace_begin", 0, 0, 0, 0); } #if HIO_MPI_HAVE(3) if (!(dataset->ds_flags & HIO_FLAG_CREAT) && HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { rc = bultin_posix_scatter_data (posix_dataset); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } } /* if possible set up a shared memory window for this dataset */ POSIX_TRACE_CALL(posix_dataset, hioi_dataset_shared_init (dataset, 1), "shared_init", 0, 0); if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { if (2 > context->c_size || NULL == 
dataset->ds_shared_control) { /* no point in using optimized mode in this case */ posix_dataset->ds_fmode = HIO_FILE_MODE_BASIC; hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this " "dataset mode. falling back to basic file mode, path: %s", posix_dataset->base_path); } else if (HIO_SET_ELEMENT_SHARED == dataset->ds_mode) { POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_generate_map (dataset), "generate_map", 0, 0); if (HIO_SUCCESS != rc) { free (posix_dataset->base_path); return rc; } } } /* NTH: if requested more code is needed to load an optimized dataset with an older MPI */ #endif /* HIO_MPI_HAVE(3) */ dataset->ds_module = module; dataset->ds_close = builtin_posix_module_dataset_close; dataset->ds_element_open = builtin_posix_module_element_open; dataset->ds_process_reqs = builtin_posix_module_process_reqs; /* record the open time */ gettimeofday (&dataset->ds_otime, NULL); stop = hioi_gettime (); hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset " "%s:%" PRIu64 " on data root %s. open time %" PRIu64 " usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened", hioi_object_identifier(dataset), dataset->ds_id, module->data_root, stop - start); builtin_posix_trace (posix_dataset, "open", 0, 0, start, stop); return HIO_SUCCESS; }
/**
 * Strided element read built on the hio file layer.
 *
 * Reads count blocks of size bytes, consuming element offsets contiguously
 * from offset. After every complete block the destination pointer skips an
 * extra stride bytes. A block is read in as many pieces as
 * builtin_posix_element_translate() dictates; both the translate and the
 * read of each piece are traced. The first translate failure or short read
 * ends the transfer.
 *
 * Read time and byte counters of the dataset are only updated when the
 * function returns a byte count.
 *
 * @param posix_module  posix module instance
 * @param element       element to read from
 * @param offset        starting offset within the element
 * @param ptr           destination buffer
 * @param count         number of blocks
 * @param size          size of each block in bytes
 * @param stride        extra bytes to skip in the destination after each block
 *
 * @returns bytes read on success, 0 when count or size is 0, the translate
 *          error code on translate failure, otherwise hioi_err_errno (errno)
 *          when no data was read.
 */
static ssize_t builtin_posix_module_element_read_strided_internal (builtin_posix_module_t *posix_module, hio_element_t element, uint64_t offset,
                                                                   void *ptr, size_t count, size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  uint64_t start, stop;
  hio_file_t *file;
  size_t bytes_read = 0;
  size_t ret;
  int rc;

  if (0 == count || 0 == size) {
    return 0;
  }

  errno = 0;

  start = hioi_gettime ();

  for (size_t i = 0 ; i < count ; ++i) {
    size_t req = size, actual;

    /* size is non-zero here so at least one piece is always attempted */
    while (req) {
      actual = req;

      /* find out where the data lives */
      POSIX_TRACE_CALL(posix_dataset, rc = builtin_posix_element_translate (posix_module, element, offset, &actual, &file, true),
                       "element_translate", offset, req);
      if (HIO_SUCCESS != rc) {
        break;
      }

      POSIX_TRACE_CALL(posix_dataset, ret = hioi_file_read (file, ptr, actual), "file_read", offset, actual);
      if (ret > 0) {
        bytes_read += ret;
      }

      if (ret < actual) {
        /* short read */
        break;
      }

      ptr = (void *) ((char *) ptr + actual);
      offset += actual;
      req -= actual;
    }

    /* leftover bytes or a translate error end the whole transfer */
    if (HIO_SUCCESS != rc || req) {
      break;
    }

    ptr = (void *) ((char *) ptr + stride);
  }

  /* a translate failure takes precedence over whatever was read */
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  /* nothing read and no translate error: report what errno says */
  if (0 == bytes_read) {
    rc = hioi_err_errno (errno);
    return rc;
  }

  stop = hioi_gettime ();

  posix_dataset->base.ds_stat.s_bread += bytes_read;
  posix_dataset->base.ds_stat.s_rtime += stop - start;

  return bytes_read;
}