/**
 * Open (or create) a dataset through the builtin posix module.
 *
 * The function initializes the posix dataset, queries the filesystem backing
 * module->data_root, and (when the filesystem reports striping support)
 * registers the stripe_count, stripe_size and raid_level configuration
 * variables and clamps their values to the limits reported by the query.
 *
 * Rank 0 of the context then either loads the existing top-level manifest
 * (manifest.json.bz2 is tried first, then manifest.json) or, when
 * HIO_FLAG_CREAT is set, creates the dataset directories and serializes a
 * new manifest. HIO_FLAG_TRUNC unlinks any existing dataset and forces the
 * create path. The manifest and rank 0's status code are then handed to
 * hioi_dataset_scatter() to be shared with the other processes.
 *
 * @param[in]     module   module the dataset is being opened on
 * @param[in,out] dataset  dataset object; its ds_module, ds_close,
 *                         ds_element_open, ds_process_reqs and ds_otime
 *                         members are filled in on success
 *
 * @returns HIO_SUCCESS on success, otherwise the error code returned by
 *          dataset initialization, the filesystem query, or
 *          hioi_dataset_scatter().
 */
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  hio_fs_attr_t *fs_attr;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  char *path = NULL;

  start = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x",
            hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context),
            dataset->ds_flags, dataset->ds_mode);

  rc = builtin_posix_module_dataset_init (module, posix_dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  fs_attr = &posix_dataset->base.ds_fsattr;

  rc = hioi_fs_query (context, module->data_root, fs_attr);
  if (HIO_SUCCESS != rc) {
    hioi_err_push (rc, &context->c_object, "posix: error querying the filesystem");
    /* release the base path the same way the scatter failure path below does so
     * every failure after a successful init leaves the dataset in the same state */
    free (posix_dataset->base_path);
    posix_dataset->base_path = NULL;
    return rc;
  }

  if (fs_attr->fs_flags & HIO_FS_SUPPORTS_STRIPING) {
    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_scount,
                     "stripe_count", HIO_CONFIG_TYPE_UINT32, NULL, "Stripe count for all dataset "
                     "data files", 0);

    if (fs_attr->fs_scount > fs_attr->fs_smax_count) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe count %u exceeds the available resources. "
                "adjusting to maximum %u", fs_attr->fs_scount, fs_attr->fs_smax_count);
      fs_attr->fs_scount = fs_attr->fs_smax_count;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_ssize,
                     "stripe_size", HIO_CONFIG_TYPE_UINT64, NULL, "Stripe size for all dataset "
                     "data files", 0);

    /* ensure the stripe size is a multiple of the stripe unit */
    fs_attr->fs_ssize = fs_attr->fs_sunit * ((fs_attr->fs_ssize + fs_attr->fs_sunit - 1) / fs_attr->fs_sunit);

    if (fs_attr->fs_ssize > fs_attr->fs_smax_size) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe size %" PRIu64 " exceeds the maximum %"
                PRIu64 ". ", fs_attr->fs_ssize, fs_attr->fs_smax_size);
      fs_attr->fs_ssize = fs_attr->fs_smax_size;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_raid_level,
                     "raid_level", HIO_CONFIG_TYPE_UINT64, NULL, "RAID level for dataset "
                     "data files. Keep in mind that some filesystems only support 1/2 RAID "
                     "levels", 0);

    /* optimized file mode overrides the configured values: a single stripe
     * whose size matches the dataset block size, with group locking requested */
    if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
      fs_attr->fs_scount = 1;
      fs_attr->fs_ssize = dataset->ds_bs;
      fs_attr->fs_use_group_locking = true;
    }
  }

  /* only rank 0 touches the filesystem here. every other rank falls through
   * with rc == HIO_SUCCESS and no manifest */
  do {
    if (0 != context->c_rank) {
      break;
    }

    if (dataset->ds_flags & HIO_FLAG_TRUNC) {
      /* blow away the existing dataset */
      (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset), dataset->ds_id);

      /* ensure we take the create path later */
      dataset->ds_flags |= HIO_FLAG_CREAT;
    }

    if (!(dataset->ds_flags & HIO_FLAG_CREAT)) {
      /* load manifest. the manifest data will be shared with other processes in hioi_dataset_scatter */
      rc = asprintf (&path, "%s/manifest.json.bz2", posix_dataset->base_path);
      assert (0 < rc);
      if (access (path, F_OK)) {
        /* no compressed manifest. fall back on the uncompressed one */
        free (path);
        path = NULL;
        rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
        assert (0 < rc);
        if (access (path, F_OK)) {
          hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest");
          /* do not leak the path string when bailing out */
          free (path);
          path = NULL;
          rc = HIO_ERR_NOT_FOUND;
          break;
        }
      }

      rc = hioi_manifest_read (path, &manifest, &manifest_size);
      free (path);
      path = NULL;
    } else {
      rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset);
      if (HIO_SUCCESS != rc) {
        break;
      }

      rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, true);
    }
  } while (0);

  /* share dataset information with all processes in the communication domain */
  rc = hioi_dataset_scatter (dataset, manifest, manifest_size, rc);

  /* the local manifest buffer is no longer needed whether or not the scatter
   * succeeded. free it before the error check so it cannot leak on failure */
  free (manifest);
  manifest = NULL;

  if (HIO_SUCCESS != rc) {
    free (posix_dataset->base_path);
    /* do not leave a dangling pointer behind in the dataset object */
    posix_dataset->base_path = NULL;
    return rc;
  }

  if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
    if (HIO_SET_ELEMENT_UNIQUE == dataset->ds_mode || 2 > context->c_size || NULL == dataset->ds_shared_control) {
      posix_dataset->base.ds_fmode = HIO_FILE_MODE_BASIC;
      /* NTH: no optimized mode for N->N yet */
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this "
                "dataset mode. falling back to basic file mode");
    }
  }

  dataset->ds_module = module;
  dataset->ds_close = builtin_posix_module_dataset_close;
  dataset->ds_element_open = builtin_posix_module_element_open;
  dataset->ds_process_reqs = builtin_posix_module_process_reqs;

  pthread_mutex_init (&posix_dataset->lock, NULL);

  /* record the open time */
  gettimeofday (&dataset->ds_otime, NULL);

  stop = hioi_gettime ();

  /* cast the integer arguments so they match the %llu and %lu conversions exactly */
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset %s:%llu on data root %s. "
            "open time %lu usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened",
            hioi_object_identifier(dataset), (unsigned long long) dataset->ds_id, module->data_root,
            (unsigned long) (stop - start));

  return HIO_SUCCESS;
}
static int builtin_posix_module_setup_striping (hio_context_t context, struct hio_module_t *module, hio_dataset_t dataset) { builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset; hio_fs_attr_t *fs_attr = &dataset->ds_fsattr; int rc; /* query the filesystem for current striping parameters */ rc = hioi_fs_query (context, module->data_root, fs_attr); if (HIO_SUCCESS != rc) { hioi_err_push (rc, &context->c_object, "posix: error querying the filesystem"); return rc; } /* for now do not use stripe exclusivity in any path */ posix_dataset->my_stripe = 0; /* set default stripe count */ fs_attr->fs_scount = 1; posix_dataset->ds_fcount = 1; if (fs_attr->fs_flags & HIO_FS_SUPPORTS_STRIPING) { if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) { /* pick a reasonable default stripe size */ fs_attr->fs_ssize = 1 << 24; /* use group locking if available as we guarantee stripe exclusivity in optimized mode */ fs_attr->fs_use_group_locking = true; #if HIO_MPI_HAVE(3) /* if group locking is not available then each rank should attempt to write to * a different stripe to maximize the available IO bandwidth */ fs_attr->fs_scount = min(context->c_shared_size, fs_attr->fs_smax_count); #endif } else if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode) { /* pick a reasonable default stripe size */ fs_attr->fs_ssize = posix_dataset->ds_bs; posix_dataset->ds_fcount = fs_attr->fs_smax_count * 32; fs_attr->fs_scount = 16; if (context->c_size < posix_dataset->ds_fcount) { posix_dataset->ds_fcount = context->c_size; } } else if (HIO_SET_ELEMENT_UNIQUE != dataset->ds_mode) { /* set defaults striping count */ fs_attr->fs_ssize = 1 << 20; fs_attr->fs_scount = max (1, (unsigned) ((float) fs_attr->fs_smax_count * 0.9)); } else { fs_attr->fs_ssize = 1 << 20; } hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_scount, "stripe_count", HIO_CONFIG_TYPE_UINT32, NULL, "Stripe count for all dataset " "data files", 0); hioi_config_add (context, 
&dataset->ds_object, &fs_attr->fs_ssize, "stripe_size", HIO_CONFIG_TYPE_UINT64, NULL, "Stripe size for all dataset " "data files", 0); if (fs_attr->fs_flags & HIO_FS_SUPPORTS_RAID) { hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_raid_level, "raid_level", HIO_CONFIG_TYPE_UINT64, NULL, "RAID level for dataset " "data files. Keep in mind that some filesystems only support 1/2 RAID " "levels", 0); } /* ensure stripe count is sane */ if (fs_attr->fs_scount > fs_attr->fs_smax_count) { hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe count %u exceeds the available resources. " "adjusting to maximum %u", fs_attr->fs_scount, fs_attr->fs_smax_count); fs_attr->fs_scount = fs_attr->fs_smax_count; } /* ensure the stripe size is a multiple of the stripe unit */ fs_attr->fs_ssize = fs_attr->fs_sunit * ((fs_attr->fs_ssize + fs_attr->fs_sunit - 1) / fs_attr->fs_sunit); if (fs_attr->fs_ssize > fs_attr->fs_smax_size) { hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe size %" PRIu64 " exceeds the maximum %" PRIu64 ". ", fs_attr->fs_ssize, fs_attr->fs_smax_size); fs_attr->fs_ssize = fs_attr->fs_smax_size; } if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode && posix_dataset->ds_bs < fs_attr->fs_ssize) { posix_dataset->ds_bs = fs_attr->fs_ssize; } } return HIO_SUCCESS; }