Esempio n. 1
0
/**
 * Open an element of a posix dataset and install the posix I/O callbacks on it.
 *
 * @param[in] dataset  dataset that owns the element
 * @param[in] element  element being opened
 *
 * @returns HIO_SUCCESS on success. In basic file mode the error code from
 *          builtin_posix_module_element_open_basic () is returned on failure
 *          and the element object is released before returning.
 */
static int builtin_posix_module_element_open (hio_dataset_t dataset, hio_element_t element) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context (&dataset->ds_object);
  const char *action;

  /* only the basic file mode opens a backing file at element open time */
  if (HIO_FILE_MODE_BASIC == dataset->ds_fmode) {
    int open_rc = builtin_posix_module_element_open_basic (posix_module, posix_dataset, element);

    if (HIO_SUCCESS != open_rc) {
      hioi_object_release (&element->e_object);
      return open_rc;
    }
  }

  action = (HIO_FLAG_WRITE & dataset->ds_flags) ? "created" : "opened";
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix: %s element %p (identifier %s) for dataset %s",
            action, element, hioi_object_identifier(element), hioi_object_identifier(dataset));

  /* install the posix implementations of the element operations */
  element->e_close = builtin_posix_module_element_close;
  element->e_complete = builtin_posix_module_element_complete;
  element->e_flush = builtin_posix_module_element_flush;
  element->e_read_strided_nb = builtin_posix_module_element_read_strided_nb;
  element->e_write_strided_nb = builtin_posix_module_element_write_strided_nb;

  return HIO_SUCCESS;
}
/**
 * Translate an element offset into an open file positioned at the matching
 * file offset (strided file mode).
 *
 * The element is split into blocks of ds_bs bytes. Block N lives in file
 * (N % ds_fcount) at block slot (N / ds_fcount) within that file.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; reduced on return so that the
 *                              access does not cross the end of the block
 * @param[out]    file_out      open file, already seeked to the block offset
 *
 * @returns HIO_SUCCESS on success, HIO_ERR_OUT_OF_RESOURCE if the file path
 *          could not be allocated, or the error from builtin_posix_open_file ().
 */
static int builtin_posix_element_translate_strided (builtin_posix_module_t *posix_module, hio_element_t element,
                                                    uint64_t offset, size_t *size, hio_file_t **file_out) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  size_t block_id, block_base, block_bound, block_offset, file_id, file_block;
  hio_context_t context = hioi_object_context (&element->e_object);
  hio_file_t *file;
  int32_t file_index;
  char *path;
  int rc;

  block_id = offset / posix_dataset->ds_bs;

  file_id = block_id % posix_dataset->ds_fcount;
  file_block = block_id / posix_dataset->ds_fcount;

  block_base = block_id * posix_dataset->ds_bs;
  block_bound = block_base + posix_dataset->ds_bs;
  block_offset = file_block * posix_dataset->ds_bs + offset - block_base;

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "builtin_posix_element_translate_strided: element: %s, offset: %"
            PRIu64 ", file_id: %lu, file_block: %lu, block_offset: %lu, block_size: %" PRIu64,
            hioi_object_identifier(element), offset, (unsigned long) file_id, (unsigned long) file_block,
            (unsigned long) block_offset, posix_dataset->ds_bs);

  /* never let a single access cross a block boundary */
  if (offset + *size > block_bound) {
    *size = block_bound - offset;
  }

  /* pick the open-file slot for this file id (simple modulo mapping) */
  file_index = file_id % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + file_index;

  if (file_id != file->f_bid || file->f_element != element) {
    /* the path is only needed when the slot has to be (re)opened */
    rc = asprintf (&path, "%s/data/%s_block.%08lu", posix_dataset->base_path, hioi_object_identifier(element),
                   (unsigned long) file_id);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    if (file->f_bid >= 0) {
      POSIX_TRACE_CALL(posix_dataset, hioi_file_close (file), "file_close", file->f_bid, 0);
    }
    file->f_bid = -1;

    file->f_element = element;

    POSIX_TRACE_CALL(posix_dataset, rc = builtin_posix_open_file (posix_module, posix_dataset, path, file),
                     "file_open", file_id, 0);
    /* the path is not needed once the open attempt has completed */
    free (path);
    if (HIO_SUCCESS != rc) {
      return rc;
    }

    file->f_bid = file_id;
  }

  POSIX_TRACE_CALL(posix_dataset, hioi_file_seek (file, block_offset, SEEK_SET), "file_seek", file->f_bid, block_offset);

  *file_out = file;

  return HIO_SUCCESS;
}
Esempio n. 3
0
/**
 * Write strided data to an element. The write itself is performed
 * synchronously under the dataset lock; if the caller asked for a request
 * object one is returned already marked complete.
 *
 * @param[in]  element  element to write to
 * @param[out] request  optional request handle (may be NULL)
 * @param[in]  offset   offset within the element
 * @param[in]  ptr      source buffer
 * @param[in]  count    number of blocks
 * @param[in]  size     size of each block
 * @param[in]  stride   stride between blocks in the source buffer
 *
 * @returns the dataset status after the write, or HIO_ERR_OUT_OF_RESOURCE if
 *          the request object could not be allocated.
 */
static int builtin_posix_module_element_write_strided_nb (hio_element_t element, hio_request_t *request,
                                                          off_t offset, const void *ptr, size_t count,
                                                          size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) posix_dataset->base.ds_module;
  hio_context_t context = hioi_object_context (&element->e_object);
  hio_request_t completed;
  ssize_t transferred;

  pthread_mutex_lock (&posix_dataset->lock);
  transferred = builtin_posix_module_element_write_strided_internal (posix_module, element, offset, ptr,
                                                                     count, size, stride);
  pthread_mutex_unlock (&posix_dataset->lock);

  /* no request wanted: just report the dataset status */
  if (!request) {
    return posix_dataset->base.ds_status;
  }

  completed = hioi_request_alloc (context);
  if (NULL == completed) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  /* the write has already finished so hand back a completed request */
  *request = completed;
  completed->req_transferred = transferred;
  completed->req_complete = true;
  completed->req_status = posix_dataset->base.ds_status;

  return posix_dataset->base.ds_status;
}
Esempio n. 4
0
/**
 * Push an error caused by a failed MPI call onto an error stack.
 *
 * The formatted message is suffixed with ": " and the MPI error string for
 * mpirc. The item is pushed onto the stack of the object's context, or onto
 * the global error stack when object is NULL.
 *
 * @param[in] mpirc   MPI error code
 * @param[in] object  object associated with the error (may be NULL)
 * @param[in] format  printf-style format for the error message
 *
 * If the message is empty or memory cannot be allocated nothing is pushed.
 */
void hioi_err_push_mpi (int mpirc, hio_object_t object, char *format, ...) {
  hio_context_t context = object ? hioi_object_context (object) : NULL;
  hio_error_stack_item_t *new_item;
  char mpi_error[MPI_MAX_ERROR_STRING] = "Unknown error";
  int resultlen = MPI_MAX_ERROR_STRING;
  size_t error_len;
  va_list vargs;
  char *temp;
  int rc;

  va_start (vargs, format);

  rc = vasprintf (&temp, format, vargs);

  va_end (vargs);

  if (0 > rc) {
    /* couldn't allocate error string. temp is undefined here so do not free it */
    return;
  }

  if (0 == rc) {
    /* empty message: nothing to push but the (empty) string was allocated */
    free (temp);
    return;
  }

  /* ignore the error code for this. mpi_error keeps its default text on failure */
  (void) MPI_Error_string (mpirc, mpi_error, &resultlen);

  new_item = calloc (1, sizeof (*new_item));
  if (NULL == new_item) {
    /* not much can be done here. we are just plain OOM. */
    free (temp);
    return;
  }

  new_item->hrc = hioi_err_mpi(mpirc);

  /* message + ": " + mpi error + terminating NUL */
  error_len = strlen (temp) + strlen (": ") + strlen (mpi_error) + 1;
  new_item->error_string = malloc (error_len);
  if (NULL == new_item->error_string) {
    free (new_item);
    free (temp);
    return;
  }

  /* append the mpi error to the hio error string */
  snprintf (new_item->error_string, error_len, "%s: %s", temp, mpi_error);

  /* done with this now */
  free (temp);

  /* push the error message onto the stack */
  if (NULL == context) {
    pthread_mutex_lock (&hio_error_stack_mutex);
    new_item->next = hio_error_stack_head;
    hio_error_stack_head = new_item;
    pthread_mutex_unlock (&hio_error_stack_mutex);
  } else {
    hioi_object_lock (&context->c_object);
    new_item->next = (hio_error_stack_item_t *) context->c_estack;
    context->c_estack = (void *) new_item;
    hioi_object_unlock (&context->c_object);
  }
}
Esempio n. 5
0
/**
 * Find the context an object belongs to by walking the parent chain up to
 * the root object.
 *
 * @param[in] object  object to look up (must not be NULL)
 *
 * @returns the root of the object's parent chain, cast to a context
 */
hio_context_t hioi_object_context (hio_object_t object) {
  hio_object_t root = object;

  while (NULL != root->parent) {
    root = root->parent;
  }

  /* all objects have a context at the root */
  assert (HIO_OBJECT_TYPE_CONTEXT == root->type);

  return (hio_context_t) root;
}
/**
 * Process a batch of internal read/write requests against a posix dataset.
 *
 * The requests are executed in order while holding the dataset object lock.
 * Processing stops at the first request whose status is negative or when a
 * user request object cannot be allocated.
 *
 * @param[in] dataset    dataset the requests belong to
 * @param[in] reqs       array of internal requests
 * @param[in] req_count  number of entries in reqs
 *
 * @returns HIO_SUCCESS if every request was processed, the negative status of
 *          the first failing request, or HIO_ERR_OUT_OF_RESOURCE.
 */
static int builtin_posix_module_process_reqs (hio_dataset_t dataset, hio_internal_request_t **reqs, int req_count) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context (&dataset->ds_object);
  uint64_t start, stop;
  int rc = HIO_SUCCESS;

  start = hioi_gettime ();

  hioi_object_lock (&dataset->ds_object);
  for (int i = 0 ; i < req_count ; ++i) {
    hio_internal_request_t *req = reqs[i];

    /* any request that is not a read is treated as a write */
    if (HIO_REQUEST_TYPE_READ == req->ir_type) {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_read_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                            req->ir_data.r, req->ir_count, req->ir_size,
                                                                                            req->ir_stride),
                       "element_read", req->ir_offset, req->ir_count * req->ir_size);

    } else {
      POSIX_TRACE_CALL(posix_dataset,
                       req->ir_status = builtin_posix_module_element_write_strided_internal (posix_module, req->ir_element, req->ir_offset,
                                                                                             req->ir_data.w, req->ir_count, req->ir_size,
                                                                                             req->ir_stride),
                       "element_write", req->ir_offset, req->ir_count * req->ir_size);
    }

    /* a user request is only created when the caller asked for one and some
     * data was transferred. it is returned already complete. */
    if (req->ir_urequest && req->ir_status > 0) {
      hio_request_t new_request = hioi_request_alloc (context);
      if (NULL == new_request) {
        rc = HIO_ERR_OUT_OF_RESOURCE;
        break;
      }

      req->ir_urequest[0] = new_request;
      new_request->req_transferred = req->ir_status;
      new_request->req_complete = true;
      new_request->req_status = HIO_SUCCESS;
    }

    /* a negative status is an error code. stop processing the batch. */
    if (req->ir_status < 0) {
      rc = (int) req->ir_status;
      break;
    }
  }

  hioi_object_unlock (&dataset->ds_object);

  stop = hioi_gettime ();

  /* record the whole batch as a single trace entry */
  builtin_posix_trace (posix_dataset, "process_requests", req_count, 0, start, stop);

  return rc;
}
Esempio n. 7
0
/**
 * Initialize the posix specific portion of a dataset.
 *
 * Builds the dataset base path (data root/context.hio/dataset name/dataset id)
 * and marks every open-file slot as unused.
 *
 * @param[in]     module         module the dataset is being opened with
 * @param[in,out] posix_dataset  dataset to initialize
 *
 * @returns HIO_SUCCESS on success or HIO_ERR_OUT_OF_RESOURCE if the base path
 *          could not be allocated.
 */
static int builtin_posix_module_dataset_init (struct hio_module_t *module,
                                              builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  int rc;

  rc = asprintf (&posix_dataset->base_path, "%s/%s.hio/%s/%lu", module->data_root,
                 hioi_object_identifier(context), hioi_object_identifier (posix_dataset),
                 (unsigned long) posix_dataset->base.ds_id);
  if (0 > rc) {
    /* report the failure instead of asserting: with NDEBUG an assert would
     * let the open continue with an undefined base path */
    posix_dataset->base_path = NULL;
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    posix_dataset->files[i].f_bid = -1;
    posix_dataset->files[i].f_file.f_hndl = NULL;
  }

  return HIO_SUCCESS;
}
Esempio n. 8
0
/**
 * Search a key-value list for a value matching a variable and apply it.
 *
 * @param[in] list    key-value list to search
 * @param[in] object  object the variable belongs to
 * @param[in] var     variable to set
 *
 * An entry matches when its object type is HIO_OBJECT_TYPE_ANY or equals the
 * object's type, its object identifier is NULL or equals the object's
 * identifier, and its key equals the variable name. Only the first matching
 * entry is used.
 *
 * This function currently does a linear search of the list. In the future
 * this should be replaced with a hash table or similar structure.
 *
 * @returns the result of hioi_config_set_value_internal () for the first
 *          match, or HIO_SUCCESS if no entry matches.
 */
static int hioi_config_set_from_kv_list (hio_config_kv_list_t *list, hio_object_t object,
                                         hio_var_t *var) {
  hio_context_t context = hioi_object_context (object);

  for (int index = 0 ; index < list->kv_list_count ; ++index) {
    hio_config_kv_t *entry = &list->kv_list[index];

    /* entry is restricted to a different object type */
    if (HIO_OBJECT_TYPE_ANY != entry->object_type && object->type != entry->object_type) {
      continue;
    }

    /* entry is restricted to a different object identifier */
    if (NULL != entry->object_identifier && 0 != strcmp (object->identifier, entry->object_identifier)) {
      continue;
    }

    /* entry is for a different variable */
    if (0 != strcmp (var->var_name, entry->key)) {
      continue;
    }

    hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "Setting value for %s to %s from file",
              var->var_name, entry->value);
    return hioi_config_set_value_internal (context, var, entry->value);
  }

  return HIO_SUCCESS;
}
Esempio n. 9
0
/**
 * Push an error message onto an error stack.
 *
 * The item is pushed onto the stack of the object's context, or onto the
 * global error stack when object is NULL. When a context is available the
 * message is also logged at HIO_VERBOSE_ERROR.
 *
 * @param[in] hrc     hio error code to record with the message
 * @param[in] object  object associated with the error (may be NULL)
 * @param[in] format  printf-style format for the error message
 *
 * If the message is empty or memory cannot be allocated nothing is pushed.
 */
void hioi_err_push (int hrc, hio_object_t object, char *format, ...) {
  hio_context_t context = object ? hioi_object_context (object) : NULL;
  hio_error_stack_item_t *new_item;
  va_list vargs;
  int rc;

  new_item = calloc (1, sizeof (*new_item));
  if (NULL == new_item) {
    /* not much can be done here. we are just plain OOM. */
    return;
  }

  va_start (vargs, format);

  rc = vasprintf (&new_item->error_string, format, vargs);

  va_end (vargs);

  if (0 >= rc) {
    /* a return of 0 means an empty string was allocated and must be freed.
     * on failure (negative return) the pointer is undefined and is left alone. */
    if (0 == rc) {
      free (new_item->error_string);
    }

    free (new_item);
    return;
  }

  if (context) {
    hioi_log (context, HIO_VERBOSE_ERROR, "%s", new_item->error_string);
  }

  new_item->hrc = hrc;

  /* push the error message onto the stack */
  if (NULL == context) {
    pthread_mutex_lock (&hio_error_stack_mutex);
    new_item->next = hio_error_stack_head;
    hio_error_stack_head = new_item;
    pthread_mutex_unlock (&hio_error_stack_mutex);
  } else {
    hioi_object_lock (&context->c_object);
    new_item->next = (hio_error_stack_item_t *) context->c_estack;
    context->c_estack = (void *) new_item;
    hioi_object_unlock (&context->c_object);
  }
}
Esempio n. 10
0
/**
 * Read strided data from an element. The read itself is performed
 * synchronously under the dataset object lock; if the caller asked for a
 * request object one is returned already marked complete.
 *
 * @param[in]  element  element to read from
 * @param[out] request  optional request handle (may be NULL)
 * @param[in]  offset   offset within the element
 * @param[out] ptr      destination buffer
 * @param[in]  count    number of blocks
 * @param[in]  size     size of each block
 * @param[in]  stride   stride between blocks in the destination buffer
 *
 * @returns HIO_SUCCESS on success, HIO_ERR_PERM if the dataset was not opened
 *          for reading, HIO_ERR_OUT_OF_RESOURCE if the request object could
 *          not be allocated, or the (negative) error returned by the read.
 */
static int builtin_posix_module_element_read_strided_nb (hio_element_t element, hio_request_t *request, off_t offset,
                                                         void *ptr, size_t count, size_t size, size_t stride) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) posix_dataset->base.ds_module;
  hio_context_t context = hioi_object_context (&element->e_object);
  ssize_t bytes_read;
  hio_request_t new_request;
  int rc = HIO_SUCCESS;

  if (!(posix_dataset->base.ds_flags & HIO_FLAG_READ)) {
    return HIO_ERR_PERM;
  }

  /* no stride means the blocks are contiguous: treat them as one block */
  if (stride == 0) {
    size *= count;
    count = 1;
  }

  hioi_object_lock (&posix_dataset->base.ds_object);
  bytes_read = builtin_posix_module_element_read_strided_internal (posix_module, element, offset, ptr, count, size, stride);
  hioi_object_unlock (&posix_dataset->base.ds_object);

  /* a negative byte count is an error code */
  if (0 > bytes_read) {
    rc = (int) bytes_read;
  }

  /* see if a request was requested */
  if (request) {
    new_request = hioi_request_alloc (context);
    if (NULL == new_request) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    *request = new_request;
    new_request->req_transferred = bytes_read;
    new_request->req_complete = true;
    /* record the outcome on the request, as the write path does */
    new_request->req_status = rc;
  }

  return rc;
}
Esempio n. 11
0
/**
 * Set the value of a configuration variable on an object.
 *
 * @param[in] object    object to configure
 * @param[in] variable  name of the variable
 * @param[in] value     new value, as a string
 *
 * @returns HIO_ERR_BAD_PARAM if any argument is NULL, HIO_ERR_PERM if the
 *          variable is read-only, HIO_SUCCESS if the variable is not (yet)
 *          registered, otherwise the result of
 *          hioi_config_set_value_internal ().
 */
int hio_config_set_value (hio_object_t object, const char *variable, const char *value) {
  int rc = HIO_SUCCESS;
  int config_index;

  if (NULL == object || NULL == variable || NULL == value) {
    return HIO_ERR_BAD_PARAM;
  }

  hioi_object_lock (object);

  /* always record the value in the object's key-value store. if the variable
   * has not been registered yet it will be read from this key-value store
   * after the file store is checked. */
  hioi_config_list_kv_push (&object->config_set, hioi_object_identifier (object),
                            object->type, variable, value);

  config_index = hioi_var_lookup (&object->configuration, variable);
  if (0 <= config_index) {
    /* the variable is already registered: update it now unless it is read-only */
    hio_var_t *var = &object->configuration.vars[config_index];

    if (HIO_VAR_FLAG_READONLY & var->var_flags) {
      hioi_err_push (HIO_ERR_PERM, object, "could not set read-only parameter: %s", variable);
      rc = HIO_ERR_PERM;
    } else {
      rc = hioi_config_set_value_internal (hioi_object_context(object), var, value);
    }
  }

  hioi_object_unlock (object);

  return rc;
}
/**
 * Initialize the posix specific portion of a dataset.
 *
 * Builds the dataset base path (data root/context.hio/dataset name/dataset id),
 * marks every open-file slot as unused and registers the posix specific
 * configuration variables (dataset_file_mode and, outside of basic mode,
 * dataset_block_size).
 *
 * @param[in]     module         module the dataset is being opened with
 * @param[in,out] posix_dataset  dataset to initialize
 *
 * @returns HIO_SUCCESS on success or HIO_ERR_OUT_OF_RESOURCE if the base path
 *          could not be allocated.
 */
static int builtin_posix_module_dataset_init (struct hio_module_t *module,
                                              builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  int rc;

  rc = asprintf (&posix_dataset->base_path, "%s/%s.hio/%s/%lu", module->data_root,
                 hioi_object_identifier(context), hioi_object_identifier (posix_dataset),
                 (unsigned long) posix_dataset->base.ds_id);
  if (0 > rc) {
    /* report the failure instead of asserting: with NDEBUG an assert would
     * let the open continue with an undefined base path */
    posix_dataset->base_path = NULL;
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  /* initialize posix dataset specific data */
  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    posix_dataset->files[i].f_bid = -1;
    posix_dataset->files[i].f_hndl = NULL;
    posix_dataset->files[i].f_fd = -1;
  }

  /* default to strided output mode */
  posix_dataset->ds_fmode = HIO_FILE_MODE_STRIDED;
  hioi_config_add (context, &posix_dataset->base.ds_object, &posix_dataset->ds_fmode,
                   "dataset_file_mode", HIO_CONFIG_TYPE_INT32, &hioi_dataset_file_modes,
                   "Modes for writing dataset files. Valid values: (0: basic, 1: file_per_node, 2: strided)", 0);

  if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode && HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    /* strided mode only applies to shared datasets */
    posix_dataset->ds_fmode = HIO_FILE_MODE_BASIC;
  }

  if (HIO_FILE_MODE_BASIC != posix_dataset->ds_fmode) {
    /* 1 << 23 bytes = 8 MiB default block size */
    posix_dataset->ds_bs = 1ul << 23;
    hioi_config_add (context, &posix_dataset->base.ds_object, &posix_dataset->ds_bs,
                     "dataset_block_size", HIO_CONFIG_TYPE_INT64, NULL,
                     "Block size to use when writing in optimized mode (default: 8M)", 0);
  }

  return HIO_SUCCESS;
}
Esempio n. 13
0
/**
 * Write strided data from a buffer to an element.
 *
 * Writes count blocks of size bytes. The data is written contiguously to the
 * element starting at offset. After each block the source pointer is advanced
 * by an additional stride bytes. A stride of 0 is handled as a single block of
 * count * size bytes.
 *
 * @param[in] posix_module  posix module instance
 * @param[in] element       element to write to
 * @param[in] offset        starting offset within the element
 * @param[in] ptr           source buffer
 * @param[in] count         number of blocks
 * @param[in] size          size of each block in bytes
 * @param[in] stride        extra bytes to skip in the source after each block
 *
 * @returns the number of bytes written (which may be short), 0 if there is
 *          nothing to write, or an hio error code if the offset translation
 *          failed or no data could be written. On error the code is also
 *          stored in the dataset status.
 */
static ssize_t builtin_posix_module_element_write_strided_internal (builtin_posix_module_t *posix_module, hio_element_t element,
                                                                    off_t offset, const void *ptr, size_t count, size_t size,
                                                                    size_t stride) {
  hio_dataset_t dataset = hioi_element_dataset (element);
  /* offset is advanced while writing so remember where the write started */
  const off_t start_offset = offset;
  size_t bytes_written = 0, ret;
  hio_file_t *file = NULL;
  uint64_t stop, start, end_offset;
  int rc = HIO_SUCCESS;

  assert (dataset->ds_flags & HIO_FLAG_WRITE);

  if (!(count * size)) {
    return 0;
  }

  if (0 == stride) {
    size *= count;
    count = 1;
  }

  start = hioi_gettime ();

  errno = 0;

  for (size_t i = 0 ; i < count ; ++i) {
    size_t req = size, actual;

    do {
      /* the translation may shrink actual to what fits in the current file region */
      actual = req;

      rc = builtin_posix_element_translate (posix_module, element, offset, &actual,
                                            &file, false);
      if (HIO_SUCCESS != rc) {
        break;
      }

      /* file is only meaningful once the translation has succeeded */
      assert (file);

      ret = fwrite (ptr, 1, actual, file->f_hndl);
      if (ret > 0) {
        bytes_written += ret;
        file->f_offset += ret;
      }

      if (ret < actual) {
        /* short write */
        break;
      }

      req -= actual;
      offset += actual;
      ptr = (const char *) ptr + actual;
    } while (req);

    if (HIO_SUCCESS != rc || req) {
      break;
    }

    ptr = (const char *) ptr + stride;
  }

  if (0 == bytes_written || HIO_SUCCESS != rc) {
    if (HIO_SUCCESS == rc) {
      /* nothing was written and the translation did not fail: use errno.
       * a translation error is kept as is rather than replaced */
      rc = hioi_err_errno (errno);
    }

    dataset->ds_status = rc;
    return rc;
  }

  /* the data ends bytes_written past the starting offset. offset itself has
   * already been advanced inside the loop so it must not be added again. */
  end_offset = (uint64_t) start_offset + bytes_written;
  if (end_offset > element->e_size) {
    element->e_size = end_offset;
  }

  stop = hioi_gettime ();

  dataset->ds_stat.s_wtime += stop - start;
  dataset->ds_stat.s_bwritten += bytes_written;

  hioi_log (hioi_object_context (&element->e_object), HIO_VERBOSE_DEBUG_LOW, "posix: finished write. bytes written: "
            "%lu, time: %llu usec", (unsigned long) bytes_written, (unsigned long long) (stop - start));

  return bytes_written;
}
Esempio n. 14
0
/**
 * Translate an element offset into an open file positioned at the matching
 * file offset (optimized file mode).
 *
 * If the offset is already mapped by the element the mapped file is used.
 * Otherwise, when writing, space is reserved in this process' data file and a
 * new segment is added to the element.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; may be reduced on return
 * @param[out]    file_out      open file positioned at the translated offset
 * @param[in]     reading       true if the translation is for a read
 *
 * @returns HIO_SUCCESS on success, the translation error if reading and the
 *          offset is not mapped, HIO_ERR_OUT_OF_RESOURCE if the file path
 *          could not be allocated, or the error from builtin_posix_open_file ().
 */
static int builtin_posix_element_translate_opt (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                size_t *size, hio_file_t **file_out, bool reading) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  hio_context_t context = hioi_object_context (&element->e_object);
  builtin_posix_file_t *file;
  uint64_t file_offset;
  int file_index;
  char *path;
  int rc;

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "translating element %s offset %ld size %lu",
            hioi_object_identifier (&element->e_object), (long) offset, (unsigned long) *size);
  rc = hioi_element_translate_offset (element, offset, &file_index, &file_offset, size);
  if (HIO_SUCCESS != rc) {
    if (reading) {
      hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset not found");
      /* not found */
      return rc;
    }

    /* writing to an unmapped offset: append to this process' data file */
    if (hioi_context_using_mpi (context)) {
      rc = asprintf (&path, "%s/data.%x", posix_dataset->base_path, posix_dataset->base.ds_shared_control->s_master);
      if (0 > rc) {
        return HIO_ERR_OUT_OF_RESOURCE;
      }
    } else {
      rc = asprintf (&path, "%s/data", posix_dataset->base_path);
      if (0 > rc) {
        return HIO_ERR_OUT_OF_RESOURCE;
      }
    }

    file_offset = builtin_posix_reserve (posix_dataset, size);

    /* the dataset file list stores the file name relative to the base path */
    file_index = hioi_dataset_add_file (&posix_dataset->base, strrchr (path, '/') + 1);
    hioi_element_add_segment (element, file_index, file_offset, offset, *size);
  } else {
    hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset found in file @ index %d, offset %lu, size %lu", file_index,
              (unsigned long) file_offset, (unsigned long) *size);
    rc = asprintf (&path, "%s/%s", posix_dataset->base_path, posix_dataset->base.ds_flist[file_index].f_name);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }
  }

  /* pick the open-file slot for this file (simple modulo mapping) */
  int internal_index = file_index % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + internal_index;

  /* f_bid records which dataset file the slot holds, so it must be compared
   * against the file index and not the slot index. otherwise a different
   * file that maps to the same slot could be mistaken for this one. */
  if (file_index != file->f_bid) {
    if (NULL != file->f_file.f_hndl) {
      fclose (file->f_file.f_hndl);
      file->f_file.f_hndl = NULL;
    }

    /* the slot holds no file until the open below succeeds */
    file->f_bid = -1;

    rc = builtin_posix_open_file (posix_module, posix_dataset, path, &file->f_file);
    if (HIO_SUCCESS != rc) {
      free (path);
      return rc;
    }

    file->f_bid = file_index;
  }

  free (path);

  /* only seek when the cached file position differs from the target */
  if (file_offset != file->f_file.f_offset) {
    fseek (file->f_file.f_hndl, file_offset, SEEK_SET);
    file->f_file.f_offset = file_offset;
  }

  *file_out = &file->f_file;

  return HIO_SUCCESS;
}
Esempio n. 15
0
/**
 * Translate an element offset into an open file positioned at the matching
 * block offset (older block-per-file layout).
 *
 * The element is split into blocks of ds_bs bytes and each block is stored in
 * its own file named "<element>_block.<block id>" under the dataset base path.
 * When the dataset is open for writing the file and a segment for the access
 * are also recorded in the dataset/element.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; reduced on return so that the
 *                              access does not cross the end of the block
 * @param[out]    file_out      open file positioned at the block offset
 *
 * @returns HIO_SUCCESS on success, HIO_ERR_OUT_OF_RESOURCE if a path could not
 *          be allocated, or the error from builtin_posix_open_file ().
 */
static int builtin_posix_element_translate_opt_old (builtin_posix_module_t *posix_module, hio_element_t element, off_t offset,
                                                    size_t *size, hio_file_t **file_out) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  hio_context_t context = hioi_object_context (&element->e_object);
  size_t block_id, block_base, block_bound, block_offset;
  builtin_posix_file_t *file;
  int32_t file_index;
  /* index of the block file in the dataset file list (only set when writing) */
  int manifest_index = -1;
  char *path;
  int rc;

  block_id = offset / posix_dataset->base.ds_bs;
  block_base = block_id * posix_dataset->base.ds_bs;
  block_bound = block_base + posix_dataset->base.ds_bs;
  block_offset = offset - block_base;

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "builtin_posix_element_translate: element: %s, offset: %lu, block_id: %lu, "
            "block_offset: %lu, block_size: %lu", hioi_object_identifier(element), (unsigned long) offset,
            (unsigned long) block_id, (unsigned long) block_offset, (unsigned long) posix_dataset->base.ds_bs);

  /* never let a single access cross a block boundary */
  if (offset + *size > block_bound) {
    *size = block_bound - offset;
  }

  rc = asprintf (&path, "%s_block.%lu", hioi_object_identifier(element), (unsigned long) block_id);
  if (0 > rc) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  if (HIO_FLAG_WRITE & posix_dataset->base.ds_flags) {
    /* the dataset file list stores the name relative to the base path */
    manifest_index = hioi_dataset_add_file (&posix_dataset->base, path);
  }

  /* expand the relative file name to the full path */
  char *tmp = path;
  rc = asprintf (&path, "%s/%s", posix_dataset->base_path, tmp);
  free (tmp);
  if (0 > rc) {
    return HIO_ERR_OUT_OF_RESOURCE;
  }

  /* use crc as a hash to pick a file index to use */
  file_index = hioi_crc32 ((uint8_t *) path, strlen (path)) % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + file_index;

  if (block_id != file->f_bid || file->f_element != element) {
    if (file->f_file.f_hndl != NULL) {
      fclose (file->f_file.f_hndl);
      file->f_file.f_hndl = NULL;
      file->f_bid = -1;
    }

    file->f_element = element;

    rc = builtin_posix_open_file (posix_module, posix_dataset, path, &file->f_file);
    if (HIO_SUCCESS != rc) {
      free (path);
      return rc;
    }

    file->f_bid = block_id;
  }

  /* the path is not needed once the file is open */
  free (path);

  /* only seek when the cached file position differs from the target */
  if (block_offset != file->f_file.f_offset) {
    fseek (file->f_file.f_hndl, block_offset, SEEK_SET);
    file->f_file.f_offset = block_offset;
  }

  if (HIO_FLAG_WRITE & posix_dataset->base.ds_flags) {
    hioi_element_add_segment (element, manifest_index, block_offset, offset, *size);
  }

  *file_out = &file->f_file;

  return HIO_SUCCESS;
}
/**
 * Translate an element offset into an open file positioned at the matching
 * file offset (optimized file mode).
 *
 * If the offset is already mapped the mapped file is used. Otherwise, when
 * writing, space is reserved in this process' data file and a new segment is
 * added to the element.
 *
 * @param[in]     posix_module  posix module instance
 * @param[in]     element       element being accessed
 * @param[in]     offset        offset within the element
 * @param[in,out] size          requested size; may be reduced on return
 * @param[out]    file_out      open file, already seeked to the file offset
 * @param[in]     reading       true if the translation is for a read
 *
 * @returns HIO_SUCCESS on success, the translation error if reading and the
 *          offset is not mapped, HIO_ERR_OUT_OF_RESOURCE if the file path
 *          could not be allocated, or the error from builtin_posix_open_file ().
 */
static int builtin_posix_element_translate_opt (builtin_posix_module_t *posix_module, hio_element_t element,
                                                uint64_t offset, size_t *size, hio_file_t **file_out,
                                                bool reading) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) hioi_element_dataset (element);
  hio_context_t context = hioi_object_context (&element->e_object);
  hio_file_t *file;
  uint64_t file_offset;
  int file_index = 0;
  char *path;
  int rc;

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "translating element %s offset %" PRIu64 " size %lu",
            hioi_object_identifier (&element->e_object), offset, *size);
  POSIX_TRACE_CALL(posix_dataset, rc = hioi_element_translate_offset (element, offset, &file_index, &file_offset, size),
                   "translate_offset", offset, *size);
#if HIO_MPI_HAVE(3)
  /* for reads, fall back to the dataset map when the element itself has no mapping */
  if (HIO_SUCCESS != rc && reading) {
    POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_map_translate_offset (element, offset, &file_index, &file_offset, size),
                     "map_translate_offset", offset, *size);
  }
#endif

  if (HIO_SUCCESS != rc) {
    if (reading) {
      hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset %" PRIu64 " not found", offset);
      /* not found */
      return rc;
    }

    /* writing to an unmapped offset: reserve space in this process' data file */
    file_offset = builtin_posix_reserve (posix_dataset, size);

    /* the data file is named after the shared control master when using MPI, 0 otherwise */
    if (hioi_context_using_mpi (context)) {
      file_index = posix_dataset->base.ds_shared_control->s_master;
    } else {
      file_index = 0;
    }

    rc = asprintf (&path, "%s/data/data.%x", posix_dataset->base_path, file_index);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    hioi_element_add_segment (element, file_index, file_offset, offset, *size);
  } else {
    hioi_log (context, HIO_VERBOSE_DEBUG_MED, "offset found in file @ rank %d, offset %" PRIu64
              ", size %lu", file_index, file_offset, *size);
    rc = asprintf (&path, "%s/data/data.%x", posix_dataset->base_path, file_index);
    if (0 > rc) {
      return HIO_ERR_OUT_OF_RESOURCE;
    }

    /* if the file is not readable under data/ try the same name directly
     * under the base path (presumably an older dataset layout -- TODO confirm) */
    if (access (path, R_OK)) {
      free (path);
      rc = asprintf (&path, "%s/data.%x", posix_dataset->base_path, file_index);
      if (0 > rc) {
        return HIO_ERR_OUT_OF_RESOURCE;
      }
    }
  }

  /* pick the open-file slot for this file (simple modulo mapping, not a crc) */
  int internal_index = file_index % HIO_POSIX_MAX_OPEN_FILES;
  file = posix_dataset->files + internal_index;

  /* f_bid records which file the slot currently holds (-1 when none) */
  if (file_index != file->f_bid) {
    if (file->f_bid >= 0) {
      POSIX_TRACE_CALL(posix_dataset, hioi_file_close (file), "file_close", file->f_bid, 0);
    }

    /* the slot holds no file until the open below succeeds */
    file->f_bid = -1;

    POSIX_TRACE_CALL(posix_dataset, rc = builtin_posix_open_file (posix_module, posix_dataset, path, file),
                     "file_open", file_index, 0);
    if (HIO_SUCCESS != rc) {
      free (path);
      return rc;
    }

    file->f_bid = file_index;
  }

  free (path);

  POSIX_TRACE_CALL(posix_dataset, hioi_file_seek (file, file_offset, SEEK_SET), "file_seek", file->f_bid, file_offset);

  *file_out = file;

  return HIO_SUCCESS;
}
Esempio n. 17
0
/**
 * Close a posix dataset.
 *
 * Closes every open data file. If the dataset was opened for writing the
 * manifest is gathered and rank 0 writes it to "manifest.json" (with a ".bz2"
 * suffix when ds_use_bzip is set) under the dataset base path. When MPI is in
 * use the return code is reduced (minimum) across the context communicator.
 *
 * @param[in] dataset  dataset to close
 *
 * @returns HIO_SUCCESS on success, the manifest gather error,
 *          HIO_ERR_OUT_OF_RESOURCE if the manifest path could not be
 *          allocated, or the hio error matching the errno of a failed
 *          open/write/close of the manifest file.
 */
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) dataset->ds_module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  hio_module_t *module = dataset->ds_module;
  unsigned char *manifest = NULL;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  size_t manifest_size = 0;

  start = hioi_gettime ();

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    if (posix_dataset->files[i].f_file.f_hndl != NULL) {
      fclose (posix_dataset->files[i].f_file.f_hndl);
      posix_dataset->files[i].f_file.f_hndl = NULL;
    }
  }

  if (dataset->ds_flags & HIO_FLAG_WRITE) {
    rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, dataset->ds_use_bzip);
    if (HIO_SUCCESS != rc) {
      /* keep the gather error and do not touch the manifest buffer, which is
       * not known to be valid after a failure */
      dataset->ds_status = rc;
    } else {
      if (0 == context->c_rank) {
        char *path;

        if (0 > asprintf (&path, "%s/manifest.json%s", posix_dataset->base_path,
                          dataset->ds_use_bzip ? ".bz2" : "")) {
          rc = HIO_ERR_OUT_OF_RESOURCE;
        } else {
          int write_errno = 0;
          int fd;

          /* NOTE(review): the file is opened without O_TRUNC so a longer
           * pre-existing manifest would keep stale trailing data -- confirm
           * the dataset directory is always fresh when writing */
          fd = open (path, O_CREAT | O_WRONLY, posix_module->access_mode);
          if (0 > fd) {
            write_errno = errno;
          } else {
            ssize_t written = write (fd, manifest, manifest_size);

            if (0 > written) {
              write_errno = errno;
            } else if ((size_t) written < manifest_size) {
              /* a truncated manifest is unusable so treat a short write as an I/O error */
              write_errno = EIO;
            }

            if (0 != close (fd) && 0 == write_errno) {
              write_errno = errno;
            }
          }

          /* errno was captured above so the calls below cannot clobber it */
          rc = hioi_err_errno (write_errno);

          free (path);
          if (HIO_SUCCESS != rc) {
            hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest");
          }
        }
      }

      /* release the manifest on every rank that received one (free (NULL) is a no-op) */
      free (manifest);
    }
  }

#if HIO_USE_MPI
  /* ensure all ranks have closed the dataset before continuing */
  if (hioi_context_using_mpi (context)) {
    MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm);
  }
#endif

  free (posix_dataset->base_path);

  pthread_mutex_destroy (&posix_dataset->lock);

  stop = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_close: successfully closed posix dataset %s:%llu on data root %s. "
            "close time %lu usec", hioi_object_identifier(dataset), (unsigned long long) dataset->ds_id, module->data_root,
            (unsigned long) (stop - start));

  return rc;
}
Esempio n. 18
0
/**
 * Open (or create) a dataset using the posix module.
 *
 * Rank 0 either reads the existing top-level manifest (bzip2 variant first,
 * then the plain one) or creates the dataset directories and serializes a
 * fresh manifest. The result is then handed to hioi_dataset_scatter()
 * together with rank 0's status code. On success the dataset callbacks are
 * installed and the dataset lock is initialized.
 *
 * @param[in] module   module the dataset is being opened with
 * @param[in] dataset  dataset object to open
 *
 * @returns HIO_SUCCESS on success, otherwise the error code of the step
 *          that failed.
 */
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  hio_fs_attr_t *fs_attr;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  char *path = NULL;

  start = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x",
            hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context),
            dataset->ds_flags, dataset->ds_mode);

  rc = builtin_posix_module_dataset_init (module, posix_dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  fs_attr = &posix_dataset->base.ds_fsattr;

  rc = hioi_fs_query (context, module->data_root, fs_attr);
  if (HIO_SUCCESS != rc) {
    hioi_err_push (rc, &context->c_object, "posix: error querying the filesystem");
    return rc;
  }

  if (fs_attr->fs_flags & HIO_FS_SUPPORTS_STRIPING) {
    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_scount,
                     "stripe_count", HIO_CONFIG_TYPE_UINT32, NULL, "Stripe count for all dataset "
                     "data files", 0);

    /* clamp the requested stripe count to what the filesystem reports as its maximum */
    if (fs_attr->fs_scount > fs_attr->fs_smax_count) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe count %u exceeds the available resources. "
                "adjusting to maximum %u", fs_attr->fs_scount, fs_attr->fs_smax_count);
      fs_attr->fs_scount = fs_attr->fs_smax_count;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_ssize,
                     "stripe_size", HIO_CONFIG_TYPE_UINT64, NULL, "Stripe size for all dataset "
                     "data files", 0);

    /* ensure the stripe size is a multiple of the stripe unit */
    fs_attr->fs_ssize = fs_attr->fs_sunit * ((fs_attr->fs_ssize + fs_attr->fs_sunit - 1) / fs_attr->fs_sunit);
    if (fs_attr->fs_ssize > fs_attr->fs_smax_size) {
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: requested stripe size %" PRIu64 " exceeds the maximum %"
                PRIu64 ". ", fs_attr->fs_ssize, fs_attr->fs_smax_size);
      fs_attr->fs_ssize = fs_attr->fs_smax_size;
    }

    hioi_config_add (context, &dataset->ds_object, &fs_attr->fs_raid_level,
                     "raid_level", HIO_CONFIG_TYPE_UINT64, NULL, "RAID level for dataset "
                     "data files. Keep in mind that some filesystems only support 1/2 RAID "
                     "levels", 0);

    /* optimized mode overrides the user-visible striping settings */
    if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
      fs_attr->fs_scount = 1;
      fs_attr->fs_ssize = dataset->ds_bs;
      fs_attr->fs_use_group_locking = true;
    }
  }

  /* only rank 0 touches the filesystem here. every exit from this block leaves
   * the outcome in rc so that it can be passed to hioi_dataset_scatter() below. */
  do {
    if (0 != context->c_rank) {
      break;
    }

    if (dataset->ds_flags & HIO_FLAG_TRUNC) {
      /* blow away the existing dataset */
      (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset),
                                                  dataset->ds_id);

      /* ensure we take the create path later */
      dataset->ds_flags |= HIO_FLAG_CREAT;
    }

    if (!(dataset->ds_flags & HIO_FLAG_CREAT)) {
      /* load manifest. the manifest data will be shared with other processes in hioi_dataset_scatter */
      rc = asprintf (&path, "%s/manifest.json.bz2", posix_dataset->base_path);
      assert (0 < rc);

      if (access (path, F_OK)) {
        /* no compressed manifest. fall back on the uncompressed name */
        free (path);
        rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
        assert (0 < rc);
        if (access (path, F_OK)) {
          hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest");
          /* release the path before leaving the block so it is not leaked */
          free (path);
          path = NULL;
          rc = HIO_ERR_NOT_FOUND;
          break;
        }
      }

      rc = hioi_manifest_read (path, &manifest, &manifest_size);
      free (path);
      path = NULL;
    } else {
      rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset);
      if (HIO_SUCCESS != rc) {
        break;
      }

      rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, true);
    }
  } while (0);

  /* share dataset information will all processes in the communication domain */
  rc = hioi_dataset_scatter (dataset, manifest, manifest_size, rc);

  /* the serialized manifest is no longer needed whether or not the scatter succeeded */
  free (manifest);

  if (HIO_SUCCESS != rc) {
    free (posix_dataset->base_path);
    return rc;
  }

  if (HIO_FILE_MODE_OPTIMIZED == dataset->ds_fmode) {
    if (HIO_SET_ELEMENT_UNIQUE == dataset->ds_mode || 2 > context->c_size || NULL == dataset->ds_shared_control) {
      posix_dataset->base.ds_fmode = HIO_FILE_MODE_BASIC;
      /* NTH: no optimized mode for N->N yet */
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this "
                "dataset mode. falling back to basic file mode");
    }
  }

  dataset->ds_module = module;
  dataset->ds_close = builtin_posix_module_dataset_close;
  dataset->ds_element_open = builtin_posix_module_element_open;
  dataset->ds_process_reqs = builtin_posix_module_process_reqs;

  pthread_mutex_init (&posix_dataset->lock, NULL);

  /* record the open time */
  gettimeofday (&dataset->ds_otime, NULL);

  stop = hioi_gettime ();

  /* PRIu64 matches the uint64_t arguments (dataset id and elapsed time) */
  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset %s:%" PRIu64 " on data root %s. "
            "open time %" PRIu64 " usec", (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened", hioi_object_identifier(dataset),
            dataset->ds_id, module->data_root, stop - start);

  return HIO_SUCCESS;
}
/**
 * Build the list of data manifest ids this process is responsible for.
 *
 * Processes whose c_shared_rank is not 0 return immediately with an empty
 * list. Rank 0 scans the dataset directory for entries matching
 * "manifest.<hex id>.json", sorts the ids, pads the list with -1, and the
 * list is divided evenly over context->c_node_leader_comm with MPI_Scatter.
 *
 * @param[in]  posix_dataset  dataset whose base_path is scanned
 * @param[out] manifest_ids   malloc'd array of ids (caller frees), or NULL
 * @param[out] count          number of entries in *manifest_ids. unused
 *                            trailing slots hold -1
 *
 * @returns HIO_SUCCESS on success, or a negative error code if the leader
 *          list could not be generated or the directory could not be opened.
 */
static int builtin_posix_module_dataset_manifest_list (builtin_posix_module_dataset_t *posix_dataset, int **manifest_ids, size_t *count) {
  hio_context_t context = hioi_object_context (&posix_dataset->base.ds_object);
  int num_manifest_ids = 0, manifest_id_index = 0;
  unsigned int manifest_id;
  int rc = HIO_SUCCESS;
  int *tmp = NULL;
  struct dirent *dp;
  DIR *dir = NULL;

  *manifest_ids = NULL;
  *count = 0;

  rc = hioi_context_generate_leader_list (context);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  if (0 != context->c_shared_rank) {
    return HIO_SUCCESS;
  }

  do {
    if (0 != context->c_rank) {
      break;
    }

    dir = opendir (posix_dataset->base_path);
    if (NULL == dir) {
      /* the (negative) error code is broadcast in place of the count below */
      num_manifest_ids = hioi_err_errno (errno);
      break;
    }

    /* first pass: count the manifests. sscanf must report exactly one conversion. it
     * returns EOF (non-zero) when the name ends before the id, which is not a match. */
    while (NULL != (dp = readdir (dir))) {
      if (dp->d_name[0] != '.' && 1 == sscanf (dp->d_name, "manifest.%x.json", &manifest_id)) {
        ++num_manifest_ids;
      }
    }

    if (0 == num_manifest_ids) {
      break;
    }

    /* round up to a multiple of the number nodes */
    num_manifest_ids = context->c_node_count * ((num_manifest_ids + context->c_node_count - 1) / context->c_node_count);

    tmp = (int *) malloc (num_manifest_ids * sizeof (int));
    assert (NULL != tmp);
    /* fill with -1 so that unused trailing slots can be recognized by the reader */
    memset (tmp, 0xff, sizeof (int) * num_manifest_ids);

    rewinddir (dir);

    /* second pass: collect the ids. the bound protects tmp if files appeared in the
     * directory between the two passes. */
    while (manifest_id_index < num_manifest_ids && NULL != (dp = readdir (dir))) {
      if ('.' == dp->d_name[0] || 1 != sscanf (dp->d_name, "manifest.%x.json", &manifest_id)) {
        continue;
      }

      tmp[manifest_id_index++] = (int) manifest_id;
    }

    /* put manifest files in numerical order */
    qsort (tmp, manifest_id_index, sizeof (int), manifest_index_compare);
  } while (0);

  /* dir is only non-NULL on rank 0 after a successful opendir */
  if (NULL != dir) {
    closedir (dir);
  }

  /* convert the padded total into the number of ids each node leader receives */
  if (num_manifest_ids > 0) {
    num_manifest_ids /= context->c_node_count;
  }

  MPI_Bcast (&num_manifest_ids, 1, MPI_INT, 0, context->c_node_leader_comm);

  if (0 < num_manifest_ids) {
    *manifest_ids = (int *) malloc (num_manifest_ids * sizeof (int));
    assert (NULL != *manifest_ids);

    MPI_Scatter (tmp, num_manifest_ids, MPI_INT, *manifest_ids, num_manifest_ids, MPI_INT,
                 0, context->c_node_leader_comm);
  }

  free (tmp);

  if (0 > num_manifest_ids) {
    /* error code from rank 0. leave *count at 0 rather than storing a negative value in a size_t */
    return num_manifest_ids;
  }

  *count = num_manifest_ids;

  return HIO_SUCCESS;
}
/**
 * Close a posix dataset.
 *
 * Closes every open file slot and, when the dataset was opened for writing,
 * gathers the manifest and has rank 0 save it as <base_path>/manifest.json.
 * In optimized file mode (MPI-3 builds) a data manifest is additionally
 * gathered over context->c_shared_comm and saved by whichever process ends
 * up holding it. Finally the base path is released and the trace file, if
 * any, is closed.
 *
 * A failure in any step is remembered and the function keeps going, so that
 * every rank still reaches the MPI_Allreduce and all resources are released.
 *
 * @param[in] dataset  dataset to close
 *
 * @returns HIO_SUCCESS, or the first error recorded on this rank. When MPI
 *          is in use the value is reduced with MPI_MIN over context->c_comm.
 */
static int builtin_posix_module_dataset_close (hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  hio_module_t *module = dataset->ds_module;
  unsigned char *manifest = NULL;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  /* first error seen on this rank. rc itself is reused by every step */
  int status = HIO_SUCCESS;
  size_t manifest_size = 0;

  start = hioi_gettime ();

  for (int i = 0 ; i < HIO_POSIX_MAX_OPEN_FILES ; ++i) {
    if (posix_dataset->files[i].f_bid >= 0) {
      POSIX_TRACE_CALL(posix_dataset, hioi_file_close (posix_dataset->files + i), "file_close",
                       posix_dataset->files[i].f_bid, 0);
    }
  }

#if HIO_MPI_HAVE(3)
  /* release the shared state if it was allocated */
  (void) hioi_dataset_shared_fini (dataset);

  /* release the dataset map if one was allocated */
  (void) hioi_dataset_map_release (dataset);
#endif


  if (dataset->ds_flags & HIO_FLAG_WRITE) {
    char *path;

    /* write manifest header */
    POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest (dataset, &manifest, &manifest_size, false, true),
                     "gather_manifest", 0, 0);
    if (HIO_SUCCESS != rc) {
      dataset->ds_status = rc;
      status = rc;
    }

    if (0 == context->c_rank) {
      rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
      if (0 > rc) {
        /* out of memory. the manifest can not be written, but do not return here:
         * the other ranks are waiting in the reduction below. */
        rc = hioi_err_errno (errno);
      } else {
        rc = hioi_manifest_save (dataset, manifest, manifest_size, path);
        free (path);
        if (HIO_SUCCESS != rc) {
          hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest");
        }
      }

      /* reset the pointer so the NULL check on the data manifest below never sees a freed buffer */
      free (manifest);
      manifest = NULL;

      if (HIO_SUCCESS != rc && HIO_SUCCESS == status) {
        status = rc;
      }
    }

#if HIO_MPI_HAVE(3)
    if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) {
      /* optimized mode requires a data manifest to describe how the data landed on the filesystem */
      POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_gather_manifest_comm (dataset, context->c_shared_comm, &manifest, &manifest_size,
                                                                              posix_dataset->ds_use_bzip, false),
                       "gather_manifest", 0, 0);
      if (HIO_SUCCESS != rc) {
        dataset->ds_status = rc;
        if (HIO_SUCCESS == status) {
          status = rc;
        }
      }

      if (NULL != manifest) {
        rc = asprintf (&path, "%s/manifest.%x.json%s", posix_dataset->base_path, context->c_rank,
                       posix_dataset->ds_use_bzip ? ".bz2" : "");
        if (0 > rc) {
          rc = hioi_err_errno (errno);
        } else {
          rc = hioi_manifest_save (dataset, manifest, manifest_size, path);
          free (path);
          if (HIO_SUCCESS != rc) {
            hioi_err_push (rc, &dataset->ds_object, "posix: error writing dataset manifest");
          }
        }

        free (manifest);
        manifest = NULL;

        if (HIO_SUCCESS != rc && HIO_SUCCESS == status) {
          status = rc;
        }
      }
    }
#endif
  }

  /* report the first failure instead of whatever the last step happened to return */
  rc = status;

#if HIO_MPI_HAVE(1)
  /* ensure all ranks have closed the dataset before continuing */
  if (hioi_context_using_mpi (context)) {
    MPI_Allreduce (MPI_IN_PLACE, &rc, 1, MPI_INT, MPI_MIN, context->c_comm);
  }
#endif

  free (posix_dataset->base_path);

  stop = hioi_gettime ();

  builtin_posix_trace (posix_dataset, "close", 0, 0, start, stop);

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_close: successfully closed posix dataset "
            "%s:%" PRIu64 " on data root %s. close time %" PRIu64 " usec", hioi_object_identifier(dataset),
            dataset->ds_id, module->data_root, stop - start);

  builtin_posix_trace (posix_dataset, "trace_end", 0, 0, 0, 0);
  if (posix_dataset->ds_trace_fh) {
    fclose (posix_dataset->ds_trace_fh);
  }

  return rc;
}
/**
 * Open (or create) a dataset using the posix module.
 *
 * After initializing the dataset and its striping configuration, rank 0
 * either reads <base_path>/manifest.json (existing dataset) or creates the
 * dataset directories and serializes a new manifest header (HIO_FLAG_CREAT).
 * With MPI available the header and rank 0's status are passed to
 * hioi_dataset_scatter_comm(). In MPI-3 builds the data manifests of an
 * existing optimized-mode dataset are loaded and shared state is set up.
 * On success the dataset callbacks are installed.
 *
 * @param[in] module   module the dataset is being opened with
 * @param[in] dataset  dataset object to open
 *
 * @returns HIO_SUCCESS on success, otherwise the error code of the step
 *          that failed.
 */
static int builtin_posix_module_dataset_open (struct hio_module_t *module, hio_dataset_t dataset) {
  builtin_posix_module_dataset_t *posix_dataset = (builtin_posix_module_dataset_t *) dataset;
  builtin_posix_module_t *posix_module = (builtin_posix_module_t *) module;
  hio_context_t context = hioi_object_context ((hio_object_t) dataset);
  unsigned char *manifest = NULL;
  size_t manifest_size = 0;
  uint64_t start, stop;
  int rc = HIO_SUCCESS;
  char *path = NULL;

  start = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_MED, "posix:dataset_open: opening dataset %s:%lu mpi: %d flags: 0x%x mode: 0x%x",
            hioi_object_identifier (dataset), (unsigned long) dataset->ds_id, hioi_context_using_mpi (context),
            dataset->ds_flags, dataset->ds_mode);

  rc = builtin_posix_module_dataset_init (module, posix_dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  rc = builtin_posix_module_setup_striping (context, module, dataset);
  if (HIO_SUCCESS != rc) {
    return rc;
  }

  if (HIO_FILE_MODE_STRIDED == posix_dataset->ds_fmode) {
    hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_fcount,
                     "dataset_file_count", HIO_CONFIG_TYPE_UINT64, NULL, "Number of files to use "
                     "in strided file mode", 0);
  } else if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) {
    /* compression defaults to on in optimized mode. the configuration variable can override it */
    posix_dataset->ds_use_bzip = true;
    hioi_config_add (context, &dataset->ds_object, &posix_dataset->ds_use_bzip,
                     "dataset_use_bzip", HIO_CONFIG_TYPE_BOOL, NULL,
                     "Use bzip2 compression for dataset manifests", 0);
  }

  if (dataset->ds_flags & HIO_FLAG_TRUNC) {
    /* blow away the existing dataset */
    if (0 == context->c_rank) {
      (void) builtin_posix_module_dataset_unlink (module, hioi_object_identifier(dataset),
                                                  dataset->ds_id);
    }
  }

  if (!(dataset->ds_flags & HIO_FLAG_CREAT)) {
    /* only rank 0 has a manifest path, so only rank 0 may read the manifest. the other
     * ranks keep rc == HIO_SUCCESS, exactly as they do on the create path below. */
    if (0 == context->c_rank) {
      /* load manifest. the manifest data will be shared with other processes in hioi_dataset_scatter */
      rc = asprintf (&path, "%s/manifest.json", posix_dataset->base_path);
      assert (0 < rc);
      if (access (path, F_OK)) {
        /* this should never happen on a valid dataset */
        hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: could not find top-level manifest %s", path);
        rc = HIO_ERR_NOT_FOUND;
      } else {
        /* read the manifest since it exists */
        hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: loading manifest header from %s...", path);
        rc = hioi_manifest_read (path, &manifest, &manifest_size);
      }

      /* the path is released only after the last message that prints it */
      free (path);
      path = NULL;
    }
  } else if (0 == context->c_rank) {
    rc = builtin_posix_create_dataset_dirs (posix_module, posix_dataset);
    if (HIO_SUCCESS == rc) {
      /* serialize the manifest to send to remote ranks */
      rc = hioi_manifest_serialize (dataset, &manifest, &manifest_size, false, false);
    }
  }

#if HIO_MPI_HAVE(1)
  /* share dataset header will all processes in the communication domain */
  rc = hioi_dataset_scatter_comm (dataset, context->c_comm, manifest, manifest_size, rc);
#endif
  free (manifest);
  if (HIO_SUCCESS != rc) {
    free (posix_dataset->base_path);
    return rc;
  }

  if (context->c_enable_tracing) {
    char *trace_path;

    rc = asprintf (&trace_path, "%s/trace/trace.%d", posix_dataset->base_path, context->c_rank);
    if (rc > 0) {
      posix_dataset->ds_trace_fh = fopen (trace_path, "a");
      free (trace_path);
    }

    builtin_posix_trace (posix_dataset, "trace_begin", 0, 0, 0, 0);
  }

#if HIO_MPI_HAVE(3)
  if (!(dataset->ds_flags & HIO_FLAG_CREAT) && HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) {
    rc = bultin_posix_scatter_data (posix_dataset);
    if (HIO_SUCCESS != rc) {
      /* the close callback is not installed yet, so release the trace file here */
      if (posix_dataset->ds_trace_fh) {
        fclose (posix_dataset->ds_trace_fh);
        posix_dataset->ds_trace_fh = NULL;
      }
      free (posix_dataset->base_path);
      return rc;
    }
  }

  /* if possible set up a shared memory window for this dataset */
  POSIX_TRACE_CALL(posix_dataset, hioi_dataset_shared_init (dataset, 1), "shared_init", 0, 0);

  if (HIO_FILE_MODE_OPTIMIZED == posix_dataset->ds_fmode) {
    if (2 > context->c_size || NULL == dataset->ds_shared_control) {
      /* no point in using optimized mode in this case */
      posix_dataset->ds_fmode = HIO_FILE_MODE_BASIC;
      hioi_log (context, HIO_VERBOSE_WARN, "posix:dataset_open: optimized file mode requested but not supported in this "
                "dataset mode. falling back to basic file mode, path: %s", posix_dataset->base_path);
    } else if (HIO_SET_ELEMENT_SHARED == dataset->ds_mode) {
      POSIX_TRACE_CALL(posix_dataset, rc = hioi_dataset_generate_map (dataset), "generate_map", 0, 0);
      if (HIO_SUCCESS != rc) {
        /* the close callback is not installed yet, so release the trace file here */
        if (posix_dataset->ds_trace_fh) {
          fclose (posix_dataset->ds_trace_fh);
          posix_dataset->ds_trace_fh = NULL;
        }
        free (posix_dataset->base_path);
        return rc;
      }
    }
  }

  /* NTH: if requested more code is needed to load an optimized dataset with an older MPI */
#endif /* HIO_MPI_HAVE(3) */

  dataset->ds_module = module;
  dataset->ds_close = builtin_posix_module_dataset_close;
  dataset->ds_element_open = builtin_posix_module_element_open;
  dataset->ds_process_reqs = builtin_posix_module_process_reqs;

  /* record the open time */
  gettimeofday (&dataset->ds_otime, NULL);

  stop = hioi_gettime ();

  hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: successfully %s posix dataset "
            "%s:%" PRIu64 " on data root %s. open time %" PRIu64 " usec",
            (dataset->ds_flags & HIO_FLAG_CREAT) ? "created" : "opened", hioi_object_identifier(dataset),
            dataset->ds_id, module->data_root, stop - start);

  builtin_posix_trace (posix_dataset, "open", 0, 0, start, stop);

  return HIO_SUCCESS;
}
/**
 * Load the data manifests of an existing dataset and share them.
 *
 * In HIO_SET_ELEMENT_UNIQUE mode only the manifest named after this rank is
 * considered. Otherwise the ids come from
 * builtin_posix_module_dataset_manifest_list(). For each id the bzip2 file
 * name is tried first, then the plain one. Every manifest found is read and
 * merged into a single buffer. The merged buffer and the status are then
 * handed to hioi_dataset_scatter_unique() or hioi_dataset_scatter_comm()
 * (over context->c_shared_comm).
 *
 * A missing manifest file is not an error: the id is simply skipped.
 *
 * @param[in] posix_dataset  dataset being opened
 *
 * @returns the status returned by the scatter call, or the error from
 *          building the manifest id list.
 */
static int bultin_posix_scatter_data (builtin_posix_module_dataset_t *posix_dataset) {
  hio_context_t context = hioi_object_context ((hio_object_t) posix_dataset);
  size_t manifest_size = 0, manifest_id_count = 0;
  unsigned char *manifest = NULL;
  int rc = HIO_SUCCESS;
  int *manifest_ids = NULL;
  char *path;

  if (HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    /* only read the manifest this rank wrote */
    manifest_id_count = 1;
    manifest_ids = malloc (sizeof (*manifest_ids));
    assert (NULL != manifest_ids);
    manifest_ids[0] = context->c_rank;
  } else {
    rc = builtin_posix_module_dataset_manifest_list (posix_dataset, &manifest_ids, &manifest_id_count);
    if (HIO_SUCCESS != rc) {
      return rc;
    }
  }

  for (size_t i = 0 ; i < manifest_id_count ; ++i) {
    /* length returned by asprintf. kept apart from rc so that a path length is never
     * passed to the scatter call below as if it were a status code. */
    int path_len;

    if (-1 == manifest_ids[i]) {
      /* nothing more to do */
      break;
    }

    hioi_log (context, HIO_VERBOSE_DEBUG_LOW, "posix:dataset_open: reading manifest data from id %x\n",
              manifest_ids[i]);

    /* when writing the manifest in optimized mode each IO manager writes its own manifest. try
     * to open the manifest. if a manifest does not exist then it is likely this rank did not
     * write a manifest. IO managers will distribute the manifest data to the appropriate ranks
     * in hioi_dataset_scatter(). */
    path_len = asprintf (&path, "%s/manifest.%x.json.bz2", posix_dataset->base_path, manifest_ids[i]);
    assert (0 < path_len);

    if (access (path, F_OK)) {
      free (path);
      /* Check for a non-bzip'd manifest file. */
      path_len = asprintf (&path, "%s/manifest.%x.json", posix_dataset->base_path, manifest_ids[i]);
      assert (0 < path_len);
      if (access (path, F_OK)) {
        /* no manifest found. this might be a non-optimized file format or this rank may not be an
         * IO master rank. */
        free (path);
        path = NULL;
      }
    }

    /* silence unused-variable warnings when assert() is compiled out */
    (void) path_len;

    if (path) {
      unsigned char *tmp = NULL;
      size_t tmp_size = 0;
      /* read the manifest if it exists */
      rc = hioi_manifest_read (path, &tmp, &tmp_size);
      if (HIO_SUCCESS == rc) {
        rc = hioi_manifest_merge_data2 (&manifest, &manifest_size, tmp, tmp_size);
        free (tmp);
      }

      free (path);

      if (HIO_SUCCESS != rc) {
        break;
      }
    }
  }

  /* share dataset information with all processes on this node */
  if (HIO_SET_ELEMENT_UNIQUE == posix_dataset->base.ds_mode) {
    rc = hioi_dataset_scatter_unique (&posix_dataset->base, manifest, manifest_size, rc);
  } else {
    rc = hioi_dataset_scatter_comm (&posix_dataset->base, context->c_shared_comm, manifest, manifest_size, rc);
  }

  free (manifest_ids);
  free (manifest);

  return rc;
}