Пример #1
0
/* broadcast dataset hash from smallest rank we can find that has a copy */
static int scr_distribute_datasets(scr_filemap* map, int id)
{
  int i;

  /* create a new hash to record dataset descriptor */
  scr_hash* send_hash = scr_hash_new();

  /* for this dataset, get list of ranks we have data for */
  int  nranks = 0;
  int* ranks = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &nranks, &ranks);

  /* for each rank we have files for,
   * check whether we also have its dataset descriptor */
  int invalid_rank_found = 0;
  int have_dset = 0;
  for (i=0; i < nranks; i++) {
    /* get the rank id */
    int rank = ranks[i];

    /* check that the rank is within range */
    if (rank < 0 || rank >= scr_ranks_world) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        rank, scr_ranks_world, __FILE__, __LINE__
      );
      invalid_rank_found = 1;
    }

    /* lookup the dataset descriptor hash for this rank */
    scr_hash* desc = scr_hash_new();
    scr_filemap_get_dataset(map, id, rank, desc);

    /* if this descriptor has entries, add it to our send hash,
     * delete the hash otherwise */
    if (scr_hash_size(desc) > 0) {
      have_dset = 1;
      scr_hash_merge(send_hash, desc);
      scr_hash_delete(&desc);
      break;
    } else {
      scr_hash_delete(&desc);
    }
  }

  /* free off our list of ranks */
  scr_free(&ranks);

  /* check that we didn't find an invalid rank on any process */
  if (! scr_alltrue(invalid_rank_found == 0)) {
    scr_hash_delete(&send_hash);
    return SCR_FAILURE;
  }

  /* identify the smallest rank that has the dataset */
  int source_rank = scr_ranks_world;
  if (have_dset) {
    source_rank = scr_my_rank_world;
  }
  int min_rank;
  MPI_Allreduce(&source_rank, &min_rank, 1, MPI_INT, MPI_MIN, scr_comm_world);

  /* if there is no rank, return with failure */
  if (min_rank >= scr_ranks_world) {
    scr_hash_delete(&send_hash);
    return SCR_FAILURE;
  }

  /* otherwise, bcast the dataset from the minimum rank */
  if (scr_my_rank_world != min_rank) {
    scr_hash_unset_all(send_hash);
  }
  scr_hash_bcast(send_hash, min_rank, scr_comm_world);

  /* record the descriptor in our filemap */
  scr_filemap_set_dataset(map, id, scr_my_rank_world, send_hash);
  scr_filemap_write(scr_map_file, map);

  /* TODO: at this point, we could delete descriptors for other
   * ranks for this checkpoint */

  /* free off our send hash */
  scr_hash_delete(&send_hash);

  return SCR_SUCCESS;
}
Пример #2
0
/* read contents of summary file */
static int scr_fetch_summary(
  const char* summary_dir,
  scr_hash* file_list)
{
  /* assume that we won't succeed in our fetch attempt */
  int rc = SCR_SUCCESS;

  /* check whether summary file exists and is readable */
  if (scr_my_rank_world == 0) {
    /* check that we can access the directory */
    if (scr_file_is_readable(summary_dir) != SCR_SUCCESS) {
      scr_err("Failed to access summary directory %s @ %s:%d",
        summary_dir, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }
  }

  /* broadcast success code from rank 0 */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    return rc;
  }

  /* add path to file list */
  scr_hash_util_set_str(file_list, SCR_KEY_PATH, summary_dir);

  /* build path to summary file */
  scr_path* dataset_path = scr_path_from_str(summary_dir);
  scr_path* meta_path = scr_path_dup(dataset_path);
  scr_path_append_str(meta_path, ".scr");
  scr_path_reduce(meta_path);

  /* rank 0 reads the summary file */
  scr_hash* header = scr_hash_new();
  if (scr_my_rank_world == 0) {
    /* build path to summary file */
    scr_path* summary_path = scr_path_dup(meta_path);
    scr_path_append_str(summary_path, "summary.scr");
    const char* summary_file = scr_path_strdup(summary_path);

    /* open file for reading */
    int fd = scr_open(summary_file, O_RDONLY);
    if (fd >= 0) {
      /* read summary hash */
      ssize_t header_size = scr_hash_read_fd(summary_file, fd, header);
      if (header_size < 0) {
        rc = SCR_FAILURE;
      }

      /* TODO: check that the version is correct */

      /* close the file */
      scr_close(summary_file, fd);
    } else {
      scr_err("Failed to open summary file %s @ %s:%d",
        summary_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }

    /* free summary path and string */
    scr_free(&summary_file);
    scr_path_delete(&summary_path);
  }

  /* broadcast success code from rank 0 */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    goto cleanup;
  }

  /* broadcast the summary hash */
  scr_hash_bcast(header, 0, scr_comm_world);

  /* extract and record the datast in file list */
  scr_hash* dataset_hash = scr_hash_new();
  scr_dataset* dataset = scr_hash_get(header, SCR_SUMMARY_6_KEY_DATASET);
  scr_hash_merge(dataset_hash, dataset);
  scr_hash_set(file_list, SCR_SUMMARY_6_KEY_DATASET, dataset_hash);

  /* build path to rank2file map */
  scr_path* rank2file_path = scr_path_dup(meta_path);
  scr_path_append_str(rank2file_path, "rank2file.scr");

  /* fetch file names and offsets containing file hash data */
  int valid = 0;
  char* file = NULL;
  unsigned long offset = 0;
  if (scr_my_rank_world == 0) {
    /* rank 0 is only valid reader to start with */
    valid  = 1;
    file   = scr_path_strdup(rank2file_path);
    offset = 0;
  }
  if (scr_fetch_rank2file_map(dataset_path, 1, &valid, &file, &offset)
      != SCR_SUCCESS)
  {
    rc = SCR_FAILURE;
  }

  /* create hashes to exchange data */
  scr_hash* send = scr_hash_new();
  scr_hash* recv = scr_hash_new();

  /* read data from file */
  if (valid) {
    /* open file if necessary */
    int fd = scr_open(file, O_RDONLY);
    if (fd >= 0) {
      /* create hash to hold file contents */
      scr_hash* save = scr_hash_new();

      /* read hash from file */
      scr_lseek(file, fd, offset, SEEK_SET);
      ssize_t readsize = scr_hash_read_fd(file, fd, save);
      if (readsize < 0) {
        scr_err("Failed to read rank2file map file %s @ %s:%d",
          file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* check that the number of ranks match */
      int ranks = 0;
      scr_hash_util_get_int(save, SCR_SUMMARY_6_KEY_RANKS, &ranks);
      if (ranks != scr_ranks_world) {
        scr_err("Invalid number of ranks in %s, got %d expected %d @ %s:%d",
          file, ranks, scr_ranks_world, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* delete current send hash, set it to values from file,
       * delete file hash */
      scr_hash_delete(&send);
      send = scr_hash_extract(save, SCR_SUMMARY_6_KEY_RANK);
      scr_hash_delete(&save);

      /* close the file */
      scr_close(file, fd);
    } else {
      scr_err("Failed to open rank2file map %s @ %s:%d",
        file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }

    /* delete file name string */
    scr_free(&file);
  }

  /* check that everyone read the data ok */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto cleanup_hashes;
  }

  /* scatter to groups */
  scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT);

  /* iterate over the ranks that sent data to us, and set up our
   * list of files */
  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(recv);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* the key is the source rank, which we don't care about,
     * the info we need is in the element hash */
    scr_hash* elem_hash = scr_hash_elem_hash(elem);

    /* get pointer to file hash */
    scr_hash* file_hash = scr_hash_get(elem_hash, SCR_SUMMARY_6_KEY_FILE);
    if (file_hash != NULL) {
      /* TODO: parse summary file format */
      scr_hash_merge(file_list, elem_hash);
    } else {
      rc = SCR_FAILURE;
    }
  }

  /* fill in file list parameters */
  if (rc == SCR_SUCCESS) {
    /* if we're not using containers, add PATH entry for each of our
     * files */
    scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);
    for (elem = scr_hash_elem_first(files);
         elem != NULL;
         elem = scr_hash_elem_next(elem))
    {
      /* get the file name */
      char* file = scr_hash_elem_key(elem);

      /* combine the file name with the summary directory to build a
       * full path to the file */
      scr_path* path_full = scr_path_dup(dataset_path);
      scr_path_append_str(path_full, file);

      /* subtract off last component to get just the path */
      scr_path_dirname(path_full);
      char* path = scr_path_strdup(path_full);

      /* record path in file list */
      scr_hash* hash = scr_hash_elem_hash(elem);
      scr_hash_util_set_str(hash, SCR_KEY_PATH, path);

      /* free the path and string */
      scr_free(&path);
      scr_path_delete(&path_full);
    }
  }

  /* check that everyone read the data ok */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto cleanup_hashes;
  }

cleanup_hashes:
  /* delete send and receive hashes */
  scr_hash_delete(&recv);
  scr_hash_delete(&send);

  /* free string and path for rank2file map */
  scr_path_delete(&rank2file_path);

cleanup:
  /* free the header hash */
  scr_hash_delete(&header);

  /* free path for dataset directory */
  scr_path_delete(&meta_path);
  scr_path_delete(&dataset_path);

  return rc;
}