Ejemplo n.º 1
0
/* writes the specified command to the transfer file */
int set_transfer_file_state(char* s, int done)
{
  /* get a hash to store file data */
  scr_hash* hash = scr_hash_new();

  /* attempt to read the file transfer file */
  int fd = -1;
  if (scr_hash_lock_open_read(scr_transfer_file, &fd, hash) == SCR_SUCCESS) {
    /* set the state */
    scr_hash_util_set_str(hash, SCR_TRANSFER_KEY_STATE, s);

    /* set the flag if we're done */
    if (done) {
      scr_hash_set_kv(hash, SCR_TRANSFER_KEY_FLAG, SCR_TRANSFER_KEY_FLAG_DONE);
    }

    /* write the hash back to the file */
    scr_hash_write_close_unlock(scr_transfer_file, &fd, hash);
  }

  /* delete the hash */
  scr_hash_delete(&hash);

  return SCR_SUCCESS;
}
Ejemplo n.º 2
0
/* adds a location for the specified dataset id to the flush file */
int scr_flush_file_location_set(int id, const char* location)
{
  /* only rank 0 updates the file */
  if (scr_my_rank_world == 0) {
    /* read the flush file into hash */
    scr_hash* hash = scr_hash_new();
    scr_hash_read_path(scr_flush_file, hash);

    /* set the location for this dataset */
    scr_hash* dset_hash = scr_hash_set_kv_int(hash, SCR_FLUSH_KEY_DATASET, id);
    scr_hash_set_kv(dset_hash, SCR_FLUSH_KEY_LOCATION, location);

    /* write the hash back to the flush file */
    scr_hash_write_path(scr_flush_file, hash);

    /* delete the hash */
    scr_hash_delete(&hash);
  }
  return SCR_SUCCESS;
}
Ejemplo n.º 3
0
/* since on a restart we may end up with more or fewer ranks on a node than the
 * previous run, rely on the master to read in and distribute the filemap to
 * other ranks on the node */
int scr_scatter_filemaps(scr_filemap* my_map)
{
  /* TODO: if the control directory is on a device shared by lots of procs,
   * we should read and distribute this data in a more scalable way */

  /* allocate empty send hash */
  scr_hash* send_hash = scr_hash_new();

  /* if i'm the master on this node, read in all filemaps */
  if (scr_storedesc_cntl->rank == 0) {
    /* create an empty filemap */
    scr_filemap* all_map = scr_filemap_new();

    /* read in the master map */
    scr_hash* hash = scr_hash_new();
    scr_hash_read_path(scr_master_map_file, hash);

    /* for each filemap listed in the master map */
    scr_hash_elem* elem;
    for (elem = scr_hash_elem_first(scr_hash_get(hash, "Filemap"));
         elem != NULL;
         elem = scr_hash_elem_next(elem))
    {
      /* get the filename of this filemap */
      char* file = scr_hash_elem_key(elem);

      /* TODO MEMFS: mount storage for each filemap */

      /* read in the filemap */
      scr_filemap* tmp_map = scr_filemap_new();
      scr_path* path_file = scr_path_from_str(file);
      scr_filemap_read(path_file, tmp_map);
      scr_path_delete(&path_file);

      /* merge it with the all_map */
      scr_filemap_merge(all_map, tmp_map);

      /* delete filemap */
      scr_filemap_delete(&tmp_map);

      /* TODO: note that if we fail after unlinking this file but before
       * writing out the new file, we'll lose information */

      /* delete the file */
      scr_file_unlink(file);
    }

    /* free the hash object */
    scr_hash_delete(&hash);

    /* write out new local 0 filemap */
    if (scr_filemap_num_ranks(all_map) > 0) {
      scr_filemap_write(scr_map_file, all_map);
    }

    /* get global rank of each rank */
    int* ranks = (int*) SCR_MALLOC(scr_storedesc_cntl->ranks * sizeof(int));
    MPI_Gather(
      &scr_my_rank_world, 1, MPI_INT, ranks, 1, MPI_INT,
      0, scr_storedesc_cntl->comm
    );

    /* for each rank, send them their own file data if we have it */
    int i;
    for (i=0; i < scr_storedesc_cntl->ranks; i++) {
      int rank = ranks[i];
      if (scr_filemap_have_rank(all_map, rank)) {
        /* extract the filemap for this rank */
        scr_filemap* tmp_map = scr_filemap_extract_rank(all_map, rank);

        /* get a reference to the hash object that we'll send to this rank,
         * and merge this filemap into it */
        scr_hash* tmp_hash = scr_hash_getf(send_hash, "%d", i);
        if (tmp_hash == NULL) {
          /* if we don't find an existing entry in the send_hash,
           * create an empty hash and insert it */
          scr_hash* empty_hash = scr_hash_new();
          scr_hash_setf(send_hash, empty_hash, "%d", i);
          tmp_hash = empty_hash;
        }
        scr_hash_merge(tmp_hash, tmp_map);

        /* delete the filemap for this rank */
        scr_filemap_delete(&tmp_map);
      }
    }

    /* free our rank list */
    scr_free(&ranks);

    /* now just round robin the remainder across the set (load balancing) */
    int num;
    int* remaining_ranks = NULL;
    scr_filemap_list_ranks(all_map, &num, &remaining_ranks);

    int j = 0;
    while (j < num) {
      /* pick a rank in to send to */
      i = j % scr_storedesc_cntl->ranks;

      /* extract the filemap for this rank */
      scr_filemap* tmp_map = scr_filemap_extract_rank(all_map, remaining_ranks[j]);

      /* get a reference to the hash object that we'll send to this rank,
       * and merge this filemap into it */
      scr_hash* tmp_hash = scr_hash_getf(send_hash, "%d", i);
      if (tmp_hash == NULL) {
        /* if we don't find an existing entry in the send_hash,
         * create an empty hash and insert it */
        scr_hash* empty_hash = scr_hash_new();
        scr_hash_setf(send_hash, empty_hash, "%d", i);
        tmp_hash = empty_hash;
      }
      scr_hash_merge(tmp_hash, tmp_map);

      /* delete the filemap for this rank */
      scr_filemap_delete(&tmp_map);
      j++;
    }

    scr_free(&remaining_ranks);

    /* delete the filemap */
    scr_filemap_delete(&all_map);

    /* write out the new master filemap */
    hash = scr_hash_new();
    char file[SCR_MAX_FILENAME];
    for (i=0; i < scr_storedesc_cntl->ranks; i++) {
      sprintf(file, "%s/filemap_%d.scrinfo", scr_cntl_prefix, i);
      scr_hash_set_kv(hash, "Filemap", file);
    }
    scr_hash_write_path(scr_master_map_file, hash);
    scr_hash_delete(&hash);
  } else {
    /* send our global rank to the master */
    MPI_Gather(
      &scr_my_rank_world, 1, MPI_INT, NULL, 1, MPI_INT,
      0, scr_storedesc_cntl->comm
    );
  }

  /* receive our filemap from master */
  scr_hash* recv_hash = scr_hash_new();
  scr_hash_exchange(send_hash, recv_hash, scr_storedesc_cntl->comm);

  /* merge map sent from master into our map */
  scr_hash* map_from_master = scr_hash_getf(recv_hash, "%d", 0);
  if (map_from_master != NULL) {
    scr_hash_merge(my_map, map_from_master);
  }

  /* write out our local filemap */
  if (scr_filemap_num_ranks(my_map) > 0) {
    scr_filemap_write(scr_map_file, my_map);
  }

  /* free off our send and receive hashes */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  return SCR_SUCCESS;
}
Ejemplo n.º 4
0
/* remove any dataset ids from flush file which are not in cache,
 * and add any datasets in cache that are not in the flush file */
int scr_flush_file_rebuild(const scr_filemap* map)
{
  if (scr_my_rank_world == 0) {
    /* read the flush file */
    scr_hash* hash = scr_hash_new();
    scr_hash_read_path(scr_flush_file, hash);

    /* get ordered list of dataset ids in flush file */
    int flush_ndsets;
    int* flush_dsets;
    scr_hash* flush_dsets_hash = scr_hash_get(hash, SCR_FLUSH_KEY_DATASET);
    scr_hash_list_int(flush_dsets_hash, &flush_ndsets, &flush_dsets);

    /* get ordered list of dataset ids in cache */
    int cache_ndsets;
    int* cache_dsets;
    scr_filemap_list_datasets(map, &cache_ndsets, &cache_dsets);

    int flush_index = 0;
    int cache_index = 0;
    while (flush_index < flush_ndsets && cache_index < cache_ndsets) {
      /* get next smallest index from flush file and cache */
      int flush_dset = flush_dsets[flush_index];
      int cache_dset = cache_dsets[cache_index];

      if (flush_dset < cache_dset) {
        /* dataset exists in flush file but not in cache,
         * delete it from the flush file */
        scr_hash_unset_kv_int(hash, SCR_FLUSH_KEY_DATASET, flush_dset);
        flush_index++;
      } else if (cache_dset < flush_dset) {
        /* dataset exists in cache but not flush file,
         * add it to the flush file */
        scr_hash* dset_hash = scr_hash_set_kv_int(hash, SCR_FLUSH_KEY_DATASET, cache_dset);
        scr_hash_set_kv(dset_hash, SCR_FLUSH_KEY_LOCATION, SCR_FLUSH_KEY_LOCATION_CACHE);
        cache_index++;
      } else {
        /* dataset exists in cache and the flush file,
         * ensure that it is listed as being in the cache */
        scr_hash* dset_hash = scr_hash_set_kv_int(hash, SCR_FLUSH_KEY_DATASET, cache_dset);
        scr_hash_unset_kv(dset_hash, SCR_FLUSH_KEY_LOCATION, SCR_FLUSH_KEY_LOCATION_CACHE);
        scr_hash_set_kv(dset_hash, SCR_FLUSH_KEY_LOCATION, SCR_FLUSH_KEY_LOCATION_CACHE);
        flush_index++;
        cache_index++;
      }
    }
    while (flush_index < flush_ndsets) {
      /* dataset exists in flush file but not in cache,
       * delete it from the flush file */
      int flush_dset = flush_dsets[flush_index];
      scr_hash_unset_kv_int(hash, SCR_FLUSH_KEY_DATASET, flush_dset);
      flush_index++;
    }
    while (cache_index < cache_ndsets) {
      /* dataset exists in cache but not flush file,
       * add it to the flush file */
      int cache_dset = cache_dsets[cache_index];
      scr_hash* dset_hash = scr_hash_set_kv_int(hash, SCR_FLUSH_KEY_DATASET, cache_dset);
      scr_hash_set_kv(dset_hash, SCR_FLUSH_KEY_LOCATION, SCR_FLUSH_KEY_LOCATION_CACHE);
      cache_index++;
    }

    /* free our list of cache dataset ids */
    scr_free(&cache_dsets);

    /* free our list of flush file dataset ids */
    scr_free(&flush_dsets);

    /* write the hash back to the flush file */
    scr_hash_write_path(scr_flush_file, hash);

    /* delete the hash */
    scr_hash_delete(&hash);
  }
  return SCR_SUCCESS;
}
Ejemplo n.º 5
0
/* flush files specified in list, and record corresponding entries for summary file */
static int scr_flush_files_list(scr_hash* file_list, scr_hash* summary)
{
  /* assume we will succeed in this flush */
  int rc = SCR_SUCCESS;

  /* flush each of my files and fill in summary data structure */
  scr_hash_elem* elem = NULL;
  scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);
  for (elem = scr_hash_elem_first(files);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the filename */
    char* file = scr_hash_elem_key(elem);

    /* convert file to path and extract name of file */
    scr_path* path_name = scr_path_from_str(file);
    scr_path_basename(path_name);

    /* get the hash for this element */
    scr_hash* hash = scr_hash_elem_hash(elem);

    /* get meta data for this file */
    scr_meta* meta = scr_hash_get(hash, SCR_KEY_META);

    /* if segments are defined, we flush the file to its containers,
     * otherwise we copy the file out as is */
    scr_hash* segments = scr_hash_get(hash, SCR_SUMMARY_6_KEY_SEGMENT);
    if (segments != NULL) {
      /* TODO: PRESERVE get original filename here */

      /* add this file to the summary file */
      char* name = scr_path_strdup(path_name);
      scr_hash* file_hash = scr_hash_set_kv(summary, SCR_SUMMARY_6_KEY_FILE, name);
      scr_free(&name);

// USERDEF fixme!
      /* flush the file to the containers listed in its segmenets */
      if (scr_flush_file_to_containers(file, meta, segments, scr_prefix) == SCR_SUCCESS) {
        /* successfully flushed this file, record the filesize */
        unsigned long filesize = 0;
        if (scr_meta_get_filesize(meta, &filesize) == SCR_SUCCESS) {
          scr_hash_util_set_bytecount(file_hash, SCR_SUMMARY_6_KEY_SIZE, filesize);
        }

        /* record the crc32 if one was computed */
        uLong crc = 0;
        if (scr_meta_get_crc32(meta, &crc) == SCR_SUCCESS) {
          scr_hash_util_set_crc32(file_hash, SCR_SUMMARY_6_KEY_CRC, crc);
        }

        /* record segment information in summary file */
        scr_hash* segments_copy = scr_hash_new();
        scr_hash_merge(segments_copy, segments);
        scr_hash_set(file_hash, SCR_SUMMARY_6_KEY_SEGMENT, segments_copy);
      } else {
        /* the flush failed */
        rc = SCR_FAILURE;

        /* explicitly mark file as incomplete */
        scr_hash_set_kv_int(file_hash, SCR_SUMMARY_6_KEY_COMPLETE, 0);
      }
    } else {
      /* get directory to flush file to */
      char* dir;
      if (scr_hash_util_get_str(hash, SCR_KEY_PATH, &dir) == SCR_SUCCESS) {
        /* create full path of destination file */
        scr_path* path_full = scr_path_from_str(dir);
        scr_path_append(path_full, path_name);

        /* get relative path to flushed file from SCR_PREFIX directory */
        scr_path* path_relative = scr_path_relative(scr_prefix_path, path_full);
        if (! scr_path_is_null(path_relative)) {
          /* record the name of the file in the summary hash, and get reference to a hash for this file */
          char* name = scr_path_strdup(path_relative);
          scr_hash* file_hash = scr_hash_set_kv(summary, SCR_SUMMARY_6_KEY_FILE, name);
          scr_free(&name);

          /* flush the file and fill in the meta data for this file */
          if (scr_flush_a_file(file, dir, meta) == SCR_SUCCESS) {
            /* successfully flushed this file, record the filesize */
            unsigned long filesize = 0;
            if (scr_meta_get_filesize(meta, &filesize) == SCR_SUCCESS) {
              scr_hash_util_set_bytecount(file_hash, SCR_SUMMARY_6_KEY_SIZE, filesize);
            }

            /* record the crc32 if one was computed */
            uLong crc = 0;
            if (scr_meta_get_crc32(meta, &crc) == SCR_SUCCESS) {
              scr_hash_util_set_crc32(file_hash, SCR_SUMMARY_6_KEY_CRC, crc);
            }
          } else {
            /* the flush failed */
            rc = SCR_FAILURE;

            /* explicitly mark incomplete files */
            scr_hash_set_kv_int(file_hash, SCR_SUMMARY_6_KEY_COMPLETE, 0);
          }
        } else {
          scr_abort(-1, "Failed to get relative path to directory %s from %s @ %s:%d",
            dir, scr_prefix, __FILE__, __LINE__
          );
        }

        /* free relative and full paths */
        scr_path_delete(&path_relative);
        scr_path_delete(&path_full);
      } else {
        scr_abort(-1, "Failed to read directory to flush file to @ %s:%d",
          __FILE__, __LINE__
        );
      }
    }

    /* free the file name path */
    scr_path_delete(&path_name);
  }

  return rc;
}
Ejemplo n.º 6
0
/* read in halt file (which program may have changed), update internal data structure,
 * set & unset any fields, and write out halt file all while locked */
int scr_halt_sync_and_set(const char* file, struct arglist* args, scr_hash* data)
{
  /* set the mode on the file to be readable/writable by all
   * (enables a sysadmin to halt a user's job via scr_halt --all) */
  mode_t old_mode = umask(0000);

  /* TODO: sleep and try the open several times if the first fails */
  /* open the halt file for reading */
  int fd = scr_open(file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
  if (fd < 0) {
    scr_err("Opening file for write: scr_open(%s) errno=%d %m @ %s:%d",
            file, errno, __FILE__, __LINE__
    );
    /* restore the normal file mask */
    umask(old_mode);
    return SCR_FAILURE;
  }

  /* acquire an exclusive file lock before reading */
  int ret = scr_write_lock(file,fd);
  if (ret != SCR_SUCCESS){
     scr_close(file,fd);
     umask(old_mode);
     return ret;
  }

  /* read in the current data from the file */
  scr_hash_read_fd(file, fd, data);

  /* set / unset values in file */
  if (args->set_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
    scr_hash_set_kv(data, SCR_HALT_KEY_EXIT_REASON, args->value_reason);
  } else if (args->unset_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
  }

  if (args->set_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_CHECKPOINTS, args->value_checkpoints);
  } else if (args->unset_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
  }

  if (args->set_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_BEFORE, args->value_before);
  } else if (args->unset_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
  }

  if (args->set_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_AFTER, args->value_after);
  } else if (args->unset_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
  }

  if (args->set_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_SECONDS, args->value_seconds);
  } else if (args->unset_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
  }

  /* wind file pointer back to the start of the file */
  lseek(fd, 0, SEEK_SET);

  /* write our updated data */
  ssize_t bytes_written = scr_hash_write_fd(file, fd, data);

  /* truncate the file to the correct size (may be smaller than it was before) */
  if (bytes_written >= 0) {
    ftruncate(fd, (off_t) bytes_written);
  }

  /* release the file lock */
  ret = scr_unlock(file, fd);
  if (ret != SCR_SUCCESS){
     scr_close(file,fd);
     umask(old_mode);
     return ret;
  }

  /* close file */
  scr_close(file, fd);

  /* restore the normal file mask */
  umask(old_mode);

  /* write current values to halt file */
  return SCR_SUCCESS;
}
Ejemplo n.º 7
0
/* read in the summary file from dir assuming file is using version 4 format or earlier,
 * convert to version 5 hash */
static int scr_summary_read_v4_to_v5(const scr_path* dir, scr_hash* summary_hash)
{
  /* check that we have a pointer to a hash */
  if (summary_hash == NULL) {
    return SCR_FAILURE;
  }

  /* build name of summary file */
  scr_path* summary_path = scr_path_dup(dir);
  scr_path_append_str(summary_path, "scr_summary.txt");
  char* summary_file = scr_path_strdup(summary_path);
  scr_path_delete(&summary_path);

  /* check whether we can read the file before we actually try,
   * we take this step to avoid printing an error in scr_hash_read */
  if (scr_file_is_readable(summary_file) != SCR_SUCCESS) {
    scr_free(&summary_file);
    return SCR_FAILURE;
  }

  /* open the summary file */
  FILE* fs = fopen(summary_file, "r");
  if (fs == NULL) {
    scr_err("Opening summary file for read: fopen(%s, \"r\") errno=%d %s @ %s:%d",
      summary_file, errno, strerror(errno), __FILE__, __LINE__
    );
    scr_free(&summary_file);
    return SCR_FAILURE;
  }

  /* assume we have one file per rank */
  int num_records = scr_ranks_world;

  /* read the first line (all versions have at least one header line) */
  int linenum = 0;
  char line[2048];
  char field[2048];
  fgets(line, sizeof(line), fs);
  linenum++;

  /* get the summary file version number, if no number, assume version=1 */
  int version = 1;
  sscanf(line, "%s", field);
  if (strcmp(field, "Version:") == 0) {
    sscanf(line, "%s %d", field, &version);
  }

  /* all versions greater than 1, have two header lines, read and throw away the second */
  if (version > 1) {
    /* version 3 and higher writes the number of rows in the file (ranks may write 0 or more files) */
    if (version >= 3) {
      fgets(line, sizeof(line), fs);
      linenum++;
      sscanf(line, "%s %d", field, &num_records);
    }
    fgets(line, sizeof(line), fs);
    linenum++;
  }

  /* now we know how many records we'll be reading, so allocate space for them */
  if (num_records <= 0) {
    scr_err("No file records found in summary file %s, perhaps it is corrupt or incomplete @ %s:%d",
      summary_file, __FILE__, __LINE__
    );
    fclose(fs);
    scr_free(&summary_file);
    return SCR_FAILURE;
  }

  /* set the version number in the summary hash, initialize a pointer to the checkpoint hash */
  scr_hash_set_kv_int(summary_hash, SCR_SUMMARY_KEY_VERSION, SCR_SUMMARY_FILE_VERSION_5);
  scr_hash* ckpt_hash = NULL;

  /* read the record for each rank */
  int i;
  int bad_values        =  0;
  int all_complete      =  1;
  int all_ranks         = -1;
  int all_checkpoint_id = -1;
  for(i=0; i < num_records; i++) {
    int expected_n, n;
    int rank, scr, ranks, pattern, complete, match_filesize, checkpoint_id;
    char filename[SCR_MAX_FILENAME];
    unsigned long exp_filesize, filesize;
    int crc_computed = 0;
    uLong crc = 0UL;

    /* read a line from the file, parse depending on version */
    if (version == 1) {
      expected_n = 10;
      n = fscanf(fs, "%d\t%d\t%d\t%d\t%d\t%d\t%lu\t%d\t%lu\t%s\n",
                 &rank, &scr, &ranks, &pattern, &checkpoint_id, &complete,
                 &exp_filesize, &match_filesize, &filesize, filename
      );
      linenum++;
    } else {
      expected_n = 11;
      n = fscanf(fs, "%d\t%d\t%d\t%d\t%d\t%lu\t%d\t%lu\t%s\t%d\t0x%lx\n",
                 &rank, &scr, &ranks, &checkpoint_id, &complete,
                 &exp_filesize, &match_filesize, &filesize, filename,
                 &crc_computed, &crc
      );
      linenum++;
    }

    /* check the return code returned from the read */
    if (n == EOF) {
      scr_err("Early EOF in summary file %s at line %d.  Only read %d of %d expected records @ %s:%d",
        summary_file, linenum, i, num_records, __FILE__, __LINE__
      );
      fclose(fs);
      scr_hash_unset_all(summary_hash);
      scr_free(&summary_file);
      return SCR_FAILURE;
    } else if (n != expected_n) {
      scr_err("Invalid read of record %d in %s at line %d @ %s:%d",
        i, summary_file, linenum, __FILE__, __LINE__
      );
      fclose(fs);
      scr_hash_unset_all(summary_hash);
      scr_free(&summary_file);
      return SCR_FAILURE;
    }

    /* TODO: check whether all files are complete, match expected size, number of ranks, checkpoint_id, etc */
    if (rank < 0 || rank >= scr_ranks_world) {
      bad_values = 1;
      scr_err("Invalid rank detected (%d) in a job with %d tasks in %s at line %d @ %s:%d",
        rank, scr_my_rank_world, summary_file, linenum, __FILE__, __LINE__
      );
    }

    /* chop to basename of filename */
    char* base = basename(filename);

    /* set the pointer to the checkpoint hash, if we haven't already */
    if (ckpt_hash == NULL) {
      /* get a pointer to the checkpoint hash */
      ckpt_hash = scr_hash_set_kv_int(summary_hash, SCR_SUMMARY_5_KEY_CKPT, checkpoint_id);
    }

    /* get a pointer to the hash for this rank, and then to the file for this rank */
    scr_hash* rank_hash = scr_hash_set_kv_int(ckpt_hash, SCR_SUMMARY_5_KEY_RANK, rank);
    scr_hash* file_hash = scr_hash_set_kv(    rank_hash, SCR_SUMMARY_5_KEY_FILE, base);

    /* set the file size, and the crc32 value if it was computed */
    scr_hash_util_set_bytecount(file_hash, SCR_SUMMARY_5_KEY_SIZE, exp_filesize);
    if (crc_computed) {
      scr_hash_util_set_crc32(file_hash, SCR_SUMMARY_5_KEY_CRC, crc);
    }

    /* if the file is incomplete, set the incomplete field for this file */
    if (! complete) {
      all_complete = 0;
      scr_hash_set_kv_int(file_hash, SCR_SUMMARY_5_KEY_COMPLETE, 0);
    }

    /* check that the checkpoint id matches all other checkpoint ids in the file */
    if (checkpoint_id != all_checkpoint_id) {
      if (all_checkpoint_id == -1) {
        all_checkpoint_id = checkpoint_id;
      } else {
        bad_values = 1;
        scr_err("Checkpoint id %d on record %d does not match expected checkpoint id %d in %s at line %d @ %s:%d",
          checkpoint_id, i, all_checkpoint_id, summary_file, linenum, __FILE__, __LINE__
        );
      }
    }

    /* check that the number of ranks matches all the number of ranks specified by all other records in the file */
    if (ranks != all_ranks) {
      if (all_ranks == -1) {
        all_ranks = ranks;
      } else {
        bad_values = 1;
        scr_err("Number of ranks %d on record %d does not match expected number of ranks %d in %s at line %d @ %s:%d",
          ranks, i, all_ranks, summary_file, linenum, __FILE__, __LINE__
        );
      }
    }
  }

  /* we've read in all of the records, now set the values for the complete field
   * and the number of ranks field */
  if (ckpt_hash != NULL) {
    scr_hash_set_kv_int(ckpt_hash, SCR_SUMMARY_5_KEY_COMPLETE, all_complete);
    scr_hash_set_kv_int(ckpt_hash, SCR_SUMMARY_5_KEY_RANKS, all_ranks);
  }

  /* close the file */
  fclose(fs);

  /* if we found any problems while reading the file, clear the hash and return with an error */
  if (bad_values) {
    /* clear the hash, since we may have set bad values */
    scr_hash_unset_all(summary_hash);
    scr_free(&summary_file);
    return SCR_FAILURE;
  }

  /* free summary file string */
  scr_free(&summary_file);

  /* otherwise, return success */
  return SCR_SUCCESS;
}