/* Exchanges redundancy descriptors for dataset `id` so that each process
 * ends up holding its own.
 *
 * Every descriptor found in `map` for this dataset is offered to the rank
 * it is recorded under via scr_hash_exchange on scr_comm_world.  The first
 * descriptor received is stored in `map` under scr_my_rank_world, the
 * filemap is written to scr_map_file, and `red` is populated from the map.
 *
 * Returns SCR_FAILURE if any process saw a rank id outside
 * [0, scr_ranks_world) or if any process received no descriptor;
 * SCR_SUCCESS otherwise.  Calls collectives (scr_alltrue,
 * scr_hash_exchange), so all processes must enter together. */
static int scr_distribute_reddescs(scr_filemap* map, int id, scr_reddesc* red)
{
  /* descriptors this process holds, keyed by the rank they belong to */
  scr_hash* outgoing = scr_hash_new();

  /* ranks the filemap lists under this dataset */
  int owner_count = 0;
  int* owners = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &owner_count, &owners);

  /* collect whatever descriptors we hold for those ranks */
  int saw_bad_rank = 0;
  int k = 0;
  while (k < owner_count) {
    int owner = owners[k];
    k++;

    /* remember (but do not stop on) any rank outside the world */
    if (!(owner >= 0 && owner < scr_ranks_world)) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        owner, scr_ranks_world, __FILE__, __LINE__
      );
      saw_bad_rank = 1;
    }

    /* pull this rank's descriptor out of the filemap */
    scr_hash* owner_desc = scr_hash_new();
    scr_filemap_get_desc(map, id, owner, owner_desc);

    /* an empty descriptor is discarded, a populated one is attached
     * to the outgoing hash under the owner's rank */
    if (scr_hash_size(owner_desc) <= 0) {
      scr_hash_delete(&owner_desc);
    } else {
      scr_hash_setf(outgoing, owner_desc, "%d", owner);
    }
  }

  /* the rank list is no longer needed */
  scr_free(&owners);

  /* bail out everywhere if anyone hit an invalid rank */
  int all_ranks_valid = scr_alltrue(saw_bad_rank == 0);
  if (! all_ranks_valid) {
    scr_hash_delete(&outgoing);
    return SCR_FAILURE;
  }

  /* trade descriptors; incoming collects those addressed to us */
  scr_hash* incoming = scr_hash_new();
  scr_hash_exchange(outgoing, incoming, scr_comm_world);

  /* every process must have received at least one descriptor */
  int received = scr_hash_size(incoming);
  int everyone_received = scr_alltrue(received > 0);
  if (! everyone_received) {
    scr_hash_delete(&incoming);
    scr_hash_delete(&outgoing);
    scr_dbg(2, "Cannot find process that has my redundancy descriptor @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* all received copies are expected to match, so take the first */
  scr_hash* chosen = scr_hash_elem_hash(scr_hash_elem_first(incoming));

  /* persist it under our own rank */
  scr_filemap_set_desc(map, id, scr_my_rank_world, chosen);
  scr_filemap_write(scr_map_file, map);

  /* TODO: at this point, we could delete descriptors for other
   * ranks for this checkpoint */

  /* build the caller's descriptor struct from what we just stored */
  scr_reddesc_create_from_filemap(map, id, scr_my_rank_world, red);

  /* release the exchange hashes */
  scr_hash_delete(&incoming);
  scr_hash_delete(&outgoing);

  return SCR_SUCCESS;
}
/* this moves all files of the specified dataset in the cache to
 * make them accessible to new rank mapping
 *
 * map - filemap describing the files held in cache; updated and written
 *       to scr_map_file as files are renamed, sent and received
 * red - redundancy descriptor used to pick the destination directory;
 *       red->copy_type decides whether missing files are fatal here
 * id  - dataset id to distribute
 *
 * Outline:
 *   1. each process lists the ranks it holds files for and assigns each
 *      one a "round" number in which it could send those files
 *   2. the offers are exchanged; each process picks the lowest round in
 *      which someone can serve it
 *   3. each process tells its chosen source which round it picked
 *   4. rounds 0..max_rounds are executed: files are renamed locally when
 *      source and destination are the same rank, swapped with a partner
 *      otherwise, and unlinked if nobody asked for them
 *
 * Returns SCR_FAILURE if any process saw an out-of-range rank, if the
 * copy type is not SCR_COPY_XOR and some process found no source for its
 * files, or if a rename/swap failed; SCR_SUCCESS otherwise.  As the
 * closing comment notes, success only means the transfer completed, not
 * that every process has all of its files.
 *
 * This function calls collectives on scr_comm_world and must be entered
 * by all processes together. */
static int scr_distribute_files(scr_filemap* map, const scr_reddesc* red, int id)
{
  int i, round;
  int rc = SCR_SUCCESS;

  /* TODO: mark dataset as being distributed in filemap,
   * because if we fail in the middle of a distribute,
   * we can't trust the contents of the files anymore,
   * at which point it should be deleted */

  /* clean out any incomplete files before we start */
  scr_cache_clean(map);

  /* for this dataset, get list of ranks we have data for */
  int nranks = 0;
  int* ranks = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &nranks, &ranks);

  /* walk backwards through the list of ranks, and set our start index
   * to the rank which is the first rank that is equal to or higher
   * than our own rank -- when we assign round ids below, this offsetting
   * helps distribute the load */
  int start_index = 0;
  int invalid_rank_found = 0;
  for (i = nranks-1; i >= 0; i--) {
    int rank = ranks[i];

    /* pick the first rank whose rank id is equal to or higher than our own */
    if (rank >= scr_my_rank_world) {
      start_index = i;
    }

    /* while we're at it, check that the rank is within range */
    if (rank < 0 || rank >= scr_ranks_world) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        rank, scr_ranks_world, __FILE__, __LINE__
      );
      invalid_rank_found = 1;
    }
  }

  /* check that we didn't find an invalid rank on any process */
  if (! scr_alltrue(invalid_rank_found == 0)) {
    scr_free(&ranks);
    return SCR_FAILURE;
  }

  /* allocate array to record the rank we can send to in each round */
  int* have_rank_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);
  int* send_flag_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);

  /* check that we have all of the files for each rank,
   * and determine the round we can send them */
  scr_hash* send_hash = scr_hash_new();
  scr_hash* recv_hash = scr_hash_new();
  for (round = 0; round < nranks; round++) {
    /* get the rank id */
    int index = (start_index + round) % nranks;
    int rank = ranks[index];

    /* record the rank indexed by the round number */
    have_rank_by_round[round] = rank;

    /* assume we won't be sending to this rank in this round */
    send_flag_by_round[round] = 0;

    /* if we have files for this rank, specify the round we can
     * send those files in */
    if (scr_bool_have_files(map, id, rank)) {
      scr_hash_setf(send_hash, NULL, "%d %d", rank, round);
    }
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* search for the minimum round we can get our files */
  int retrieve_rank  = -1;
  int retrieve_round = -1;
  scr_hash_elem* elem = NULL;
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the rank id */
    int rank = scr_hash_elem_key_int(elem);

    /* get the round id this rank offered (named distinctly so it does
     * not shadow the function-scope round counter) */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int offered_round = atoi(round_str);

    /* record this round and rank number if it's less than the current round */
    if (offered_round < retrieve_round || retrieve_round == -1) {
      retrieve_round = offered_round;
      retrieve_rank  = rank;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  /* free off our list of ranks */
  scr_free(&ranks);

  /* for some redundancy schemes, we know at this point whether we
   * can recover all files */
  int can_get_files = (retrieve_rank != -1);
  if (red->copy_type != SCR_COPY_XOR && !scr_alltrue(can_get_files)) {
    /* print a debug message indicating which rank is missing files */
    if (! can_get_files) {
      scr_dbg(2, "Cannot find process that has my checkpoint files @ %s:%d",
        __FILE__, __LINE__
      );
    }

    /* the per-round arrays were allocated above and are not used past
     * this point on the failure path, so release them before returning */
    scr_free(&send_flag_by_round);
    scr_free(&have_rank_by_round);
    return SCR_FAILURE;
  }

  /* get the maximum retrieve round */
  int max_rounds = 0;
  MPI_Allreduce(
    &retrieve_round, &max_rounds, 1, MPI_INT, MPI_MAX, scr_comm_world
  );

  /* tell destination which round we'll take our files in */
  send_hash = scr_hash_new();
  recv_hash = scr_hash_new();
  if (retrieve_rank != -1) {
    scr_hash_setf(send_hash, NULL, "%d %d", retrieve_rank, retrieve_round);
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* determine which ranks want to fetch their files from us */
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the round id the requester picked */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int requested_round = atoi(round_str);

    /* record whether this rank wants its files from us */
    if (requested_round >= 0 && requested_round < nranks) {
      send_flag_by_round[requested_round] = 1;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  int tmp_rc = 0;

  /* run through rounds and exchange files */
  for (round = 0; round <= max_rounds; round++) {
    /* assume we don't need to send or receive any files this round */
    int send_rank = MPI_PROC_NULL;
    int recv_rank = MPI_PROC_NULL;
    int send_num  = 0;
    int recv_num  = 0;

    /* check whether I can potentially send to anyone in this round */
    if (round < nranks) {
      /* have someone's files, check whether they are asking
       * for them this round */
      if (send_flag_by_round[round]) {
        /* need to send files this round, remember to whom and how many */
        int dst_rank = have_rank_by_round[round];
        send_rank = dst_rank;
        send_num  = scr_filemap_num_files(map, id, dst_rank);
      }
    }

    /* if I'm supposed to get my files this round, set the recv_rank */
    if (retrieve_round == round) {
      recv_rank = retrieve_rank;
    }

    /* TODO: another special case is to just move files if the
     * processes are on the same node */

    /* if i'm sending to myself, just move (rename) each file */
    if (send_rank == scr_my_rank_world) {
      /* get our file list */
      int numfiles = 0;
      char** files = NULL;
      scr_filemap_list_files(map, id, send_rank, &numfiles, &files);

      /* TODO: sort files in reverse order by size */

      /* iterate over and rename each file */
      for (i=0; i < numfiles; i++) {
        /* get the current file name */
        char* file = files[i];

        /* lookup meta data for this file */
        scr_meta* meta = scr_meta_new();
        scr_filemap_get_meta(map, id, send_rank, file, meta);

        /* get the path for this file based on its type
         * and dataset id */
        char* dir = NULL;
        if (scr_meta_check_filetype(meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
          dir = scr_cache_dir_get(red, id);
        } else {
          dir = scr_cache_dir_hidden_get(red, id);
        }

        /* build the new file name */
        scr_path* path_newfile = scr_path_from_str(file);
        scr_path_basename(path_newfile);
        scr_path_prepend_str(path_newfile, dir);
        char* newfile = scr_path_strdup(path_newfile);

        /* if the new file name is different from the old name, rename it */
        if (strcmp(file, newfile) != 0) {
          /* record the new filename to our map and write it to disk
           * before touching the file itself */
          scr_filemap_add_file(map, id, send_rank, newfile);
          scr_filemap_set_meta(map, id, send_rank, newfile, meta);
          scr_filemap_write(scr_map_file, map);

          /* rename the file */
          scr_dbg(2, "Round %d: rename(%s, %s)", round, file, newfile);
          tmp_rc = rename(file, newfile);
          if (tmp_rc != 0) {
            /* TODO: to cross mount points, if tmp_rc == EXDEV,
             * open new file, copy, and delete orig */
            scr_err("Moving checkpoint file: rename(%s, %s) %s errno=%d @ %s:%d",
              file, newfile, strerror(errno), errno, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* remove the old name from the filemap and write it to disk */
          scr_filemap_remove_file(map, id, send_rank, file);
          scr_filemap_write(scr_map_file, map);
        }

        /* free the path and string */
        scr_free(&newfile);
        scr_path_delete(&path_newfile);

        /* free directory string */
        scr_free(&dir);

        /* free meta data */
        scr_meta_delete(&meta);
      }

      /* free the list of filename pointers */
      scr_free(&files);
    } else {
      /* if we have files for this round, but the corresponding
       * rank doesn't need them, delete the files */
      if (round < nranks && send_rank == MPI_PROC_NULL) {
        int dst_rank = have_rank_by_round[round];
        scr_unlink_rank(map, id, dst_rank);
      }

      /* sending to and/or receiving from another node */
      if (send_rank != MPI_PROC_NULL || recv_rank != MPI_PROC_NULL) {
        /* have someone to send to or receive from */
        int have_outgoing = 0;
        int have_incoming = 0;
        if (send_rank != MPI_PROC_NULL) {
          have_outgoing = 1;
        }
        if (recv_rank != MPI_PROC_NULL) {
          have_incoming = 1;
        }

        /* first, determine how many files I will be receiving and
         * tell how many I will be sending */
        MPI_Request request[2];
        MPI_Status  status[2];
        int num_req = 0;
        if (have_incoming) {
          MPI_Irecv(
            &recv_num, 1, MPI_INT, recv_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (have_outgoing) {
          MPI_Isend(
            &send_num, 1, MPI_INT, send_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (num_req > 0) {
          MPI_Waitall(num_req, request, status);
        }

        /* record how many files I will receive (need to distinguish
         * between 0 files and not knowing) */
        if (have_incoming) {
          scr_filemap_set_expected_files(map, id, scr_my_rank_world, recv_num);
        }

        /* turn off send or receive flags if the file count is 0,
         * nothing else to do */
        if (send_num == 0) {
          have_outgoing = 0;
          send_rank = MPI_PROC_NULL;
        }
        if (recv_num == 0) {
          have_incoming = 0;
          recv_rank = MPI_PROC_NULL;
        }

        /* TODO: since we overwrite files in place in order to avoid
         * running out of storage space, we should sort files in order
         * of descending size for the next step */

        /* get our file list for the destination */
        int numfiles = 0;
        char** files = NULL;
        if (have_outgoing) {
          scr_filemap_list_files(map, id, send_rank, &numfiles, &files);
        }

        /* while we have a file to send or receive ... */
        while (have_incoming || have_outgoing) {
          /* get the filename; send_num counts down, so this walks the
           * list from the front */
          char* file = NULL;
          scr_meta* send_meta = NULL;
          if (have_outgoing) {
            file = files[numfiles - send_num];
            send_meta = scr_meta_new();
            scr_filemap_get_meta(map, id, send_rank, file, send_meta);
          }

          /* exchange meta data so we can determine type of incoming file */
          scr_meta* recv_meta = scr_meta_new();
          scr_hash_sendrecv(send_meta, send_rank, recv_meta, recv_rank, scr_comm_world);

          /* get the path for this file based on its type and dataset id */
          char* dir = NULL;
          if (have_incoming) {
            if (scr_meta_check_filetype(recv_meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
              dir = scr_cache_dir_get(red, id);
            } else {
              dir = scr_cache_dir_hidden_get(red, id);
            }
          }

          /* exchange file names with partners,
           * building full path of incoming file */
          char file_partner[SCR_MAX_FILENAME];
          scr_swap_file_names(
            file, send_rank, file_partner, sizeof(file_partner), recv_rank,
            dir, scr_comm_world
          );

          /* free directory string */
          scr_free(&dir);

          /* free incoming meta data (we'll get this again later) */
          scr_meta_delete(&recv_meta);

          /* if we'll receive a file, record the name of our file
           * in the filemap and write it to disk */
          recv_meta = NULL;
          if (recv_rank != MPI_PROC_NULL) {
            recv_meta = scr_meta_new();
            scr_filemap_add_file(map, id, scr_my_rank_world, file_partner);
            scr_filemap_write(scr_map_file, map);
          }

          /* either sending or receiving a file this round, since we move files,
           * it will be deleted or overwritten */
          if (scr_swap_files(MOVE_FILES, file, send_meta, send_rank,
              file_partner, recv_meta, recv_rank, scr_comm_world) != SCR_SUCCESS)
          {
            scr_err("Swapping files: %s to %d, %s from %d @ %s:%d",
              file, send_rank, file_partner, recv_rank, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* if we received a file, record its meta data and decrement
           * our receive count */
          if (have_incoming) {
            /* record meta data for the file we received */
            scr_filemap_set_meta(map, id, scr_my_rank_world, file_partner, recv_meta);
            scr_meta_delete(&recv_meta);

            /* decrement receive count */
            recv_num--;
            if (recv_num == 0) {
              have_incoming = 0;
              recv_rank = MPI_PROC_NULL;
            }
          }

          /* if we sent a file, remove it from the filemap and decrement
           * our send count */
          if (have_outgoing) {
            /* remove file from the filemap */
            scr_filemap_remove_file(map, id, send_rank, file);
            scr_meta_delete(&send_meta);

            /* decrement our send count */
            send_num--;
            if (send_num == 0) {
              have_outgoing = 0;
              send_rank = MPI_PROC_NULL;
            }
          }

          /* update filemap on disk */
          scr_filemap_write(scr_map_file, map);
        }

        /* free our file list */
        scr_free(&files);
      }
    }
  }

  /* if we have more rounds than max rounds, delete the remainder of our files */
  for (round = max_rounds+1; round < nranks; round++) {
    /* have someone's files for this round, so delete them */
    int dst_rank = have_rank_by_round[round];
    scr_unlink_rank(map, id, dst_rank);
  }

  scr_free(&send_flag_by_round);
  scr_free(&have_rank_by_round);

  /* write out new filemap and free the memory resources */
  scr_filemap_write(scr_map_file, map);

  /* clean out any incomplete files */
  scr_cache_clean(map);

  /* TODO: if the exchange or redundancy rebuild failed,
   * we should also delete any *good* files we received */

  /* return whether distribute succeeded, it does not ensure we have
   * all of our files, only that the transfer completed without failure */
  return rc;
}
/* flushes data for files specified in file_list (with flow control),
 * and records status of each file in data
 *
 * Rank 0 flushes its own files first, then lets at most w other ranks
 * flush at a time: for each rank it posts a receive for that rank's
 * result and sends a start message carrying the success status seen so
 * far.  A non-root rank waits for its start message, flushes only if the
 * status it received is SCR_SUCCESS, and reports its own result back to
 * rank 0.
 *
 * Returns SCR_SUCCESS only if scr_alltrue reports that every process
 * flushed successfully, SCR_FAILURE otherwise.  Must be entered by all
 * processes in scr_comm_world. */
static int scr_flush_data(scr_hash* file_list, scr_hash* data)
{
  int flushed = SCR_SUCCESS;

  /* flow control the write among processes */
  if (scr_my_rank_world == 0) {
    /* first, flush each of my files and fill in meta data structure */
    if (scr_flush_files_list(file_list, data) != SCR_SUCCESS) {
      flushed = SCR_FAILURE;
    }

    /* now, have a sliding window of w processes write simultaneously */
    int w = scr_flush_width;
    if (w > (scr_ranks_world - 1)) {
      w = scr_ranks_world - 1;
    }

    /* a window below 1 would post no requests, after which MPI_Waitany
     * reports MPI_UNDEFINED and the loop below would index req[] with
     * it; a negative value would also turn into a huge allocation size.
     * Force at least one slot (harmless when there are no other ranks,
     * since the loop body is then never entered). */
    if (w < 1) {
      w = 1;
    }

    /* allocate MPI_Request arrays and an array of ints:
     * slots [0, w) are for the start sends, [w, 2w) for the replies */
    int* flags       = (int*)         SCR_MALLOC(2 * w * sizeof(int));
    MPI_Request* req = (MPI_Request*) SCR_MALLOC(2 * w * sizeof(MPI_Request));
    MPI_Status status;

    int i = 1;
    int outstanding = 0;
    int index = 0;
    while (i < scr_ranks_world || outstanding > 0) {
      /* issue up to w outstanding sends and receives */
      while (i < scr_ranks_world && outstanding < w) {
        /* post a receive for the response message we'll get back when rank i is done */
        MPI_Irecv(&flags[w + index], 1, MPI_INT, i, 0, scr_comm_world, &req[w + index]);

        /* post a send to tell rank i to start */
        flags[index] = flushed;
        MPI_Isend(&flags[index], 1, MPI_INT, i, 0, scr_comm_world, &req[index]);

        /* update the number of outstanding requests */
        i++;
        outstanding++;
        index++;
      }

      /* wait to hear back from any rank; index becomes the slot that
       * completed, which is also the slot reused by the next post */
      MPI_Waitany(w, &req[w], &index, &status);

      /* someone responded, the send to this rank should also be done, so complete it */
      MPI_Wait(&req[index], &status);

      /* determine whether this rank flushed its file successfully */
      if (flags[w + index] != SCR_SUCCESS) {
        flushed = SCR_FAILURE;
      }

      /* one less request outstanding now */
      outstanding--;
    }

    /* free the MPI_Request arrays */
    scr_free(&req);
    scr_free(&flags);
  } else {
    /* receive signal to start */
    int start = 0;
    MPI_Status status;
    MPI_Recv(&start, 1, MPI_INT, 0, 0, scr_comm_world, &status);

    /* flush files if we've had success so far, otherwise skip the flush
     * and return failure */
    if (start == SCR_SUCCESS) {
      /* flush each of my files and fill in meta data structure */
      if (scr_flush_files_list(file_list, data) != SCR_SUCCESS) {
        flushed = SCR_FAILURE;
      }
    } else {
      /* someone failed before we even started, so don't bother */
      flushed = SCR_FAILURE;
    }

    /* send message to rank 0 to report that we're done */
    MPI_Send(&flushed, 1, MPI_INT, 0, 0, scr_comm_world);
  }

  /* determine whether everyone wrote their files ok */
  if (scr_alltrue((flushed == SCR_SUCCESS))) {
    return SCR_SUCCESS;
  }
  return SCR_FAILURE;
}
/* Makes the dataset descriptor for dataset `id` available on every process.
 *
 * Each process scans the ranks its filemap lists for this dataset and
 * keeps the first non-empty dataset descriptor it finds.  The lowest rank
 * holding one broadcasts it over scr_comm_world; every process then stores
 * the result in `map` under scr_my_rank_world and writes the filemap to
 * scr_map_file.
 *
 * Returns SCR_FAILURE if any process saw a rank id outside
 * [0, scr_ranks_world) or if no process holds a descriptor; SCR_SUCCESS
 * otherwise.  Calls collectives, so all processes must enter together. */
static int scr_distribute_datasets(scr_filemap* map, int id)
{
  /* holds our copy of the dataset descriptor (if any), and later the
   * broadcast result */
  scr_hash* dset = scr_hash_new();

  /* ranks the filemap lists under this dataset */
  int owner_count = 0;
  int* owners = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &owner_count, &owners);

  /* scan until the first rank with a non-empty dataset descriptor */
  int saw_bad_rank = 0;
  int found = 0;
  int k;
  for (k = 0; k < owner_count && !found; k++) {
    int owner = owners[k];

    /* remember (but do not stop on) any rank outside the world */
    if (!(owner >= 0 && owner < scr_ranks_world)) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        owner, scr_ranks_world, __FILE__, __LINE__
      );
      saw_bad_rank = 1;
    }

    /* fetch the descriptor recorded for this rank */
    scr_hash* candidate = scr_hash_new();
    scr_filemap_get_dataset(map, id, owner, candidate);

    /* keep a copy of the first populated descriptor; the loop condition
     * stops the scan once one has been found */
    if (scr_hash_size(candidate) > 0) {
      found = 1;
      scr_hash_merge(dset, candidate);
    }

    /* the temporary is released either way */
    scr_hash_delete(&candidate);
  }

  /* the rank list is no longer needed */
  scr_free(&owners);

  /* bail out everywhere if anyone hit an invalid rank */
  int all_ranks_valid = scr_alltrue(saw_bad_rank == 0);
  if (! all_ranks_valid) {
    scr_hash_delete(&dset);
    return SCR_FAILURE;
  }

  /* vote for ourselves if we hold the descriptor, otherwise for a value
   * that can never win the minimum */
  int candidate_rank = found ? scr_my_rank_world : scr_ranks_world;
  int root;
  MPI_Allreduce(&candidate_rank, &root, 1, MPI_INT, MPI_MIN, scr_comm_world);

  /* nobody has the descriptor */
  if (root >= scr_ranks_world) {
    scr_hash_delete(&dset);
    return SCR_FAILURE;
  }

  /* everyone but the root clears its hash so it only carries what the
   * root broadcasts */
  if (scr_my_rank_world != root) {
    scr_hash_unset_all(dset);
  }
  scr_hash_bcast(dset, root, scr_comm_world);

  /* persist the descriptor under our own rank */
  scr_filemap_set_dataset(map, id, scr_my_rank_world, dset);
  scr_filemap_write(scr_map_file, map);

  /* TODO: at this point, we could delete descriptors for other
   * ranks for this checkpoint */

  scr_hash_delete(&dset);

  return SCR_SUCCESS;
}
/* fetch files from parallel file system
 *
 * map           - filemap to record fetched files in; written to
 *                 scr_map_file along the way
 * fetch_path    - directory to fetch the dataset from
 * dataset_id    - OUT: set to the fetched dataset id on success
 * checkpoint_id - OUT: set to the fetched checkpoint id on success
 *
 * Reads the summary file, clears any cached files for the dataset id,
 * records this rank's redundancy descriptor in the filemap, fetches the
 * files into the cache directory and applies the redundancy scheme.
 * Rank 0 additionally logs events and timing when scr_log_enable is set.
 *
 * Returns SCR_FAILURE if the summary cannot be read, the dataset or
 * checkpoint id cannot be extracted, or any process fails to fetch its
 * files; otherwise returns the result of scr_reddesc_apply.  The output
 * ids are only written when that result is SCR_SUCCESS.  Must be entered
 * by all processes in scr_comm_world. */
static int scr_fetch_files(
  scr_filemap* map,
  scr_path* fetch_path,
  int* dataset_id,
  int* checkpoint_id)
{
  /* get fetch directory as string */
  char* fetch_dir = scr_path_strdup(fetch_path);

  /* this may take a while, so tell user what we're doing */
  if (scr_my_rank_world == 0) {
    scr_dbg(1, "Attempting fetch from %s", fetch_dir);
  }

  /* make sure all processes make it this far before progressing */
  MPI_Barrier(scr_comm_world);

  /* start timer (only rank 0 reads these, but give them defined values
   * everywhere so no path can observe an indeterminate value) */
  time_t timestamp_start = 0;
  double time_start = 0.0;
  if (scr_my_rank_world == 0) {
    timestamp_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* log the fetch attempt */
  if (scr_my_rank_world == 0) {
    if (scr_log_enable) {
      time_t now = scr_log_seconds();
      scr_log_event("FETCH STARTED", fetch_dir, NULL, &now, NULL);
    }
  }

  /* allocate a new hash to get a list of files to fetch */
  scr_hash* file_list = scr_hash_new();

  /* read the summary file */
  if (scr_fetch_summary(fetch_dir, file_list) != SCR_SUCCESS) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Failed to read summary file @ %s:%d", __FILE__, __LINE__);
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff);
      }
    }
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* get a pointer to the dataset */
  scr_dataset* dataset = scr_hash_get(file_list, SCR_KEY_DATASET);

  /* get the dataset id */
  int id;
  if (scr_dataset_get_id(dataset, &id) != SCR_SUCCESS) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Invalid id in summary file @ %s:%d", __FILE__, __LINE__);
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff);
      }
    }
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* get the checkpoint id for this dataset */
  int ckpt_id;
  if (scr_dataset_get_ckpt(dataset, &ckpt_id) != SCR_SUCCESS) {
    /* eventually, we'll support reading of non-checkpoint datasets,
     * but we don't yet */
    scr_err("Failed to read checkpoint id from dataset @ %s:%d",
      __FILE__, __LINE__
    );
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* delete any existing files for this dataset id (do this before
   * filemap_read) */
  scr_cache_delete(map, id);

  /* get the redundancy descriptor for this id */
  scr_reddesc* c = scr_reddesc_for_checkpoint(ckpt_id, scr_nreddescs, scr_reddescs);

  /* store our redundancy descriptor hash in the filemap */
  scr_hash* my_desc_hash = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_filemap_set_desc(map, id, scr_my_rank_world, my_desc_hash);
  scr_hash_delete(&my_desc_hash);

  /* write the filemap out before creating the directory */
  scr_filemap_write(scr_map_file, map);

  /* create the cache directory */
  scr_cache_dir_create(c, id);

  /* get the cache directory */
  /* NOTE(review): scr_distribute_files in this file calls
   * scr_cache_dir_get(red, id) with two arguments and frees the returned
   * string, while this call passes a caller-owned buffer as a third
   * argument -- confirm which form matches the current prototype. */
  char cache_dir[SCR_MAX_FILENAME];
  scr_cache_dir_get(c, id, cache_dir);

  /* now we can finally fetch the actual files */
  int success = 1;
  if (scr_fetch_data(file_list, cache_dir, map) != SCR_SUCCESS) {
    success = 0;
  }

  /* free the hash holding the summary file data */
  scr_hash_delete(&file_list);

  /* check that all processes copied their file successfully */
  if (! scr_alltrue(success)) {
    /* someone failed, so let's delete the partial checkpoint */
    scr_cache_delete(map, id);

    if (scr_my_rank_world == 0) {
      scr_dbg(1, "One or more processes failed to read its files @ %s:%d",
        __FILE__, __LINE__
      );
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff);
      }
    }
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* apply redundancy scheme */
  double bytes_copied = 0.0;
  int rc = scr_reddesc_apply(map, c, id, &bytes_copied);
  if (rc == SCR_SUCCESS) {
    /* record dataset and checkpoint ids */
    *dataset_id = id;
    *checkpoint_id = ckpt_id;

    /* update our flush file to indicate this checkpoint is in cache
     * as well as the parallel file system */
    /* TODO: should we place SCR_FLUSH_KEY_LOCATION_PFS before
     * scr_reddesc_apply? */
    scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_CACHE);
    scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_PFS);
    scr_flush_file_location_unset(id, SCR_FLUSH_KEY_LOCATION_FLUSHING);
  } else {
    /* something went wrong, so delete this checkpoint from the cache;
     * operate on the filemap we were given, as every other call in this
     * function does, rather than on the global map */
    scr_cache_delete(map, id);
  }

  /* stop timer, compute bandwidth, and report performance */
  double total_bytes = bytes_copied;
  if (scr_my_rank_world == 0) {
    double time_end = MPI_Wtime();
    double time_diff = time_end - time_start;
    double bw = total_bytes / (1024.0 * 1024.0 * time_diff);
    scr_dbg(1, "scr_fetch_files: %f secs, %e bytes, %f MB/s, %f MB/s per proc",
      time_diff, total_bytes, bw, bw/scr_ranks_world
    );

    /* log data on the fetch to the database */
    if (scr_log_enable) {
      time_t now = scr_log_seconds();
      if (rc == SCR_SUCCESS) {
        scr_log_event("FETCH SUCCEEDED", fetch_dir, &id, &now, &time_diff);
      } else {
        scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff);
      }

      /* cache_dir was filled in above for the same descriptor and id,
       * so it is reused here instead of being looked up a second time */
      scr_log_transfer("FETCH", fetch_dir, cache_dir, &id,
        &timestamp_start, &time_diff, &total_bytes
      );
    }
  }

  /* free fetch directory string */
  scr_free(&fetch_dir);

  return rc;
}
/* fetch files specified in file_list into specified dir and update
 * filemap
 *
 * Rank 0 fetches its own files first, then lets at most w other ranks
 * read at a time: for each rank it posts a receive for that rank's
 * result and sends a start message carrying the success status seen so
 * far.  A non-root rank waits for its start message, fetches only if the
 * status it received is SCR_SUCCESS, and reports its own result back to
 * rank 0.
 *
 * Returns SCR_SUCCESS only if scr_alltrue reports that every process
 * read its files successfully, SCR_FAILURE otherwise.  Must be entered
 * by all processes in scr_comm_world. */
static int scr_fetch_data(
  const scr_hash* file_list,
  const char* dir,
  scr_filemap* map)
{
  int success = SCR_SUCCESS;

  /* flow control rate of file reads from rank 0 */
  if (scr_my_rank_world == 0) {
    /* fetch these files into the directory */
    if (scr_fetch_files_list(file_list, dir, map) != SCR_SUCCESS) {
      success = SCR_FAILURE;
    }

    /* now, have a sliding window of w processes read simultaneously */
    int w = scr_fetch_width;
    if (w > scr_ranks_world-1) {
      w = scr_ranks_world-1;
    }

    /* a window below 1 would post no requests, after which MPI_Waitany
     * reports MPI_UNDEFINED and the loop below would index req[] with
     * it; a negative value would also turn into a huge allocation size.
     * Force at least one slot (harmless when there are no other ranks,
     * since the loop body is then never entered). */
    if (w < 1) {
      w = 1;
    }

    /* allocate MPI_Request arrays and an array of ints:
     * slots [0, w) are for the start sends, [w, 2w) for the replies */
    int* flags       = (int*)         SCR_MALLOC(2 * w * sizeof(int));
    MPI_Request* req = (MPI_Request*) SCR_MALLOC(2 * w * sizeof(MPI_Request));
    MPI_Status status;

    /* execute our flow control window */
    int outstanding = 0;
    int index = 0;
    int i = 1;
    while (i < scr_ranks_world || outstanding > 0) {
      /* issue up to w outstanding sends and receives */
      while (i < scr_ranks_world && outstanding < w) {
        /* post a receive for the response message we'll get back when
         * rank i is done */
        MPI_Irecv(&flags[index + w], 1, MPI_INT, i, 0, scr_comm_world, &req[index + w]);

        /* send a start signal to this rank */
        flags[index] = success;
        MPI_Isend(&flags[index], 1, MPI_INT, i, 0, scr_comm_world, &req[index]);

        /* update the number of outstanding requests */
        outstanding++;
        index++;
        i++;
      }

      /* wait to hear back from any rank; index becomes the slot that
       * completed, which is also the slot reused by the next post */
      MPI_Waitany(w, &req[w], &index, &status);

      /* the corresponding send must be complete */
      MPI_Wait(&req[index], &status);

      /* check success code from process */
      if (flags[index + w] != SCR_SUCCESS) {
        success = SCR_FAILURE;
      }

      /* one less request outstanding now */
      outstanding--;
    }

    /* free the MPI_Request arrays */
    scr_free(&req);
    scr_free(&flags);
  } else {
    /* wait for start signal from rank 0 */
    MPI_Status status;
    MPI_Recv(&success, 1, MPI_INT, 0, 0, scr_comm_world, &status);

    /* if rank 0 hasn't seen a failure, try to read in our files */
    if (success == SCR_SUCCESS) {
      /* fetch these files into the directory */
      if (scr_fetch_files_list(file_list, dir, map) != SCR_SUCCESS) {
        success = SCR_FAILURE;
      }
    }

    /* tell rank 0 that we're done and send him our success code */
    MPI_Send(&success, 1, MPI_INT, 0, 0, scr_comm_world);
  }

  /* determine whether all processes successfully read their files */
  if (scr_alltrue(success == SCR_SUCCESS)) {
    return SCR_SUCCESS;
  }
  return SCR_FAILURE;
}
/* Reads the summary data for the dataset stored under summary_dir and
 * fills file_list with what this process needs in order to fetch.
 *
 * On success file_list holds: the summary directory under SCR_KEY_PATH,
 * a copy of the dataset hash under SCR_SUMMARY_6_KEY_DATASET, the entries
 * other ranks sent to this process from the rank2file map, and a
 * SCR_KEY_PATH entry on each item found under SCR_KEY_FILE.
 *
 * Rank 0 checks that the directory is readable and reads
 * <summary_dir>/.scr/summary.scr; the rank2file map is then read by
 * whichever ranks scr_fetch_rank2file_map marks as readers and its
 * contents are scattered with scr_hash_exchange_direction.
 *
 * Returns SCR_SUCCESS or SCR_FAILURE.  Calls collectives on
 * scr_comm_world, so all processes must enter together.  On failure
 * file_list may be partially filled. */
static int scr_fetch_summary(
  const char* summary_dir,
  scr_hash* file_list)
{
  /* stays SCR_SUCCESS until some step reports a problem */
  int rc = SCR_SUCCESS;

  /* rank 0 verifies that the summary directory can be read */
  if (scr_my_rank_world == 0 &&
      scr_file_is_readable(summary_dir) != SCR_SUCCESS)
  {
    scr_err("Failed to access summary directory %s @ %s:%d",
      summary_dir, __FILE__, __LINE__
    );
    rc = SCR_FAILURE;
  }

  /* share rank 0's verdict; nothing has been allocated yet, so a plain
   * return is enough */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    return rc;
  }

  /* remember where the dataset lives */
  scr_hash_util_set_str(file_list, SCR_KEY_PATH, summary_dir);

  /* <summary_dir> and <summary_dir>/.scr */
  scr_path* dataset_path = scr_path_from_str(summary_dir);
  scr_path* meta_path    = scr_path_dup(dataset_path);
  scr_path_append_str(meta_path, ".scr");
  scr_path_reduce(meta_path);

  /* rank 0 loads the summary file into this hash */
  scr_hash* header = scr_hash_new();
  if (scr_my_rank_world == 0) {
    /* <summary_dir>/.scr/summary.scr */
    scr_path* summary_path = scr_path_dup(meta_path);
    scr_path_append_str(summary_path, "summary.scr");
    const char* summary_file = scr_path_strdup(summary_path);

    int summary_fd = scr_open(summary_file, O_RDONLY);
    if (summary_fd < 0) {
      scr_err("Failed to open summary file %s @ %s:%d",
        summary_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* a negative size signals a read error */
      ssize_t nread = scr_hash_read_fd(summary_file, summary_fd, header);
      if (nread < 0) {
        rc = SCR_FAILURE;
      }

      /* TODO: check that the version is correct */

      scr_close(summary_file, summary_fd);
    }

    scr_free(&summary_file);
    scr_path_delete(&summary_path);
  }

  /* share rank 0's verdict on the summary file */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    goto free_paths;
  }

  /* hand the summary contents to everyone */
  scr_hash_bcast(header, 0, scr_comm_world);

  /* copy the dataset description into the file list */
  scr_hash* dataset_copy = scr_hash_new();
  scr_dataset* dataset = scr_hash_get(header, SCR_SUMMARY_6_KEY_DATASET);
  scr_hash_merge(dataset_copy, dataset);
  scr_hash_set(file_list, SCR_SUMMARY_6_KEY_DATASET, dataset_copy);

  /* <summary_dir>/.scr/rank2file.scr */
  scr_path* rank2file_path = scr_path_dup(meta_path);
  scr_path_append_str(rank2file_path, "rank2file.scr");

  /* work out which file and offset (if any) this process should read;
   * only rank 0 starts out as a reader */
  int is_reader = 0;
  char* map_file = NULL;
  unsigned long map_offset = 0;
  if (scr_my_rank_world == 0) {
    is_reader  = 1;
    map_file   = scr_path_strdup(rank2file_path);
    map_offset = 0;
  }
  if (scr_fetch_rank2file_map(dataset_path, 1, &is_reader, &map_file, &map_offset)
      != SCR_SUCCESS)
  {
    rc = SCR_FAILURE;
  }

  /* hashes used to scatter the per-rank entries */
  scr_hash* outgoing = scr_hash_new();
  scr_hash* incoming = scr_hash_new();

  /* readers load their section of the map */
  if (is_reader) {
    int map_fd = scr_open(map_file, O_RDONLY);
    if (map_fd < 0) {
      scr_err("Failed to open rank2file map %s @ %s:%d",
        map_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* read the section starting at our offset */
      scr_hash* section = scr_hash_new();
      scr_lseek(map_file, map_fd, map_offset, SEEK_SET);
      ssize_t nread = scr_hash_read_fd(map_file, map_fd, section);
      if (nread < 0) {
        scr_err("Failed to read rank2file map file %s @ %s:%d",
          map_file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* the recorded rank count has to match the current world size */
      int recorded_ranks = 0;
      scr_hash_util_get_int(section, SCR_SUMMARY_6_KEY_RANKS, &recorded_ranks);
      if (recorded_ranks != scr_ranks_world) {
        scr_err("Invalid number of ranks in %s, got %d expected %d @ %s:%d",
          map_file, recorded_ranks, scr_ranks_world, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* replace the empty outgoing hash with the rank entries taken
       * out of the section, then drop what is left of the section */
      scr_hash_delete(&outgoing);
      outgoing = scr_hash_extract(section, SCR_SUMMARY_6_KEY_RANK);
      scr_hash_delete(&section);

      scr_close(map_file, map_fd);
    }

    /* the file name is no longer needed */
    scr_free(&map_file);
  }

  /* stop everywhere if any reader had trouble */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto free_exchange;
  }

  /* scatter to groups */
  scr_hash_exchange_direction(outgoing, incoming, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT);

  /* merge what was sent to us into the file list; the key of each
   * element is the sender's rank, which is not needed here */
  scr_hash_elem* it = scr_hash_elem_first(incoming);
  while (it != NULL) {
    scr_hash* payload = scr_hash_elem_hash(it);

    /* an entry without a file hash counts as an error */
    if (scr_hash_get(payload, SCR_SUMMARY_6_KEY_FILE) == NULL) {
      rc = SCR_FAILURE;
    } else {
      /* TODO: parse summary file format */
      scr_hash_merge(file_list, payload);
    }

    it = scr_hash_elem_next(it);
  }

  /* if we're not using containers, add PATH entry for each of our files */
  if (rc == SCR_SUCCESS) {
    scr_hash* our_files = scr_hash_get(file_list, SCR_KEY_FILE);
    it = scr_hash_elem_first(our_files);
    while (it != NULL) {
      /* the key is the file name */
      char* name = scr_hash_elem_key(it);

      /* dataset directory + file name, then strip the last component
       * so only the containing directory remains */
      scr_path* full = scr_path_dup(dataset_path);
      scr_path_append_str(full, name);
      scr_path_dirname(full);
      char* parent = scr_path_strdup(full);

      /* store the directory on the file's own hash */
      scr_hash_util_set_str(scr_hash_elem_hash(it), SCR_KEY_PATH, parent);

      scr_free(&parent);
      scr_path_delete(&full);

      it = scr_hash_elem_next(it);
    }
  }

  /* final agreement on the outcome; cleanup follows either way */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
  }

free_exchange:
  /* release the exchange hashes and the rank2file path */
  scr_hash_delete(&incoming);
  scr_hash_delete(&outgoing);
  scr_path_delete(&rank2file_path);

free_paths:
  /* release the summary hash and the directory paths */
  scr_hash_delete(&header);
  scr_path_delete(&meta_path);
  scr_path_delete(&dataset_path);

  return rc;
}
static int scr_fetch_rank2file_map( const scr_path* dataset_path, int depth, int* ptr_valid, char** ptr_file, unsigned long* ptr_offset) { int rc = SCR_SUCCESS; /* get local variables so we don't have to deference everything */ int valid = *ptr_valid; char* file = *ptr_file; unsigned long offset = *ptr_offset; /* create a hash to hold section of file */ scr_hash* hash = scr_hash_new(); /* if we can read from file do it */ if (valid) { /* open file if we haven't already */ int fd = scr_open(file, O_RDONLY); if (fd >= 0) { /* read our segment from the file */ scr_lseek(file, fd, offset, SEEK_SET); ssize_t read_rc = scr_hash_read_fd(file, fd, hash); if (read_rc < 0) { scr_err("Failed to read from %s @ %s:%d", file, __FILE__, __LINE__ ); rc = SCR_FAILURE; } /* close the file */ scr_close(file, fd); } else { scr_err("Failed to open rank2file map %s @ %s:%d", file, __FILE__, __LINE__ ); rc = SCR_FAILURE; } } /* check for read errors */ if (! scr_alltrue(rc == SCR_SUCCESS)) { rc = SCR_FAILURE; goto cleanup; } /* create hashes to exchange data */ scr_hash* send = scr_hash_new(); scr_hash* recv = scr_hash_new(); /* copy rank data into send hash */ if (valid) { scr_hash* rank_hash = scr_hash_get(hash, SCR_SUMMARY_6_KEY_RANK); scr_hash_merge(send, rank_hash); } /* exchange hashes */ scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT); /* see if anyone sent us anything */ int newvalid = 0; char* newfile = NULL; unsigned long newoffset = 0; scr_hash_elem* elem = scr_hash_elem_first(recv); if (elem != NULL) { /* got something, so now we'll read in the next step */ newvalid = 1; /* get file name we should read */ scr_hash* elem_hash = scr_hash_elem_hash(elem); char* value; if (scr_hash_util_get_str(elem_hash, SCR_SUMMARY_6_KEY_FILE, &value) == SCR_SUCCESS) { /* return string of full path to file to caller */ scr_path* newpath = scr_path_dup(dataset_path); scr_path_append_str(newpath, value); newfile = scr_path_strdup(newpath); 
scr_path_delete(&newpath); } else { rc = SCR_FAILURE; } /* get offset we should start reading from */ if (scr_hash_util_get_bytecount(elem_hash, SCR_SUMMARY_6_KEY_OFFSET, &newoffset) != SCR_SUCCESS) { rc = SCR_FAILURE; } } /* free the send and receive hashes */ scr_hash_delete(&recv); scr_hash_delete(&send); /* get level id, and broadcast it from rank 0, * which we assume to be a reader in all steps */ int level_id = -1; if (valid) { if (scr_hash_util_get_int(hash, SCR_SUMMARY_6_KEY_LEVEL, &level_id) != SCR_SUCCESS) { rc = SCR_FAILURE; } } MPI_Bcast(&level_id, 1, MPI_INT, 0, scr_comm_world); /* check for read errors */ if (! scr_alltrue(rc == SCR_SUCCESS)) { rc = SCR_FAILURE; goto cleanup; } /* set parameters for output or next iteration, * we already took care of updating ptr_fd earlier */ if (valid) { scr_free(ptr_file); } *ptr_valid = newvalid; *ptr_file = newfile; *ptr_offset = newoffset; /* recurse if we still have levels to read */ if (level_id > 1) { rc = scr_fetch_rank2file_map(dataset_path, depth+1, ptr_valid, ptr_file, ptr_offset); } cleanup: /* free the hash */ scr_hash_delete(&hash); return rc; }
/* apply redundancy scheme to file and return number of bytes copied
 * in bytes parameter
 *
 * This routine calls scr_alltrue and MPI_Allreduce on scr_comm_world,
 * so all processes must call it together.
 *
 *   map   - filemap listing this process's files for the dataset; it is
 *           updated with the expected file count and written to
 *           scr_map_file
 *   c     - redundancy descriptor; c->copy_type selects the scheme
 *   id    - dataset id
 *   bytes - OUT: set to 0 on entry; after the copy it holds the sum over
 *           all processes of the sizes of their files for this dataset
 *
 * Returns SCR_SUCCESS if every process had valid files and succeeded in
 * applying the scheme, SCR_FAILURE otherwise. If any process has an
 * invalid file the routine returns before applying the scheme and
 * *bytes stays 0. */
int scr_reddesc_apply(
  scr_filemap* map,
  const scr_reddesc* c,
  int id,
  double* bytes)
{
  /* initialize to 0 */
  *bytes = 0.0;

  /* step through each of my files for the specified dataset
   * to scan for any incomplete files */
  int valid = 1;
  double my_bytes = 0.0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    char* file = scr_hash_elem_key(file_elem);

    /* check the file */
    if (! scr_bool_have_file(map, id, scr_my_rank_world, file, scr_ranks_world)) {
      scr_dbg(2, "File determined to be invalid: %s", file);
      valid = 0;
    }

    /* add up the number of bytes on our way through */
    my_bytes += (double) scr_file_size(file);

    /* if crc_on_copy is set, compute crc and update meta file
     * (PARTNER does this during the copy) */
    if (scr_crc_on_copy && c->copy_type != SCR_COPY_PARTNER) {
      scr_compute_crc(map, id, scr_my_rank_world, file);
    }
  }

  /* determine whether everyone's files are good */
  int all_valid = scr_alltrue(valid);
  if (! all_valid) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Exiting copy since one or more checkpoint files is invalid");
    }
    return SCR_FAILURE;
  }

  /* start timer (only rank 0 records and later reports timing;
   * initialize on all ranks so the values are never indeterminate) */
  time_t timestamp_start = 0;
  double time_start = 0.0;
  if (scr_my_rank_world == 0) {
    timestamp_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* apply the redundancy scheme */
  int rc = SCR_FAILURE;
  switch (c->copy_type) {
  case SCR_COPY_SINGLE:
    /* no redundancy data to build for SINGLE */
    rc = SCR_SUCCESS;
    break;
  case SCR_COPY_PARTNER:
    rc = scr_reddesc_apply_partner(map, c, id);
    break;
  case SCR_COPY_XOR:
    rc = scr_reddesc_apply_xor(map, c, id);
    break;
  default:
    /* unrecognized copy type, rc stays SCR_FAILURE */
    break;
  }

  /* record the number of files this task wrote during this dataset
   * (need to remember when a task writes 0 files) */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  scr_filemap_set_expected_files(map, id, scr_my_rank_world, num_files);
  scr_filemap_write(scr_map_file, map);

  /* determine whether everyone succeeded in their copy */
  int valid_copy = (rc == SCR_SUCCESS);
  if (! valid_copy) {
    scr_err("scr_copy_files failed with return code %d @ %s:%d",
      rc, __FILE__, __LINE__
    );
  }
  int all_valid_copy = scr_alltrue(valid_copy);
  rc = all_valid_copy ? SCR_SUCCESS : SCR_FAILURE;

  /* add up total number of bytes */
  MPI_Allreduce(&my_bytes, bytes, 1, MPI_DOUBLE, MPI_SUM, scr_comm_world);

  /* stop timer and report performance info */
  if (scr_my_rank_world == 0) {
    double time_end = MPI_Wtime();
    double time_diff = time_end - time_start;

    /* avoid dividing by zero if the timer shows no elapsed time */
    double bw = 0.0;
    if (time_diff > 0.0) {
      bw = *bytes / (1024.0 * 1024.0 * time_diff);
    }
    scr_dbg(1, "scr_reddesc_apply: %f secs, %e bytes, %f MB/s, %f MB/s per proc",
      time_diff, *bytes, bw, bw/scr_ranks_world
    );

    /* log data on the copy in the database */
    if (scr_log_enable) {
      char* dir = scr_cache_dir_get(c, id);
      scr_log_transfer("COPY", c->base, dir, &id, &timestamp_start, &time_diff, bytes);
      scr_free(&dir);
    }
  }

  return rc;
}