/* allocates a new hash element whose key and hash pointers both start
 * out as NULL, and returns it to the caller */
static scr_hash_elem* scr_hash_elem_new()
{
  scr_hash_elem* new_elem = (scr_hash_elem*) SCR_MALLOC(sizeof(scr_hash_elem));

  /* a fresh element carries neither a key nor a child hash */
  new_elem->hash = NULL;
  new_elem->key  = NULL;

  return new_elem;
}
/* this moves all files of the specified dataset in the cache to
 * make them accessible to new rank mapping
 *
 *   map - filemap describing the files held in cache; it is updated as
 *         files are renamed, sent, received, and deleted, and it is
 *         written to scr_map_file along the way
 *   red - redundancy descriptor; its copy_type decides whether missing
 *         files are fatal at this stage, and it is passed on when looking
 *         up the cache directories files are moved into
 *   id  - id of the dataset to distribute
 *
 * returns SCR_FAILURE if an invalid rank id is found in the filemap on
 * any process, if the scheme is not XOR and some process cannot locate
 * its files, or if a rename or file swap fails; returns SCR_SUCCESS
 * otherwise */
static int scr_distribute_files(scr_filemap* map, const scr_reddesc* red, int id)
{
  int i, round;
  int rc = SCR_SUCCESS;

  /* TODO: mark dataset as being distributed in filemap,
   * because if we fail in the middle of a distribute,
   * we can't trust the contents of the files anymore,
   * at which point it should be deleted */

  /* clean out any incomplete files before we start */
  scr_cache_clean(map);

  /* for this dataset, get list of ranks we have data for */
  int nranks = 0;
  int* ranks = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &nranks, &ranks);

  /* walk backwards through the list of ranks, and set our start index
   * to the rank which is the first rank that is equal to or higher
   * than our own rank -- when we assign round ids below, this offsetting
   * helps distribute the load */
  int start_index = 0;
  int invalid_rank_found = 0;
  for (i = nranks-1; i >= 0; i--) {
    int rank = ranks[i];

    /* pick the first rank whose rank id is equal to or higher than our own */
    if (rank >= scr_my_rank_world) {
      start_index = i;
    }

    /* while we're at it, check that the rank is within range */
    if (rank < 0 || rank >= scr_ranks_world) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        rank, scr_ranks_world, __FILE__, __LINE__
      );
      invalid_rank_found = 1;
    }
  }

  /* check that we didn't find an invalid rank on any process */
  if (! scr_alltrue(invalid_rank_found == 0)) {
    scr_free(&ranks);
    return SCR_FAILURE;
  }

  /* allocate array to record the rank we can send to in each round */
  int* have_rank_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);
  int* send_flag_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);

  /* check that we have all of the files for each rank,
   * and determine the round we can send them */
  scr_hash* send_hash = scr_hash_new();
  scr_hash* recv_hash = scr_hash_new();
  for (round = 0; round < nranks; round++) {
    /* get the rank id */
    int index = (start_index + round) % nranks;
    int rank = ranks[index];

    /* record the rank indexed by the round number */
    have_rank_by_round[round] = rank;

    /* assume we won't be sending to this rank in this round */
    send_flag_by_round[round] = 0;

    /* if we have files for this rank, specify the round we can
     * send those files in */
    if (scr_bool_have_files(map, id, rank)) {
      scr_hash_setf(send_hash, NULL, "%d %d", rank, round);
    }
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* search for the minimum round we can get our files */
  int retrieve_rank  = -1;
  int retrieve_round = -1;
  scr_hash_elem* elem = NULL;
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the rank id */
    int rank = scr_hash_elem_key_int(elem);

    /* get the round id */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int round = atoi(round_str);

    /* record this round and rank number if it's less than the current round */
    if (round < retrieve_round || retrieve_round == -1) {
      retrieve_round = round;
      retrieve_rank  = rank;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  /* free off our list of ranks */
  scr_free(&ranks);

  /* for some redundancy schemes, we know at this point whether we
   * can recover all files */
  int can_get_files = (retrieve_rank != -1);
  if (red->copy_type != SCR_COPY_XOR && !scr_alltrue(can_get_files)) {
    /* print a debug message indicating which rank is missing files */
    if (! can_get_files) {
      scr_dbg(2, "Cannot find process that has my checkpoint files @ %s:%d",
        __FILE__, __LINE__
      );
    }

    /* the per-round arrays are not needed past this point,
     * release them so this early exit does not leak them */
    scr_free(&send_flag_by_round);
    scr_free(&have_rank_by_round);

    return SCR_FAILURE;
  }

  /* get the maximum retrieve round */
  int max_rounds = 0;
  MPI_Allreduce(
    &retrieve_round, &max_rounds, 1, MPI_INT, MPI_MAX, scr_comm_world
  );

  /* tell destination which round we'll take our files in */
  send_hash = scr_hash_new();
  recv_hash = scr_hash_new();
  if (retrieve_rank != -1) {
    scr_hash_setf(send_hash, NULL, "%d %d", retrieve_rank, retrieve_round);
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* determine which ranks want to fetch their files from us */
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the round id */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int round = atoi(round_str);

    /* record whether this rank wants its files from us */
    if (round >= 0 && round < nranks) {
      send_flag_by_round[round] = 1;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  int tmp_rc = 0;

  /* run through rounds and exchange files */
  for (round = 0; round <= max_rounds; round++) {
    /* assume we don't need to send or receive any files this round */
    int send_rank = MPI_PROC_NULL;
    int recv_rank = MPI_PROC_NULL;
    int send_num  = 0;
    int recv_num  = 0;

    /* check whether I can potentially send to anyone in this round */
    if (round < nranks) {
      /* have someone's files, check whether they are asking
       * for them this round */
      if (send_flag_by_round[round]) {
        /* need to send files this round, remember to whom and how many */
        int dst_rank = have_rank_by_round[round];
        send_rank = dst_rank;
        send_num  = scr_filemap_num_files(map, id, dst_rank);
      }
    }

    /* if I'm supposed to get my files this round, set the recv_rank */
    if (retrieve_round == round) {
      recv_rank = retrieve_rank;
    }

    /* TODO: another special case is to just move files if the
     * processes are on the same node */

    /* if i'm sending to myself, just move (rename) each file */
    if (send_rank == scr_my_rank_world) {
      /* get our file list */
      int numfiles = 0;
      char** files = NULL;
      scr_filemap_list_files(map, id, send_rank, &numfiles, &files);

      /* TODO: sort files in reverse order by size */

      /* iterate over and rename each file */
      for (i=0; i < numfiles; i++) {
        /* get the current file name */
        char* file = files[i];

        /* lookup meta data for this file */
        scr_meta* meta = scr_meta_new();
        scr_filemap_get_meta(map, id, send_rank, file, meta);

        /* get the path for this file based on its type
         * and dataset id */
        char* dir = NULL;
        if (scr_meta_check_filetype(meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
          dir = scr_cache_dir_get(red, id);
        } else {
          dir = scr_cache_dir_hidden_get(red, id);
        }

        /* build the new file name */
        scr_path* path_newfile = scr_path_from_str(file);
        scr_path_basename(path_newfile);
        scr_path_prepend_str(path_newfile, dir);
        char* newfile = scr_path_strdup(path_newfile);

        /* if the new file name is different from the old name, rename it */
        if (strcmp(file, newfile) != 0) {
          /* record the new filename to our map and write it to disk */
          scr_filemap_add_file(map, id, send_rank, newfile);
          scr_filemap_set_meta(map, id, send_rank, newfile, meta);
          scr_filemap_write(scr_map_file, map);

          /* rename the file */
          scr_dbg(2, "Round %d: rename(%s, %s)", round, file, newfile);
          tmp_rc = rename(file, newfile);
          if (tmp_rc != 0) {
            /* TODO: to cross mount points, if tmp_rc == EXDEV,
             * open new file, copy, and delete orig */
            scr_err("Moving checkpoint file: rename(%s, %s) %s errno=%d @ %s:%d",
              file, newfile, strerror(errno), errno, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* remove the old name from the filemap and write it to disk */
          scr_filemap_remove_file(map, id, send_rank, file);
          scr_filemap_write(scr_map_file, map);
        }

        /* free the path and string */
        scr_free(&newfile);
        scr_path_delete(&path_newfile);

        /* free directory string */
        scr_free(&dir);

        /* free meta data */
        scr_meta_delete(&meta);
      }

      /* free the list of filename pointers */
      scr_free(&files);
    } else {
      /* if we have files for this round, but the corresponding
       * rank doesn't need them, delete the files */
      if (round < nranks && send_rank == MPI_PROC_NULL) {
        int dst_rank = have_rank_by_round[round];
        scr_unlink_rank(map, id, dst_rank);
      }

      /* sending to and/or receiving from another node */
      if (send_rank != MPI_PROC_NULL || recv_rank != MPI_PROC_NULL) {
        /* have someone to send to or receive from */
        int have_outgoing = 0;
        int have_incoming = 0;
        if (send_rank != MPI_PROC_NULL) {
          have_outgoing = 1;
        }
        if (recv_rank != MPI_PROC_NULL) {
          have_incoming = 1;
        }

        /* first, determine how many files I will be receiving and
         * tell how many I will be sending */
        MPI_Request request[2];
        MPI_Status  status[2];
        int num_req = 0;
        if (have_incoming) {
          MPI_Irecv(
            &recv_num, 1, MPI_INT, recv_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (have_outgoing) {
          MPI_Isend(
            &send_num, 1, MPI_INT, send_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (num_req > 0) {
          MPI_Waitall(num_req, request, status);
        }

        /* record how many files I will receive (need to distinguish
         * between 0 files and not knowing) */
        if (have_incoming) {
          scr_filemap_set_expected_files(map, id, scr_my_rank_world, recv_num);
        }

        /* turn off send or receive flags if the file count is 0,
         * nothing else to do */
        if (send_num == 0) {
          have_outgoing = 0;
          send_rank = MPI_PROC_NULL;
        }
        if (recv_num == 0) {
          have_incoming = 0;
          recv_rank = MPI_PROC_NULL;
        }

        /* TODO: since we overwrite files in place in order to avoid
         * running out of storage space, we should sort files in order
         * of descending size for the next step */

        /* get our file list for the destination */
        int numfiles = 0;
        char** files = NULL;
        if (have_outgoing) {
          scr_filemap_list_files(map, id, send_rank, &numfiles, &files);
        }

        /* while we have a file to send or receive ... */
        while (have_incoming || have_outgoing) {
          /* get the filename */
          char* file = NULL;
          scr_meta* send_meta = NULL;
          if (have_outgoing) {
            /* send_num counts down, so this walks the list front to back */
            file = files[numfiles - send_num];
            send_meta = scr_meta_new();
            scr_filemap_get_meta(map, id, send_rank, file, send_meta);
          }

          /* exchange meta data so we can determine type of incoming file */
          scr_meta* recv_meta = scr_meta_new();
          scr_hash_sendrecv(send_meta, send_rank, recv_meta, recv_rank, scr_comm_world);

          /* get the path for this file based on its type and dataset id */
          char* dir = NULL;
          if (have_incoming) {
            if (scr_meta_check_filetype(recv_meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
              dir = scr_cache_dir_get(red, id);
            } else {
              dir = scr_cache_dir_hidden_get(red, id);
            }
          }

          /* exchange file names with partners,
           * building full path of incoming file */
          char file_partner[SCR_MAX_FILENAME];
          scr_swap_file_names(
            file, send_rank, file_partner, sizeof(file_partner), recv_rank,
            dir, scr_comm_world
          );

          /* free directory string */
          scr_free(&dir);

          /* free incoming meta data (we'll get this again later) */
          scr_meta_delete(&recv_meta);

          /* if we'll receive a file, record the name of our file
           * in the filemap and write it to disk */
          recv_meta = NULL;
          if (recv_rank != MPI_PROC_NULL) {
            recv_meta = scr_meta_new();
            scr_filemap_add_file(map, id, scr_my_rank_world, file_partner);
            scr_filemap_write(scr_map_file, map);
          }

          /* either sending or receiving a file this round, since we move files,
           * it will be deleted or overwritten */
          if (scr_swap_files(MOVE_FILES, file, send_meta, send_rank,
              file_partner, recv_meta, recv_rank, scr_comm_world) != SCR_SUCCESS)
          {
            scr_err("Swapping files: %s to %d, %s from %d @ %s:%d",
              file, send_rank, file_partner, recv_rank, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* if we received a file, record its meta data and decrement
           * our receive count */
          if (have_incoming) {
            /* record meta data for the file we received */
            scr_filemap_set_meta(map, id, scr_my_rank_world, file_partner, recv_meta);
            scr_meta_delete(&recv_meta);

            /* decrement receive count */
            recv_num--;
            if (recv_num == 0) {
              have_incoming = 0;
              recv_rank = MPI_PROC_NULL;
            }
          }

          /* if we sent a file, remove it from the filemap and decrement
           * our send count */
          if (have_outgoing) {
            /* remove file from the filemap */
            scr_filemap_remove_file(map, id, send_rank, file);
            scr_meta_delete(&send_meta);

            /* decrement our send count */
            send_num--;
            if (send_num == 0) {
              have_outgoing = 0;
              send_rank = MPI_PROC_NULL;
            }
          }

          /* update filemap on disk */
          scr_filemap_write(scr_map_file, map);
        }

        /* free our file list */
        scr_free(&files);
      }
    }
  }

  /* if we have more rounds than max rounds, delete the remainder of our files */
  for (round = max_rounds+1; round < nranks; round++) {
    /* have someone's files for this round, so delete them */
    int dst_rank = have_rank_by_round[round];
    scr_unlink_rank(map, id, dst_rank);
  }

  scr_free(&send_flag_by_round);
  scr_free(&have_rank_by_round);

  /* write out new filemap and free the memory resources */
  scr_filemap_write(scr_map_file, map);

  /* clean out any incomplete files */
  scr_cache_clean(map);

  /* TODO: if the exchange or redundancy rebuild failed,
   * we should also delete any *good* files we received */

  /* return whether distribute succeeded, it does not ensure we have
   * all of our files, only that the transfer completed without failure */
  return rc;
}
/* since on a restart we may end up with more or fewer ranks on a node than the
 * previous run, rely on the master to read in and distribute the filemap to
 * other ranks on the node
 *
 *   my_map - filemap of the calling process; whatever the master sends to
 *            this process is merged into it, and it is written to
 *            scr_map_file if it then lists at least one rank
 *
 * every process in scr_storedesc_cntl->comm must call this function, since
 * it takes part in a gather and a hash exchange on that communicator
 *
 * always returns SCR_SUCCESS */
int scr_scatter_filemaps(scr_filemap* my_map)
{
  /* TODO: if the control directory is on a device shared by lots of procs,
   * we should read and distribute this data in a more scalable way */

  /* allocate empty send hash */
  scr_hash* send_hash = scr_hash_new();

  /* if i'm the master on this node, read in all filemaps */
  if (scr_storedesc_cntl->rank == 0) {
    /* create an empty filemap */
    scr_filemap* all_map = scr_filemap_new();

    /* read in the master map */
    scr_hash* hash = scr_hash_new();
    scr_hash_read_path(scr_master_map_file, hash);

    /* for each filemap listed in the master map */
    scr_hash_elem* elem;
    for (elem = scr_hash_elem_first(scr_hash_get(hash, "Filemap"));
         elem != NULL;
         elem = scr_hash_elem_next(elem))
    {
      /* get the filename of this filemap */
      char* file = scr_hash_elem_key(elem);

      /* TODO MEMFS: mount storage for each filemap */

      /* read in the filemap */
      scr_filemap* tmp_map = scr_filemap_new();
      scr_path* path_file = scr_path_from_str(file);
      scr_filemap_read(path_file, tmp_map);
      scr_path_delete(&path_file);

      /* merge it with the all_map */
      scr_filemap_merge(all_map, tmp_map);

      /* delete filemap */
      scr_filemap_delete(&tmp_map);

      /* TODO: note that if we fail after unlinking this file but before
       * writing out the new file, we'll lose information */

      /* delete the file */
      scr_file_unlink(file);
    }

    /* free the hash object */
    scr_hash_delete(&hash);

    /* write out new local 0 filemap */
    if (scr_filemap_num_ranks(all_map) > 0) {
      scr_filemap_write(scr_map_file, all_map);
    }

    /* get global rank of each rank */
    int* ranks = (int*) SCR_MALLOC(scr_storedesc_cntl->ranks * sizeof(int));
    MPI_Gather(
      &scr_my_rank_world, 1, MPI_INT, ranks, 1, MPI_INT,
      0, scr_storedesc_cntl->comm
    );

    /* for each rank, send them their own file data if we have it */
    int i;
    for (i=0; i < scr_storedesc_cntl->ranks; i++) {
      int rank = ranks[i];
      if (scr_filemap_have_rank(all_map, rank)) {
        /* extract the filemap for this rank */
        scr_filemap* tmp_map = scr_filemap_extract_rank(all_map, rank);

        /* get a reference to the hash object that we'll send to this rank,
         * and merge this filemap into it */
        scr_hash* tmp_hash = scr_hash_getf(send_hash, "%d", i);
        if (tmp_hash == NULL) {
          /* if we don't find an existing entry in the send_hash,
           * create an empty hash and insert it */
          scr_hash* empty_hash = scr_hash_new();
          scr_hash_setf(send_hash, empty_hash, "%d", i);
          tmp_hash = empty_hash;
        }
        scr_hash_merge(tmp_hash, tmp_map);

        /* delete the filemap for this rank */
        scr_filemap_delete(&tmp_map);
      }
    }

    /* free our rank list */
    scr_free(&ranks);

    /* now just round robin the remainder across the set (load balancing),
     * start the count at zero so the loop below is well defined even if
     * the list call leaves it untouched */
    int num = 0;
    int* remaining_ranks = NULL;
    scr_filemap_list_ranks(all_map, &num, &remaining_ranks);

    int j = 0;
    while (j < num) {
      /* pick a rank in to send to */
      i = j % scr_storedesc_cntl->ranks;

      /* extract the filemap for this rank */
      scr_filemap* tmp_map = scr_filemap_extract_rank(all_map, remaining_ranks[j]);

      /* get a reference to the hash object that we'll send to this rank,
       * and merge this filemap into it */
      scr_hash* tmp_hash = scr_hash_getf(send_hash, "%d", i);
      if (tmp_hash == NULL) {
        /* if we don't find an existing entry in the send_hash,
         * create an empty hash and insert it */
        scr_hash* empty_hash = scr_hash_new();
        scr_hash_setf(send_hash, empty_hash, "%d", i);
        tmp_hash = empty_hash;
      }
      scr_hash_merge(tmp_hash, tmp_map);

      /* delete the filemap for this rank */
      scr_filemap_delete(&tmp_map);
      j++;
    }

    scr_free(&remaining_ranks);

    /* delete the filemap */
    scr_filemap_delete(&all_map);

    /* write out the new master filemap */
    hash = scr_hash_new();
    char file[SCR_MAX_FILENAME];
    for (i=0; i < scr_storedesc_cntl->ranks; i++) {
      /* bound the write to the buffer; a truncated name would point the
       * master map at the wrong file, and we cannot simply return here
       * because the other ranks are waiting in the exchange below */
      int len = snprintf(file, sizeof(file), "%s/filemap_%d.scrinfo", scr_cntl_prefix, i);
      if (len < 0 || (size_t) len >= sizeof(file)) {
        scr_abort(-1, "Filemap name for rank %d does not fit in %lu bytes @ %s:%d",
          i, (unsigned long) sizeof(file), __FILE__, __LINE__
        );
      }
      scr_hash_set_kv(hash, "Filemap", file);
    }
    scr_hash_write_path(scr_master_map_file, hash);
    scr_hash_delete(&hash);
  } else {
    /* send our global rank to the master */
    MPI_Gather(
      &scr_my_rank_world, 1, MPI_INT, NULL, 1, MPI_INT,
      0, scr_storedesc_cntl->comm
    );
  }

  /* receive our filemap from master */
  scr_hash* recv_hash = scr_hash_new();
  scr_hash_exchange(send_hash, recv_hash, scr_storedesc_cntl->comm);

  /* merge map sent from master into our map */
  scr_hash* map_from_master = scr_hash_getf(recv_hash, "%d", 0);
  if (map_from_master != NULL) {
    scr_hash_merge(my_map, map_from_master);
  }

  /* write out our local filemap */
  if (scr_filemap_num_ranks(my_map) > 0) {
    scr_filemap_write(scr_map_file, my_map);
  }

  /* free off our send and receive hashes */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  return SCR_SUCCESS;
}
/* given a filename, its meta data, its list of segments, and list of destination containers,
 * copy file to container files
 *
 *   file     - path of the source file to read, must be a non-empty string
 *   meta     - meta data of the source file; when scr_crc_on_flush is set,
 *              the CRC32 computed during the copy is checked against the
 *              value stored here, or stored here if none is set yet
 *   segments - hash of segments, sorted here in ascending order by integer
 *              key; each entry supplies the container name, the offset
 *              into the container, and the segment length
 *   dst_dir  - directory that container names are appended to
 *
 * returns SCR_SUCCESS if every segment was copied (and the CRC32, when
 * checked, matched), SCR_FAILURE otherwise; copying stops at the first
 * segment that fails */
static int scr_flush_file_to_containers(
  const char* file,
  scr_meta* meta,
  scr_hash* segments,
  const char* dst_dir)
{
  /* check that we got something for a source file */
  if (file == NULL || strcmp(file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that our other arguments are valid */
  if (meta == NULL || segments == NULL) {
    scr_err("Invalid metadata or segments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open the file for reading */
  int fd_src = scr_open(file, O_RDONLY);
  if (fd_src < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

#if !defined(__APPLE__)
  /* TODO:
  posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
  that tells the kernel that you don't ever need the pages
  from the file again, and it won't bother keeping them in the page cache.
  */
  /* NOTE(review): POSIX defines the advice argument as one of a set of
   * distinct constants rather than a bit mask, so OR-ing two of them may be
   * rejected or be read as a different advice -- confirm and consider
   * issuing separate calls */
  posix_fadvise(fd_src, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

  /* get the buffer size we'll use to write to the file */
  unsigned long buf_size = scr_file_buf_size;

  /* allocate buffer to read in file chunks */
  char* buf = (char*) SCR_MALLOC(buf_size);

  /* initialize crc value */
  uLong crc = 0;
  if (scr_crc_on_flush) {
    crc = crc32(0L, Z_NULL, 0);
  }

  int rc = SCR_SUCCESS;

  /* write out each segment */
  scr_hash_sort_int(segments, SCR_HASH_SORT_ASCENDING);
  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(segments);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the container info for this segment */
    scr_hash* hash = scr_hash_elem_hash(elem);

    /* get the offset into the container and the length of the segment (both in bytes) */
    char* container_name;
    unsigned long container_offset, segment_length;
    if (scr_container_get_name_offset_length(hash,
        &container_name, &container_offset, &segment_length) != SCR_SUCCESS)
    {
      scr_err("Failed to get segment offset and length @ %s:%d",
        __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

    /* build full name to destination file */
    scr_path* dst_path = scr_path_from_str(dst_dir);
    scr_path_append_str(dst_path, container_name);
    scr_path_reduce(dst_path);
    char* dst_file = scr_path_strdup(dst_path);

    /* open container file for writing -- we don't truncate here because more than one
     * process may be writing to the same file */
    int fd_container = scr_open(dst_file, O_WRONLY);
    if (fd_container < 0) {
      scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
        dst_file, errno, strerror(errno), __FILE__, __LINE__
      );
      rc = SCR_FAILURE;

      /* release the name and path we built before leaving the loop */
      scr_free(&dst_file);
      scr_path_delete(&dst_path);
      break;
    }

#if !defined(__APPLE__)
    /* TODO:
    posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
    that tells the kernel that you don't ever need the pages
    from the file again, and it won't bother keeping them in the page cache.
    */
    /* NOTE(review): same concern as for the source file above about
     * OR-ing advice constants -- confirm */
    posix_fadvise(fd_container, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

    /* seek to offset within container */
    off_t pos = (off_t) container_offset;
    if (lseek(fd_container, pos, SEEK_SET) == (off_t)-1) {
      /* our seek failed, return an error; print the unsigned long offset
       * so the argument matches the %lu conversion */
      scr_err("Failed to seek to byte %lu in %s @ %s:%d",
        container_offset, dst_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;

      /* close the container and release the name and path
       * before leaving the loop */
      scr_close(dst_file, fd_container);
      scr_free(&dst_file);
      scr_path_delete(&dst_path);
      break;
    }

    /* copy data from file into container in chunks */
    unsigned long remaining = segment_length;
    while (remaining > 0) {
      /* read / write up to buf_size bytes at a time from file */
      unsigned long count = remaining;
      if (count > buf_size) {
        count = buf_size;
      }

      /* attempt to read buf_size bytes from file */
      int nread = scr_read_attempt(file, fd_src, buf, count);

      /* check for a read error, stop copying and return an error */
      if (nread < 0) {
        /* read had a problem, stop copying and return an error */
        rc = SCR_FAILURE;
        break;
      }

      /* if we read some bytes, write them out */
      if (nread > 0) {
        /* optionally compute crc value as we go */
        if (scr_crc_on_flush) {
          crc = crc32(crc, (const Bytef*) buf, (uInt) nread);
        }

        /* write our nread bytes out */
        int nwrite = scr_write_attempt(dst_file, fd_container, buf, nread);

        /* check for a write error or a short write */
        if (nwrite != nread) {
          /* write had a problem, stop copying and return an error */
          rc = SCR_FAILURE;
          break;
        }

        /* subtract the bytes we've processed from the number remaining */
        remaining -= (unsigned long) nread;
      }

      /* assume a short read is an error */
      if ((unsigned long) nread < count) {
        /* read had a problem, stop copying and return an error */
        rc = SCR_FAILURE;
        break;
      }
    }

    /* close container */
    if (scr_close(dst_file, fd_container) != SCR_SUCCESS) {
      rc = SCR_FAILURE;
    }

    /* free the container file name and path */
    scr_free(&dst_file);
    scr_path_delete(&dst_path);

    /* once a segment has failed the source position can no longer be
     * trusted, so don't copy any further segments */
    if (rc != SCR_SUCCESS) {
      break;
    }
  }

  /* close the source file */
  if (scr_close(file, fd_src) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* free buffer */
  scr_free(&buf);

  /* verify / set crc value */
  if (rc == SCR_SUCCESS) {
    uLong crc2;
    if (scr_crc_on_flush) {
      if (scr_meta_get_crc32(meta, &crc2) == SCR_SUCCESS) {
        /* if a crc is already set in the meta data, check that we computed the same value */
        if (crc != crc2) {
          scr_err("CRC32 mismatch detected when flushing file %s @ %s:%d",
            file, __FILE__, __LINE__
          );
          rc = SCR_FAILURE;
        }
      } else {
        /* if there is no crc set, let's set it now */
        scr_meta_set_crc32(meta, crc);
      }
    }
  }

  return rc;
}
/* flushes data for files specified in file_list (with flow control),
 * and records status of each file in data
 *
 * rank 0 flushes its own files first, then lets at most scr_flush_width
 * other ranks flush at the same time: each rank waits for a start message
 * from rank 0, flushes, and replies with its result; once rank 0 has seen
 * a failure, ranks started afterwards are told to skip their flush
 *
 * returns SCR_SUCCESS only if every rank reported success,
 * SCR_FAILURE otherwise */
static int scr_flush_data(scr_hash* file_list, scr_hash* data)
{
  int flushed = SCR_SUCCESS;

  /* flow control the write among processes */
  if (scr_my_rank_world == 0) {
    /* first, flush each of my files and fill in meta data structure */
    if (scr_flush_files_list(file_list, data) != SCR_SUCCESS) {
      flushed = SCR_FAILURE;
    }

    /* now, have a sliding window of w processes write simultaneously */
    int w = scr_flush_width;
    if (w > (scr_ranks_world - 1)) {
      w = scr_ranks_world - 1;
    }

    /* a non-positive width would size the arrays below with a negative
     * count and leave MPI_Waitany with no request to wait on, so always
     * allow at least one rank in the window */
    if (w < 1) {
      w = 1;
    }

    /* allocate MPI_Request arrays and an array of ints,
     * slots [0,w) hold the sends and slots [w,2w) hold the receives */
    int* flags       = (int*)         SCR_MALLOC(2 * w * sizeof(int));
    MPI_Request* req = (MPI_Request*) SCR_MALLOC(2 * w * sizeof(MPI_Request));
    MPI_Status status;

    int i = 1;
    int outstanding = 0;
    int index = 0;
    while (i < scr_ranks_world || outstanding > 0) {
      /* issue up to w outstanding sends and receives */
      while (i < scr_ranks_world && outstanding < w) {
        /* post a receive for the response message we'll get back when rank i is done */
        MPI_Irecv(&flags[w + index], 1, MPI_INT, i, 0, scr_comm_world, &req[w + index]);

        /* post a send to tell rank i to start */
        flags[index] = flushed;
        MPI_Isend(&flags[index], 1, MPI_INT, i, 0, scr_comm_world, &req[index]);

        /* update the number of outstanding requests */
        i++;
        outstanding++;
        index++;
      }

      /* wait to hear back from any rank, index is set to the slot
       * that completed, which is also the next slot we can reuse */
      MPI_Waitany(w, &req[w], &index, &status);

      /* someone responded, the send to this rank should also be done, so complete it */
      MPI_Wait(&req[index], &status);

      /* determine whether this rank flushed its file successfully */
      if (flags[w + index] != SCR_SUCCESS) {
        flushed = SCR_FAILURE;
      }

      /* one less request outstanding now */
      outstanding--;
    }

    /* free the MPI_Request arrays */
    scr_free(&req);
    scr_free(&flags);
  } else {
    /* receive signal to start */
    int start = 0;
    MPI_Status status;
    MPI_Recv(&start, 1, MPI_INT, 0, 0, scr_comm_world, &status);

    /* flush files if we've had success so far, otherwise skip the flush and return failure */
    if (start == SCR_SUCCESS) {
      /* flush each of my files and fill in meta data structure */
      if (scr_flush_files_list(file_list, data) != SCR_SUCCESS) {
        flushed = SCR_FAILURE;
      }
    } else {
      /* someone failed before we even started, so don't bother */
      flushed = SCR_FAILURE;
    }

    /* send message to rank 0 to report that we're done */
    MPI_Send(&flushed, 1, MPI_INT, 0, 0, scr_comm_world);
  }

  /* determine whether everyone wrote their files ok */
  if (scr_alltrue((flushed == SCR_SUCCESS))) {
    return SCR_SUCCESS;
  }
  return SCR_FAILURE;
}
/* allocates a new hash, initializes its list head via LIST_INIT,
 * and returns it to the caller */
scr_hash* scr_hash_new()
{
  scr_hash* new_hash = (scr_hash*) SCR_MALLOC(sizeof(scr_hash));

  /* set up the list head before anyone can add elements */
  LIST_INIT(new_hash);

  return new_hash;
}
/* fetch files specified in file_list into specified dir and update
 * filemap
 *
 * rank 0 fetches its own files first, then lets at most scr_fetch_width
 * other ranks fetch at the same time: each rank waits for a start message
 * from rank 0, fetches, and replies with its result; once rank 0 has seen
 * a failure, ranks started afterwards skip their fetch
 *
 * returns SCR_SUCCESS only if every rank reported success,
 * SCR_FAILURE otherwise */
static int scr_fetch_data(
  const scr_hash* file_list,
  const char* dir,
  scr_filemap* map)
{
  int success = SCR_SUCCESS;

  /* flow control rate of file reads from rank 0 */
  if (scr_my_rank_world == 0) {
    /* fetch these files into the directory */
    if (scr_fetch_files_list(file_list, dir, map) != SCR_SUCCESS) {
      success = SCR_FAILURE;
    }

    /* now, have a sliding window of w processes read simultaneously */
    int w = scr_fetch_width;
    if (w > scr_ranks_world-1) {
      w = scr_ranks_world-1;
    }

    /* a non-positive width would size the arrays below with a negative
     * count and leave MPI_Waitany with no request to wait on, so always
     * allow at least one rank in the window */
    if (w < 1) {
      w = 1;
    }

    /* allocate MPI_Request arrays and an array of ints,
     * slots [0,w) hold the sends and slots [w,2w) hold the receives */
    int* flags       = (int*)         SCR_MALLOC(2 * w * sizeof(int));
    MPI_Request* req = (MPI_Request*) SCR_MALLOC(2 * w * sizeof(MPI_Request));
    MPI_Status status;

    /* execute our flow control window */
    int outstanding = 0;
    int index = 0;
    int i = 1;
    while (i < scr_ranks_world || outstanding > 0) {
      /* issue up to w outstanding sends and receives */
      while (i < scr_ranks_world && outstanding < w) {
        /* post a receive for the response message we'll get back when
         * rank i is done */
        MPI_Irecv(&flags[index + w], 1, MPI_INT, i, 0, scr_comm_world, &req[index + w]);

        /* send a start signal to this rank */
        flags[index] = success;
        MPI_Isend(&flags[index], 1, MPI_INT, i, 0, scr_comm_world, &req[index]);

        /* update the number of outstanding requests */
        outstanding++;
        index++;
        i++;
      }

      /* wait to hear back from any rank, index is set to the slot
       * that completed, which is also the next slot we can reuse */
      MPI_Waitany(w, &req[w], &index, &status);

      /* the corresponding send must be complete */
      MPI_Wait(&req[index], &status);

      /* check success code from process */
      if (flags[index + w] != SCR_SUCCESS) {
        success = SCR_FAILURE;
      }

      /* one less request outstanding now */
      outstanding--;
    }

    /* free the MPI_Request arrays */
    scr_free(&req);
    scr_free(&flags);
  } else {
    /* wait for start signal from rank 0 */
    MPI_Status status;
    MPI_Recv(&success, 1, MPI_INT, 0, 0, scr_comm_world, &status);

    /* if rank 0 hasn't seen a failure, try to read in our files */
    if (success == SCR_SUCCESS) {
      /* fetch these files into the directory */
      if (scr_fetch_files_list(file_list, dir, map) != SCR_SUCCESS) {
        success = SCR_FAILURE;
      }
    }

    /* tell rank 0 that we're done and send him our success code */
    MPI_Send(&success, 1, MPI_INT, 0, 0, scr_comm_world);
  }

  /* determine whether all processes successfully read their files */
  if (scr_alltrue(success == SCR_SUCCESS)) {
    return SCR_SUCCESS;
  }
  return SCR_FAILURE;
}
/* apply XOR redundancy scheme to dataset files
 *
 * Reads this process's files for dataset `id` from the filemap, combines
 * them chunk-by-chunk with data passed around the ring formed by
 * state->lhs_rank / state->rhs_rank on c->comm, and writes the resulting
 * XOR chunk (preceded by a header hash) to a file in the hidden cache
 * directory.  The chunk file and its meta data are recorded in the filemap.
 *
 *   map - filemap that lists this process's files and receives the chunk file
 *   c   - redundancy descriptor; c->copy_state is used as a scr_reddesc_xor
 *   id  - dataset id
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if the XOR set has fewer than two
 * ranks, or if reading a dataset file, writing the chunk file, or closing
 * the chunk file failed.  Allocation and open failures call scr_abort. */
static int scr_reddesc_apply_xor(scr_filemap* map, const scr_reddesc* c, int id)
{
  int rc = SCR_SUCCESS;

  /* the chunk size below is computed by dividing by (ranks - 1),
   * so refuse to run on a set with fewer than two ranks; we bail out
   * before allocating anything or communicating with other processes */
  if (c->ranks < 2) {
    scr_err("XOR redundancy requires at least 2 ranks in a set, but set has %d @ %s:%d",
      c->ranks, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* get pointer to XOR state structure */
  scr_reddesc_xor* state = (scr_reddesc_xor*) c->copy_state;

  /* allocate buffer to read a piece of my file */
  char* send_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (send_buf == NULL) {
    scr_abort(-1, "Allocating memory for send buffer: malloc(%d) errno=%d %s @ %s:%d",
      scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* allocate buffer to read a piece of the received chunk file */
  char* recv_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (recv_buf == NULL) {
    scr_abort(-1, "Allocating memory for recv buffer: malloc(%d) errno=%d %s @ %s:%d",
      scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* count the number of files I have and allocate space in structures for each of them */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  int* fds = (int*) SCR_MALLOC(num_files * sizeof(int));
  char** filenames = (char**) SCR_MALLOC(num_files * sizeof(char*));
  unsigned long* filesizes = (unsigned long*) SCR_MALLOC(num_files * sizeof(unsigned long));

  /* record partner's redundancy descriptor hash in our filemap:
   * we send ours to the right-hand rank and receive from the left-hand rank */
  scr_hash* lhs_desc_hash = scr_hash_new();
  scr_hash* my_desc_hash  = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm);
  scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash);
  scr_hash_delete(&my_desc_hash);
  scr_hash_delete(&lhs_desc_hash);

  /* allocate a new xor file header hash */
  scr_hash* header = scr_hash_new();

  /* record the global ranks of the processes in our xor group */
  scr_hash_merge(header, state->group_map);

  /* record dataset in header */
  scr_hash* dataset = scr_hash_new();
  scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset);
  scr_hash_set(header, SCR_KEY_COPY_XOR_DATASET, dataset);

  /* open each file, get the filesize of each, and read the meta data of each */
  scr_hash* current_files = scr_hash_new();
  int file_count = 0;
  unsigned long my_bytes = 0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename (pointer into the filemap, not a copy) */
    filenames[file_count] = scr_hash_elem_key(file_elem);

    /* get the filesize of this file and add the byte count to the total */
    filesizes[file_count] = scr_file_size(filenames[file_count]);
    my_bytes += filesizes[file_count];

    /* read the meta data for this file and insert it into the current_files hash */
    scr_meta* file_hash = scr_meta_new();
    scr_filemap_get_meta(map, id, scr_my_rank_world, filenames[file_count], file_hash);
    scr_hash_setf(current_files, file_hash, "%d", file_count);

    /* open the file */
    fds[file_count] = scr_open(filenames[file_count], O_RDONLY);
    if (fds[file_count] < 0) {
      /* TODO: try again? */
      scr_abort(-1, "Opening checkpoint file for copying: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        filenames[file_count], errno, strerror(errno), __FILE__, __LINE__
      );
    }

    file_count++;
  }

  /* set total number of files we have, plus our rank */
  scr_hash* current_hash = scr_hash_new();
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_RANK,  scr_my_rank_world);
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_FILES, file_count);
  scr_hash_set(current_hash, SCR_KEY_COPY_XOR_FILE, current_files);

  /* exchange file info with partners and add data to our header */
  scr_hash* partner_hash = scr_hash_new();
  scr_hash_sendrecv(current_hash, state->rhs_rank, partner_hash, state->lhs_rank, c->comm);
  scr_hash_set(header, SCR_KEY_COPY_XOR_CURRENT, current_hash);
  scr_hash_set(header, SCR_KEY_COPY_XOR_PARTNER, partner_hash);

  /* allreduce to get maximum filesize */
  unsigned long max_bytes;
  MPI_Allreduce(&my_bytes, &max_bytes, 1, MPI_UNSIGNED_LONG, MPI_MAX, c->comm);

  /* TODO: use unsigned long integer arithmetic (with proper byte padding)
   * instead of char to speed things up */

  /* compute chunk size according to maximum file length and number of ranks in xor set */
  /* if filesize doesn't divide evenly, then add one byte to chunk_size */
  /* (ranks - 1) is at least 1 here thanks to the check at the top */
  size_t chunk_size = max_bytes / (unsigned long) (c->ranks - 1);
  if ((c->ranks - 1) * chunk_size < max_bytes) {
    chunk_size++;
  }

  /* TODO: need something like this to handle 0-byte files? */
  if (chunk_size == 0) {
    chunk_size++;
  }

  /* record the dataset id and the chunk size in the xor chunk header */
  scr_hash_util_set_bytecount(header, SCR_KEY_COPY_XOR_CHUNK, chunk_size);

  /* set chunk filenames of form: xor.<group_id>_<xor_rank+1>_of_<xor_ranks>.scr */
  char my_chunk_file[SCR_MAX_FILENAME];
  char* dir = scr_cache_dir_hidden_get(c, id);
  int name_len = snprintf(my_chunk_file, sizeof(my_chunk_file),
    "%s/xor.%d_%d_of_%d.scr", dir, c->group_id, c->rank+1, c->ranks
  );
  if (name_len < 0 || (size_t) name_len >= sizeof(my_chunk_file)) {
    /* a truncated name would point at the wrong file, so treat it as fatal
     * like the other unrecoverable errors in this function */
    scr_abort(-1, "XOR chunk file name in %s does not fit in %d bytes @ %s:%d",
      dir, (int) sizeof(my_chunk_file), __FILE__, __LINE__
    );
  }
  scr_free(&dir);

  /* record chunk file in filemap before creating it */
  scr_filemap_add_file(map, id, scr_my_rank_world, my_chunk_file);
  scr_filemap_write(scr_map_file, map);

  /* open my chunk file */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int fd_chunk = scr_open(my_chunk_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (fd_chunk < 0) {
    /* TODO: try again? */
    scr_abort(-1, "Opening XOR chunk file for writing: scr_open(%s) errno=%d %s @ %s:%d",
      my_chunk_file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* write out the xor chunk header */
  /* NOTE(review): the result of scr_hash_write_fd is not checked here */
  scr_hash_write_fd(my_chunk_file, fd_chunk, header);
  scr_hash_delete(&header);

  MPI_Request request[2];
  MPI_Status  status[2];

  /* XOR Reduce_scatter: process the chunk in pieces of at most
   * scr_mpi_buf_size bytes; for each piece, step chunk_id from ranks-1
   * down to 0, accumulating data around the ring, and write the piece
   * out when chunk_id reaches 0 */
  size_t nread = 0;
  while (nread < chunk_size) {
    size_t count = chunk_size - nread;
    if (count > scr_mpi_buf_size) {
      count = scr_mpi_buf_size;
    }

    int chunk_id;
    for (chunk_id = c->ranks-1; chunk_id >= 0; chunk_id--) {
      /* read the next set of bytes for this chunk from my file into send_buf */
      if (chunk_id > 0) {
        /* map the ring position to an index into my own data,
         * skipping over my own rank's slot */
        int chunk_id_rel = (c->rank + c->ranks + chunk_id) % c->ranks;
        if (chunk_id_rel > c->rank) {
          chunk_id_rel--;
        }
        unsigned long offset = chunk_size * (unsigned long) chunk_id_rel + nread;
        if (scr_read_pad_n(num_files, filenames, fds, send_buf, count, offset, filesizes) != SCR_SUCCESS) {
          /* record the failure but keep going -- presumably so the
           * send/recv pattern with our neighbors stays matched */
          rc = SCR_FAILURE;
        }
      } else {
        /* final step contributes no data of our own */
        memset(send_buf, 0, count);
      }

      /* TODO: XORing with unsigned long would be faster here
       * (if chunk size is multiple of this size) */
      /* merge the blocks via xor operation
       * (nothing has been received yet on the first step) */
      if (chunk_id < c->ranks-1) {
        size_t j;
        for (j = 0; j < count; j++) {
          send_buf[j] ^= recv_buf[j];
        }
      }

      if (chunk_id > 0) {
        /* not our chunk to write, forward it on and get the next */
        MPI_Irecv(recv_buf, count, MPI_BYTE, state->lhs_rank, 0, c->comm, &request[0]);
        MPI_Isend(send_buf, count, MPI_BYTE, state->rhs_rank, 0, c->comm, &request[1]);
        MPI_Waitall(2, request, status);
      } else {
        /* write send block to send chunk file */
        if (scr_write_attempt(my_chunk_file, fd_chunk, send_buf, count) != count) {
          rc = SCR_FAILURE;
        }
      }
    }

    nread += count;
  }

  /* close my chunkfile, with fsync */
  if (scr_close(my_chunk_file, fd_chunk) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* close my dataset files */
  int file_idx;
  for (file_idx = 0; file_idx < num_files; file_idx++) {
    scr_close(filenames[file_idx], fds[file_idx]);
  }

  /* free the buffers */
  scr_free(&filesizes);
  /* in this case, we don't free each name, since we copied the pointer
   * to the string in the filemap */
  scr_free(&filenames);
  scr_free(&fds);
  scr_align_free(&send_buf);
  scr_align_free(&recv_buf);

  /* TODO: need to check for errors */
  /* write meta file for xor chunk */
  unsigned long my_chunk_file_size = scr_file_size(my_chunk_file);
  scr_meta* meta = scr_meta_new();
  scr_meta_set_filename(meta, my_chunk_file);
  scr_meta_set_filetype(meta, SCR_META_FILE_XOR);
  scr_meta_set_filesize(meta, my_chunk_file_size);
  scr_meta_set_complete(meta, 1);
  /* TODODSET: move the ranks field elsewhere, for now it's needed by scr_index.c */
  scr_meta_set_ranks(meta, scr_ranks_world);
  scr_filemap_set_meta(map, id, scr_my_rank_world, my_chunk_file, meta);
  scr_filemap_write(scr_map_file, map);
  scr_meta_delete(&meta);

  /* if crc_on_copy is set, compute and store CRC32 value for chunk file */
  if (scr_crc_on_copy) {
    scr_compute_crc(map, id, scr_my_rank_world, my_chunk_file);
    /* TODO: would be nice to save this CRC in our partner's XOR file
     * so we can check correctness on a rebuild */
  }

  return rc;
}