/* environment specific init/finalize */ int scr_env_init(void) { #ifdef SCR_RESOURCE_MANAGER_PMIX /* init pmix */ int retval = PMIx_Init(&scr_pmix_proc, NULL, 0); if (retval != PMIX_SUCCESS) { scr_err("PMIx_Init failed: rc=%d @ %s:%d", retval, __FILE__, __LINE__ ); return SCR_FAILURE; } scr_dbg(1, "PMIx_Init succeeded @ %s:%d", __FILE__, __LINE__); #endif /* SCR_MACHINE_TYPE == SCR_PMIX */ #ifdef HAVE_LIBCPPR /* attempt to init cppr */ int cppr_ret = cppr_status(); if (cppr_ret != CPPR_SUCCESS) { scr_abort(-1, "libcppr cppr_status() failed: %d '%s' @ %s:%d", cppr_ret, cppr_err_to_str(cppr_ret), __FILE__, __LINE__ ); } scr_dbg(1, "#bold CPPR is present @ %s:%d", __FILE__, __LINE__); #endif /* HAVE_LIBCPPR */ return SCR_SUCCESS; }
/* Stores the current working directory in buf, which holds size bytes.
 * Returns SCR_SUCCESS when getcwd() succeeds.  On failure the errno
 * details are passed to scr_abort; should that call return, the
 * function yields SCR_FAILURE. */
int scr_getcwd(char* buf, size_t size)
{
  if (getcwd(buf, size) != NULL) {
    return SCR_SUCCESS;
  }

  scr_abort(-1, "Problem reading current working directory (getcwd() errno=%d %s) @ %s:%d",
    errno, strerror(errno), __FILE__, __LINE__
  );
  return SCR_FAILURE;
}
/* Creates an empty hash object on the heap.
 * The list head is initialized with LIST_INIT before it is returned.
 * If malloc fails, scr_abort is called and NULL is returned should
 * that call come back. */
scr_hash* scr_hash_new()
{
  scr_hash* new_hash = (scr_hash*) malloc(sizeof(*new_hash));
  if (new_hash == NULL) {
    scr_abort(-1, "Failed to allocate memory for hash object @ %s:%d",
      __FILE__, __LINE__
    );
    return NULL;
  }

  LIST_INIT(new_hash);
  return new_hash;
}
/* Creates a hash element on the heap with its key and hash fields
 * cleared to NULL.  If malloc fails, scr_abort is called and NULL is
 * returned should that call come back. */
static scr_hash_elem* scr_hash_elem_new()
{
  scr_hash_elem* new_elem = (scr_hash_elem*) malloc(sizeof(*new_elem));
  if (new_elem == NULL) {
    scr_abort(-1, "Failed to allocate memory for hash element @ %s:%d",
      __FILE__, __LINE__
    );
    return NULL;
  }

  new_elem->key  = NULL;
  new_elem->hash = NULL;
  return new_elem;
}
/* searches for name and returns a character pointer to its value if set, * returns NULL if not found */ char* scr_param_get(char* name) { char* value = NULL; /* see if this parameter is one which is restricted from user */ scr_hash* no_user = scr_hash_get(scr_no_user_hash, name); /* if parameter is set in environment, return that value */ if (no_user == NULL && getenv(name) != NULL) { /* we don't just return the getenv value directly because that causes * segfaults on some systems, so instead we add it to a hash and return * the pointer into the hash */ /* try to lookup the value for this name in case we've already cached it */ if (scr_hash_util_get_str(scr_env_hash, name, &value) != SCR_SUCCESS) { /* it's not in the hash yet, so add it */ char* tmp_value = strdup(getenv(name)); scr_hash_util_set_str(scr_env_hash, name, tmp_value); scr_free(&tmp_value); /* now issue our lookup again */ if (scr_hash_util_get_str(scr_env_hash, name, &value) != SCR_SUCCESS) { /* it's an error if we don't find it this time */ scr_abort(-1, "Failed to find value for %s in env hash @ %s:%d", name, __FILE__, __LINE__ ); } } return value; } /* otherwise, if parameter is set in user configuration file, * return that value */ value = scr_hash_elem_get_first_val(scr_user_hash, name); if (no_user == NULL && value != NULL) { return value; } /* otherwise, if parameter is set in system configuration file, * return that value */ value = scr_hash_elem_get_first_val(scr_system_hash, name); if (value != NULL) { return value; } /* parameter not found, return NULL */ return NULL; }
/* allocates a new string (to be freed with scr_free) * that is path to user config file */ static char* user_config_path() { char* file = NULL; /* first, use SCR_CONF_FILE if it's set */ char* value = getenv("SCR_CONF_FILE"); if (value != NULL) { file = strdup(value); return file; } /* otherwise, look in the prefix directory */ char* prefix = NULL; value = getenv("SCR_PREFIX"); if (value != NULL) { /* user set SCR_PREFIX, strdup that value */ prefix = strdup(value); } else { /* if user didn't set with SCR_PREFIX, * pick up the current working directory as a default */ char current_dir[SCR_MAX_FILENAME]; if (scr_getcwd(current_dir, sizeof(current_dir)) != SCR_SUCCESS) { scr_abort(-1, "Problem reading current working directory @ %s:%d", __FILE__, __LINE__ ); } prefix = strdup(current_dir); } /* couldn't find a prefix directory, so bail */ if (prefix == NULL) { return file; } /* tack file name on to directory */ scr_path* prefix_path = scr_path_from_str(prefix); scr_path_append_str(prefix_path, SCR_CONFIG_FILE_USER); file = scr_path_strdup(prefix_path); scr_path_delete(&prefix_path); /* free the prefix dir which we strdup'd */ scr_free(&prefix); return file; }
/* Computes the CRC32 of file and reconciles it with the meta data
 * stored for (id, rank, file) in map.
 *
 * If the meta data already carries a CRC32, the freshly computed value
 * is compared against it; otherwise the computed value is recorded in
 * the meta data and written back to the filemap.
 *
 * Returns SCR_SUCCESS when the CRC matched or was newly recorded, and
 * SCR_FAILURE when the CRC could not be computed, the meta data could
 * not be read from the filemap, or the two CRC values differ. */
static int scr_compute_crc(scr_filemap* map, int id, int rank, const char* file)
{
  /* compute crc for the file */
  uLong crc_file;
  if (scr_crc32(file, &crc_file) != SCR_SUCCESS) {
    scr_err("Failed to compute crc for file %s @ %s:%d",
      file, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* allocate a new meta data object */
  scr_meta* meta = scr_meta_new();
  if (meta == NULL) {
    scr_abort(-1, "Failed to allocate meta data object @ %s:%d",
      __FILE__, __LINE__
    );
  }

  int rc = SCR_SUCCESS;

  /* read meta data from filemap; on failure we still fall through to
   * the cleanup below so the meta object is not leaked */
  if (scr_filemap_get_meta(map, id, rank, file, meta) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  } else {
    /* read crc value from meta data */
    uLong crc_meta;
    if (scr_meta_get_crc32(meta, &crc_meta) == SCR_SUCCESS) {
      /* a crc was recorded earlier, it must match what we just computed */
      if (crc_file != crc_meta) {
        rc = SCR_FAILURE;
      }
    } else {
      /* no crc on record yet, store the one we computed */
      scr_meta_set_crc32(meta, crc_file);
      scr_filemap_set_meta(map, id, rank, file, meta);
    }
  }

  /* free our meta data object */
  scr_meta_delete(&meta);

  return rc;
}
/* Exchanges files with partner ranks by copying.
 *
 * If have_outgoing is set, file_send is opened read-only and streamed
 * to rank_send in pieces of at most scr_mpi_buf_size bytes.  If
 * have_incoming is set, file_recv is created (or truncated) and filled
 * with the pieces arriving from rank_recv.  A message shorter than
 * scr_mpi_buf_size marks the end of a stream on both sides.
 *
 * When scr_crc_on_copy is set, *crc32_send and *crc32_recv are updated
 * with the CRC32 of the bytes sent and received, and the send CRC is
 * stored in meta_send if that meta data has no CRC yet.  meta_recv is
 * not used by this function.
 *
 * Allocation and open failures are reported through scr_abort.
 * Returns SCR_FAILURE if reading file_send failed or if closing the
 * newly written file_recv failed, SCR_SUCCESS otherwise. */
static int scr_swap_files_copy(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];
  MPI_Status  status[2];

  /* allocate MPI send buffer */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* open the file to send: read-only mode */
  int fd_send = -1;
  if (have_outgoing) {
    fd_send = scr_open(file_send, O_RDONLY);
    if (fd_send < 0) {
      scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* open the file to recv: truncate, write-only mode */
  int fd_recv = -1;
  if (have_incoming) {
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd_recv < 0) {
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
        file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks */
  int nread, nwrite;
  int sending = 0;
  if (have_outgoing) {
    sending = 1;
  }
  int receiving = 0;
  if (have_incoming) {
    receiving = 1;
  }
  while (sending || receiving) {
    /* if we are still receiving a file, post a receive */
    if (receiving) {
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    /* if we are still sending a file, read a chunk, send it, and wait */
    if (sending) {
      nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      if (nread < 0) {
        /* read error: send an empty message so the partner's receive
         * loop terminates, and report the failure to our caller */
        nread = 0;
        rc = SCR_FAILURE;
      }
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);

      /* a short chunk means we hit the end of the file */
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    /* if we are still receiving a file,
     * wait on our receive to complete and write the data */
    if (receiving) {
      MPI_Wait(&request[0], &status[0]);
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }
      scr_write(file_recv, fd_recv, buf_recv, nwrite);

      /* a short message means the partner has sent its whole file */
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close the files */
  if (have_outgoing) {
    scr_close(file_send, fd_send);
  }
  if (have_incoming) {
    /* the received file was just written, so a failed close means
     * its contents cannot be trusted */
    if (scr_close(file_recv, fd_recv) != SCR_SUCCESS) {
      rc = SCR_FAILURE;
    }
  }

  /* set crc field on our file if it hasn't been set already */
  if (scr_crc_on_copy && have_outgoing) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      scr_meta_set_crc32(meta_send, *crc32_send);
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
static int scr_swap_files_move( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* since we'll overwrite our send file in place with the recv file, * which may be larger, we need to keep track of how many bytes we've * sent and whether we've sent them all */ unsigned long filesize_send = 0; /* open our file */ int fd = -1; if (have_outgoing) { /* we'll overwrite our send file (or just read it if there is no incoming) */ filesize_send = scr_file_size(file_send); fd = scr_open(file_send, O_RDWR); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } else if (have_incoming) { /* if we're in this branch, then we only have an incoming file, * so we'll write our recv file from scratch */ mode_t mode_file = scr_getmode(1, 1, 0); fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) 
errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } int nread, nwrite; off_t read_pos = 0, write_pos = 0; while (sending || receiving) { if (receiving) { /* prepare a buffer to receive up to scr_mpi_buf_size bytes */ MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } if (sending) { /* compute number of bytes to read */ unsigned long count = filesize_send - read_pos; if (count > scr_mpi_buf_size) { count = scr_mpi_buf_size; } /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */ lseek(fd, read_pos, SEEK_SET); /* seek to read position */ nread = scr_read(file_send, fd, buf_send, count); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } read_pos += (off_t) nread; /* update read pointer */ /* send chunk (if nread is smaller than scr_mpi_buf_size, * then we've read the whole file) */ MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); /* check whether we've read the whole file */ if (filesize_send == read_pos && count < scr_mpi_buf_size) { sending = 0; } } if (receiving) { /* count the number of bytes received */ MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size, * then we've received the whole file) */ lseek(fd, write_pos, SEEK_SET); /* seek to write position */ scr_write(file_recv, fd, buf_recv, nwrite); write_pos += (off_t) nwrite; /* update write pointer */ /* if nwrite is smaller than scr_mpi_buf_size, * then assume we've received the whole file */ if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } 
/* close file and cleanup */ if (have_outgoing && have_incoming) { /* sent and received a file; close it, truncate it to corect size, rename it */ scr_close(file_send, fd); truncate(file_send, write_pos); rename(file_send, file_recv); } else if (have_outgoing) { /* only sent a file; close it, delete it, and remove its completion marker */ scr_close(file_send, fd); scr_file_unlink(file_send); } else if (have_incoming) { /* only received a file; just need to close it */ scr_close(file_recv, fd); } if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { /* we transfer this meta data across below, * so may as well update these fields so we can use them */ scr_meta_set_crc32(meta_send, *crc32_send); /* do not complete file send, we just deleted it above */ } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }
/* Flushes every file listed under SCR_KEY_FILE in file_list and adds
 * one entry per file to summary.
 *
 * A file whose entry defines segments (SCR_SUMMARY_6_KEY_SEGMENT) is
 * flushed to its containers and is listed in the summary by its base
 * name.  Any other file is copied to the directory stored under
 * SCR_KEY_PATH and is listed by its path relative to scr_prefix_path.
 * For a successful flush the file size and CRC32 are recorded when the
 * meta data provides them; a failed flush marks the summary entry as
 * incomplete.
 *
 * Returns SCR_SUCCESS if every flush succeeded, SCR_FAILURE otherwise. */
static int scr_flush_files_list(scr_hash* file_list, scr_hash* summary)
{
  /* stays at success unless one of the flushes below fails */
  int rc = SCR_SUCCESS;

  scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);

  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(files);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* the element key is the name of the file to flush */
    char* file = scr_hash_elem_key(elem);

    /* reduce the file name to its last path component */
    scr_path* path_name = scr_path_from_str(file);
    scr_path_basename(path_name);

    /* per-file data: meta data and, optionally, container segments */
    scr_hash* hash     = scr_hash_elem_hash(elem);
    scr_meta* meta     = scr_hash_get(hash, SCR_KEY_META);
    scr_hash* segments = scr_hash_get(hash, SCR_SUMMARY_6_KEY_SEGMENT);

    if (segments == NULL) {
      /* no segments: the file is copied out as is */
      char* dir;
      if (scr_hash_util_get_str(hash, SCR_KEY_PATH, &dir) != SCR_SUCCESS) {
        scr_abort(-1, "Failed to read directory to flush file to @ %s:%d",
          __FILE__, __LINE__
        );
      } else {
        /* destination = target directory + base name */
        scr_path* path_full = scr_path_from_str(dir);
        scr_path_append(path_full, path_name);

        /* the summary refers to files relative to the prefix directory */
        scr_path* path_relative = scr_path_relative(scr_prefix_path, path_full);
        if (scr_path_is_null(path_relative)) {
          scr_abort(-1, "Failed to get relative path to directory %s from %s @ %s:%d",
            dir, scr_prefix, __FILE__, __LINE__
          );
        } else {
          /* create the summary entry for this file */
          char* rel_name = scr_path_strdup(path_relative);
          scr_hash* entry = scr_hash_set_kv(summary, SCR_SUMMARY_6_KEY_FILE, rel_name);
          scr_free(&rel_name);

          if (scr_flush_a_file(file, dir, meta) != SCR_SUCCESS) {
            /* flush failed, flag the entry as incomplete */
            rc = SCR_FAILURE;
            scr_hash_set_kv_int(entry, SCR_SUMMARY_6_KEY_COMPLETE, 0);
          } else {
            /* flush worked, copy size and crc over when known */
            unsigned long size = 0;
            if (scr_meta_get_filesize(meta, &size) == SCR_SUCCESS) {
              scr_hash_util_set_bytecount(entry, SCR_SUMMARY_6_KEY_SIZE, size);
            }

            uLong checksum = 0;
            if (scr_meta_get_crc32(meta, &checksum) == SCR_SUCCESS) {
              scr_hash_util_set_crc32(entry, SCR_SUMMARY_6_KEY_CRC, checksum);
            }
          }
        }

        /* done with the relative and full paths */
        scr_path_delete(&path_relative);
        scr_path_delete(&path_full);
      }
    } else {
      /* segments given: the file goes into its containers */

      /* TODO: PRESERVE get original filename here */

      /* create the summary entry for this file */
      char* base_name = scr_path_strdup(path_name);
      scr_hash* entry = scr_hash_set_kv(summary, SCR_SUMMARY_6_KEY_FILE, base_name);
      scr_free(&base_name);

      if (scr_flush_file_to_containers(file, meta, segments, scr_prefix) != SCR_SUCCESS) {
        /* flush failed, flag the entry as incomplete */
        rc = SCR_FAILURE;
        scr_hash_set_kv_int(entry, SCR_SUMMARY_6_KEY_COMPLETE, 0);
      } else {
        /* flush worked, copy size and crc over when known */
        unsigned long size = 0;
        if (scr_meta_get_filesize(meta, &size) == SCR_SUCCESS) {
          scr_hash_util_set_bytecount(entry, SCR_SUMMARY_6_KEY_SIZE, size);
        }

        uLong checksum = 0;
        if (scr_meta_get_crc32(meta, &checksum) == SCR_SUCCESS) {
          scr_hash_util_set_crc32(entry, SCR_SUMMARY_6_KEY_CRC, checksum);
        }

        /* store a private copy of the segment list in the summary */
        scr_hash* segments_copy = scr_hash_new();
        scr_hash_merge(segments_copy, segments);
        scr_hash_set(entry, SCR_SUMMARY_6_KEY_SEGMENT, segments_copy);
      }
    }

    /* done with the base name path */
    scr_path_delete(&path_name);
  }

  return rc;
}
/* Applies the XOR redundancy scheme to the files of dataset id.
 *
 * Steps, in order:
 *   1. exchange redundancy descriptors with the neighbours in c->comm
 *      and store the left neighbour's descriptor in the filemap;
 *   2. build the chunk file header (group map, dataset, this rank's
 *      file list and meta data, the partner's file list);
 *   3. derive the chunk size from the largest per-rank byte count in
 *      the group, split across (c->ranks - 1) chunks and rounded up,
 *      with a minimum of one byte;
 *   4. register and create the chunk file xor.<group>_<rank+1>_of_<ranks>.scr
 *      in the hidden cache directory and write the header to it;
 *   5. run the XOR reduce-scatter: for each buffer-sized slice, data
 *      is read from this rank's files, XORed with the buffer received
 *      from the left neighbour and forwarded to the right neighbour;
 *      on the last step the accumulated buffer is written to the
 *      chunk file;
 *   6. record meta data for the chunk file in the filemap and, when
 *      scr_crc_on_copy is set, compute its CRC32.
 *
 * Allocation and open failures, a group of fewer than two ranks, and
 * a chunk file name that does not fit its buffer are reported through
 * scr_abort.
 * Returns SCR_FAILURE if reading the dataset files, writing or closing
 * the chunk file, or computing its CRC failed; SCR_SUCCESS otherwise. */
static int scr_reddesc_apply_xor(scr_filemap* map, const scr_reddesc* c, int id)
{
  int rc = SCR_SUCCESS;
  int i;

  /* get pointer to XOR state structure */
  scr_reddesc_xor* state = (scr_reddesc_xor*) c->copy_state;

  /* the chunk size below divides by (ranks - 1), so a group of fewer
   * than two ranks cannot be encoded */
  if (c->ranks < 2) {
    scr_abort(-1, "XOR redundancy requires at least 2 ranks, found %d @ %s:%d",
      c->ranks, __FILE__, __LINE__
    );
  }

  /* allocate buffer to read a piece of my file */
  char* send_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (send_buf == NULL) {
    scr_abort(-1, "Allocating memory for send buffer: malloc(%d) errno=%d %s @ %s:%d",
      scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* allocate buffer to read a piece of the received chunk file */
  char* recv_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (recv_buf == NULL) {
    scr_abort(-1, "Allocating memory for recv buffer: malloc(%d) errno=%d %s @ %s:%d",
      scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* count the number of files I have and allocate space in structures for each of them */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  int* fds = (int*) SCR_MALLOC(num_files * sizeof(int));
  char** filenames = (char**) SCR_MALLOC(num_files * sizeof(char*));
  unsigned long* filesizes = (unsigned long*) SCR_MALLOC(num_files * sizeof(unsigned long));

  /* record partner's redundancy descriptor hash in our filemap */
  scr_hash* lhs_desc_hash = scr_hash_new();
  scr_hash* my_desc_hash  = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm);
  scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash);
  scr_hash_delete(&my_desc_hash);
  scr_hash_delete(&lhs_desc_hash);

  /* allocate a new xor file header hash */
  scr_hash* header = scr_hash_new();

  /* record the global ranks of the processes in our xor group */
  scr_hash_merge(header, state->group_map);

  /* record dataset in header */
  scr_hash* dataset = scr_hash_new();
  scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset);
  scr_hash_set(header, SCR_KEY_COPY_XOR_DATASET, dataset);

  /* open each file, get the filesize of each, and read the meta data of each */
  scr_hash* current_files = scr_hash_new();
  int file_count = 0;
  unsigned long my_bytes = 0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    filenames[file_count] = scr_hash_elem_key(file_elem);

    /* get the filesize of this file and add the byte count to the total */
    filesizes[file_count] = scr_file_size(filenames[file_count]);
    my_bytes += filesizes[file_count];

    /* read the meta data for this file and insert it into the current_files hash */
    scr_meta* file_hash = scr_meta_new();
    scr_filemap_get_meta(map, id, scr_my_rank_world, filenames[file_count], file_hash);
    scr_hash_setf(current_files, file_hash, "%d", file_count);

    /* open the file */
    fds[file_count] = scr_open(filenames[file_count], O_RDONLY);
    if (fds[file_count] < 0) {
      /* TODO: try again? */
      scr_abort(-1, "Opening checkpoint file for copying: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        filenames[file_count], errno, strerror(errno), __FILE__, __LINE__
      );
    }

    file_count++;
  }

  /* set total number of files we have, plus our rank */
  scr_hash* current_hash = scr_hash_new();
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_RANK,  scr_my_rank_world);
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_FILES, file_count);
  scr_hash_set(current_hash, SCR_KEY_COPY_XOR_FILE, current_files);

  /* exchange file info with partners and add data to our header */
  scr_hash* partner_hash = scr_hash_new();
  scr_hash_sendrecv(current_hash, state->rhs_rank, partner_hash, state->lhs_rank, c->comm);
  scr_hash_set(header, SCR_KEY_COPY_XOR_CURRENT, current_hash);
  scr_hash_set(header, SCR_KEY_COPY_XOR_PARTNER, partner_hash);

  /* allreduce to get maximum filesize */
  unsigned long max_bytes;
  MPI_Allreduce(&my_bytes, &max_bytes, 1, MPI_UNSIGNED_LONG, MPI_MAX, c->comm);

  /* TODO: use unsigned long integer arithmetic (with proper byte padding) instead of char to speed things up */

  /* compute chunk size according to maximum file length and number of ranks in xor set;
   * if filesize doesn't divide evenly, then add one byte to chunk_size
   * (ranks >= 2 was verified at the top, so the divisor is non-zero) */
  size_t chunk_size = max_bytes / (unsigned long) (c->ranks - 1);
  if ((c->ranks - 1) * chunk_size < max_bytes) {
    chunk_size++;
  }

  /* TODO: need something like this to handle 0-byte files? */
  if (chunk_size == 0) {
    chunk_size++;
  }

  /* record the dataset id and the chunk size in the xor chunk header */
  scr_hash_util_set_bytecount(header, SCR_KEY_COPY_XOR_CHUNK, chunk_size);

  /* set chunk filenames of form:  xor.<group_id>_<xor_rank+1>_of_<xor_ranks>.scr */
  char my_chunk_file[SCR_MAX_FILENAME];
  char* dir = scr_cache_dir_hidden_get(c, id);
  int name_len = snprintf(my_chunk_file, sizeof(my_chunk_file), "%s/xor.%d_%d_of_%d.scr",
    dir, c->group_id, c->rank+1, c->ranks
  );
  if (name_len < 0 || (size_t) name_len >= sizeof(my_chunk_file)) {
    /* the name would have overflowed the buffer (or formatting failed) */
    scr_abort(-1, "Failed to build XOR chunk file name in directory %s @ %s:%d",
      dir, __FILE__, __LINE__
    );
  }
  scr_free(&dir);

  /* record chunk file in filemap before creating it */
  scr_filemap_add_file(map, id, scr_my_rank_world, my_chunk_file);
  scr_filemap_write(scr_map_file, map);

  /* open my chunk file */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int fd_chunk = scr_open(my_chunk_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (fd_chunk < 0) {
    /* TODO: try again? */
    scr_abort(-1, "Opening XOR chunk file for writing: scr_open(%s) errno=%d %s @ %s:%d",
      my_chunk_file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* write out the xor chunk header */
  scr_hash_write_fd(my_chunk_file, fd_chunk, header);
  scr_hash_delete(&header);

  MPI_Request request[2];
  MPI_Status  status[2];

  /* XOR Reduce_scatter */
  size_t nread = 0;
  while (nread < chunk_size) {
    /* process the chunk in slices of at most scr_mpi_buf_size bytes */
    size_t count = chunk_size - nread;
    if (count > scr_mpi_buf_size) {
      count = scr_mpi_buf_size;
    }

    int chunk_id;
    for(chunk_id = c->ranks-1; chunk_id >= 0; chunk_id--) {
      /* read the next set of bytes for this chunk from my file into send_buf */
      if (chunk_id > 0) {
        int chunk_id_rel = (c->rank + c->ranks + chunk_id) % c->ranks;
        if (chunk_id_rel > c->rank) {
          chunk_id_rel--;
        }
        unsigned long offset = chunk_size * (unsigned long) chunk_id_rel + nread;
        if (scr_read_pad_n(num_files, filenames, fds,
                           send_buf, count, offset, filesizes) != SCR_SUCCESS)
        {
          rc = SCR_FAILURE;
        }
      } else {
        /* final step: we contribute no data of our own, only zeros */
        memset(send_buf, 0, count);
      }

      /* TODO: XORing with unsigned long would be faster here (if chunk size is multiple of this size) */
      /* merge the blocks via xor operation
       * (nothing has been received yet on the very first step) */
      if (chunk_id < c->ranks-1) {
        size_t b;
        for (b = 0; b < count; b++) {
          send_buf[b] ^= recv_buf[b];
        }
      }

      if (chunk_id > 0) {
        /* not our chunk to write, forward it on and get the next */
        MPI_Irecv(recv_buf, count, MPI_BYTE, state->lhs_rank, 0, c->comm, &request[0]);
        MPI_Isend(send_buf, count, MPI_BYTE, state->rhs_rank, 0, c->comm, &request[1]);
        MPI_Waitall(2, request, status);
      } else {
        /* write send block to send chunk file */
        if (scr_write_attempt(my_chunk_file, fd_chunk, send_buf, count) != count) {
          rc = SCR_FAILURE;
        }
      }
    }

    nread += count;
  }

  /* close my chunkfile, with fsync */
  if (scr_close(my_chunk_file, fd_chunk) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* close my dataset files */
  for (i=0; i < num_files; i++) {
    scr_close(filenames[i], fds[i]);
  }

  /* free the buffers */
  scr_free(&filesizes);
  /* in this case, we don't free each name, since we copied the pointer to the string in the filemap */
  scr_free(&filenames);
  scr_free(&fds);
  scr_align_free(&send_buf);
  scr_align_free(&recv_buf);

  /* TODO: need to check for errors */
  /* write meta file for xor chunk */
  unsigned long my_chunk_file_size = scr_file_size(my_chunk_file);
  scr_meta* meta = scr_meta_new();
  scr_meta_set_filename(meta, my_chunk_file);
  scr_meta_set_filetype(meta, SCR_META_FILE_XOR);
  scr_meta_set_filesize(meta, my_chunk_file_size);
  scr_meta_set_complete(meta, 1);
  /* TODODSET: move the ranks field elsewhere, for now it's needed by scr_index.c */
  scr_meta_set_ranks(meta, scr_ranks_world);
  scr_filemap_set_meta(map, id, scr_my_rank_world, my_chunk_file, meta);
  scr_filemap_write(scr_map_file, map);
  scr_meta_delete(&meta);

  /* if crc_on_copy is set, compute and store CRC32 value for chunk file */
  if (scr_crc_on_copy) {
    if (scr_compute_crc(map, id, scr_my_rank_world, my_chunk_file) != SCR_SUCCESS) {
      rc = SCR_FAILURE;
    }
    /* TODO: would be nice to save this CRC in our partner's XOR file so we can check correctness on a rebuild */
  }

  return rc;
}