/* Opens the named file, reads it in 1 MiB chunks, and computes the crc32
 * of its contents.
 *
 *   filename - path of the file to checksum
 *   crc      - output; receives the crc32 value, must not be NULL
 *
 * Returns SCR_SUCCESS when the whole file was read.  Returns SCR_FAILURE
 * if crc is NULL, the file cannot be opened, the read buffer cannot be
 * allocated, or a read reports an error; in the latter cases *crc holds
 * only a partial value. */
int scr_crc32(const char* filename, uLong* crc)
{
  /* check that we got a variable to write our answer to */
  if (crc == NULL) {
    return SCR_FAILURE;
  }

  /* initialize our crc value */
  *crc = crc32(0L, Z_NULL, 0);

  /* open the file for reading */
  int fd = scr_open(filename, O_RDONLY);
  if (fd < 0) {
    scr_dbg(1, "Failed to open file to compute crc: %s errno=%d @ file %s:%d",
      filename, errno, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* allocate the read buffer on the heap; a 1 MiB automatic array
   * can overflow a small process or thread stack */
  unsigned long buffer_size = 1024*1024;
  char* buf = (char*) malloc(buffer_size);
  if (buf == NULL) {
    scr_dbg(1, "Failed to allocate %lu bytes to compute crc: %s @ file %s:%d",
      buffer_size, filename, __FILE__, __LINE__
    );
    scr_close(filename, fd);
    return SCR_FAILURE;
  }

  /* read the file data in and compute its crc32;
   * anything other than a completely filled buffer ends the loop */
  int nread = 0;
  do {
    nread = scr_read(filename, fd, buf, buffer_size);
    if (nread > 0) {
      *crc = crc32(*crc, (const Bytef*) buf, (uInt) nread);
    }
  } while (nread > 0 && (unsigned long) nread == buffer_size);

  /* done with the buffer regardless of how the loop ended */
  free(buf);
  buf = NULL;

  /* if we got a read error, report it, release the descriptor, and bail out */
  if (nread < 0) {
    scr_dbg(1, "Error while reading file to compute crc: %s @ file %s:%d",
      filename, __FILE__, __LINE__
    );
    scr_close(filename, fd);
    return SCR_FAILURE;
  }

  /* close the file */
  scr_close(filename, fd);

  return SCR_SUCCESS;
}
/* Opens the specified file and acquires an exclusive (write) lock on it
 * before returning the file descriptor.
 *
 *   file  - path of the file to open
 *   flags - flags passed through to scr_open
 *   mode  - mode passed through to scr_open
 *
 * Returns the open, locked file descriptor on success.  Returns a negative
 * value if the file cannot be opened or the lock cannot be acquired, so
 * callers can detect every failure with a single (fd < 0) test. */
int scr_open_with_lock(const char* file, int flags, mode_t mode)
{
  /* open the file */
  int fd = scr_open(file, flags, mode);
  if (fd < 0) {
    scr_err("Opening file for write: scr_open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
    return fd;
  }

  /* acquire an exclusive file lock */
  int ret = scr_file_lock_write(file, fd);
  if (ret != SCR_SUCCESS) {
    /* do not hand the lock status back as if it were a descriptor:
     * a non-negative status code would look like a valid fd to the caller */
    close(fd);
    return -1;
  }

  /* return the opened file descriptor */
  return fd;
}
/* Exchanges files with partner ranks while leaving the local send file in
 * place: streams file_send to rank_send and, at the same time, writes the
 * data arriving from rank_recv into file_recv (created / truncated).
 *
 * Data moves in chunks of up to scr_mpi_buf_size bytes; a chunk shorter
 * than scr_mpi_buf_size (possibly empty) tells the receiver that the
 * transfer is complete.
 *
 *   have_outgoing - non-zero if this process sends file_send
 *   file_send     - path of the file to send
 *   meta_send     - meta data of the send file; its crc32 is set if unset
 *   rank_send     - rank in comm that receives our file
 *   crc32_send    - running crc32 of the bytes sent (updated if scr_crc_on_copy)
 *   have_incoming - non-zero if this process receives a file
 *   file_recv     - path of the file to write
 *   meta_recv     - not used by this function
 *   rank_recv     - rank in comm that sends us a file
 *   crc32_recv    - running crc32 of the bytes received (updated if scr_crc_on_copy)
 *   comm          - communicator used for the exchange
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if a buffer cannot be allocated or a
 * read from file_send / write to file_recv reports an error. */
static int scr_swap_files_copy(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];
  MPI_Status  status[2];

  /* allocate MPI send buffer */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      /* in case scr_abort returns, do not leak the send buffer */
      scr_align_free(&buf_send);
      return SCR_FAILURE;
    }
  }

  /* open the file to send: read-only mode */
  int fd_send = -1;
  if (have_outgoing) {
    fd_send = scr_open(file_send, O_RDONLY);
    if (fd_send < 0) {
      scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* open the file to recv: truncate, write-only mode */
  int fd_recv = -1;
  if (have_incoming) {
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd_recv < 0) {
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
        file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks */
  int nread, nwrite;
  int sending   = have_outgoing ? 1 : 0;
  int receiving = have_incoming ? 1 : 0;
  while (sending || receiving) {
    /* if we are still receiving a file, post a receive */
    if (receiving) {
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    /* if we are still sending a file, read a chunk, send it, and wait */
    if (sending) {
      nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      if (nread < 0) {
        /* remember the read error, but still send an empty chunk so the
         * receiving rank sees a short message and stops waiting on us */
        rc = SCR_FAILURE;
        nread = 0;
      }
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);

      /* a short chunk is the last chunk */
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    /* if we are still receiving a file,
     * wait on our receive to complete and write the data */
    if (receiving) {
      MPI_Wait(&request[0], &status[0]);
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }

      /* NOTE(review): assumes scr_write, like scr_read, reports an error
       * with a negative return value -- confirm against its definition */
      if (scr_write(file_recv, fd_recv, buf_recv, nwrite) < 0) {
        rc = SCR_FAILURE;
      }

      /* a short chunk is the last chunk */
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close the files */
  if (have_outgoing) {
    scr_close(file_send, fd_send);
  }
  if (have_incoming) {
    scr_close(file_recv, fd_recv);
  }

  /* set crc field on our file if it hasn't been set already */
  if (scr_crc_on_copy && have_outgoing) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      scr_meta_set_crc32(meta_send, *crc32_send);
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
static int scr_swap_files_move( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* since we'll overwrite our send file in place with the recv file, * which may be larger, we need to keep track of how many bytes we've * sent and whether we've sent them all */ unsigned long filesize_send = 0; /* open our file */ int fd = -1; if (have_outgoing) { /* we'll overwrite our send file (or just read it if there is no incoming) */ filesize_send = scr_file_size(file_send); fd = scr_open(file_send, O_RDWR); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } else if (have_incoming) { /* if we're in this branch, then we only have an incoming file, * so we'll write our recv file from scratch */ mode_t mode_file = scr_getmode(1, 1, 0); fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) 
errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } int nread, nwrite; off_t read_pos = 0, write_pos = 0; while (sending || receiving) { if (receiving) { /* prepare a buffer to receive up to scr_mpi_buf_size bytes */ MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } if (sending) { /* compute number of bytes to read */ unsigned long count = filesize_send - read_pos; if (count > scr_mpi_buf_size) { count = scr_mpi_buf_size; } /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */ lseek(fd, read_pos, SEEK_SET); /* seek to read position */ nread = scr_read(file_send, fd, buf_send, count); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } read_pos += (off_t) nread; /* update read pointer */ /* send chunk (if nread is smaller than scr_mpi_buf_size, * then we've read the whole file) */ MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); /* check whether we've read the whole file */ if (filesize_send == read_pos && count < scr_mpi_buf_size) { sending = 0; } } if (receiving) { /* count the number of bytes received */ MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size, * then we've received the whole file) */ lseek(fd, write_pos, SEEK_SET); /* seek to write position */ scr_write(file_recv, fd, buf_recv, nwrite); write_pos += (off_t) nwrite; /* update write pointer */ /* if nwrite is smaller than scr_mpi_buf_size, * then assume we've received the whole file */ if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } 
/* close file and cleanup */ if (have_outgoing && have_incoming) { /* sent and received a file; close it, truncate it to corect size, rename it */ scr_close(file_send, fd); truncate(file_send, write_pos); rename(file_send, file_recv); } else if (have_outgoing) { /* only sent a file; close it, delete it, and remove its completion marker */ scr_close(file_send, fd); scr_file_unlink(file_send); } else if (have_incoming) { /* only received a file; just need to close it */ scr_close(file_recv, fd); } if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { /* we transfer this meta data across below, * so may as well update these fields so we can use them */ scr_meta_set_crc32(meta_send, *crc32_send); /* do not complete file send, we just deleted it above */ } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }
/* Given a filename, its meta data, its list of segments, and the directory
 * holding the destination containers, copies the file into the container
 * files segment by segment.
 *
 *   file     - path of the source file; must be a non-empty string
 *   meta     - meta data of the source file; when scr_crc_on_flush is set,
 *              its crc32 is verified against the data read, or set if absent
 *   segments - hash of segments; sorted here in ascending order, each entry
 *              naming a container, an offset into it, and a length in bytes
 *   dst_dir  - directory prepended to each container name
 *
 * Segments are read sequentially from the source file.  Copying stops at
 * the first failure.
 *
 * Returns SCR_SUCCESS if every segment was copied (and the crc32, if
 * checked, matched), SCR_FAILURE otherwise. */
static int scr_flush_file_to_containers(
  const char* file, scr_meta* meta, scr_hash* segments, const char* dst_dir)
{
  /* check that we got something for a source file */
  if (file == NULL || strcmp(file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that our other arguments are valid */
  if (meta == NULL || segments == NULL) {
    scr_err("Invalid metadata or segments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open the file for reading */
  int fd_src = scr_open(file, O_RDONLY);
  if (fd_src < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

#if !defined(__APPLE__)
  /* TODO:
  posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
  that tells the kernel that you don't ever need the pages
  from the file again, and it won't bother keeping them in the page cache.
  */
  /* NOTE(review): POSIX documents the advice argument as a single value,
   * not a bit mask, so OR-ing two values may be rejected -- confirm */
  posix_fadvise(fd_src, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

  /* get the buffer size we'll use to write to the file */
  unsigned long buf_size = scr_file_buf_size;

  /* allocate buffer to read in file chunks */
  char* buf = (char*) SCR_MALLOC(buf_size);

  /* initialize crc value (only updated and used when scr_crc_on_flush is set) */
  uLong crc = crc32(0L, Z_NULL, 0);

  int rc = SCR_SUCCESS;

  /* write out each segment, stopping at the first failure */
  scr_hash_sort_int(segments, SCR_HASH_SORT_ASCENDING);
  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(segments);
       elem != NULL && rc == SCR_SUCCESS;
       elem = scr_hash_elem_next(elem))
  {
    /* get the container info for this segment */
    scr_hash* hash = scr_hash_elem_hash(elem);

    /* get the offset into the container and the length of the segment (both in bytes) */
    char* container_name;
    unsigned long container_offset, segment_length;
    if (scr_container_get_name_offset_length(hash,
      &container_name, &container_offset, &segment_length) != SCR_SUCCESS)
    {
      scr_err("Failed to get segment offset and length @ %s:%d",
        __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

    /* build full name to destination file */
    scr_path* dst_path = scr_path_from_str(dst_dir);
    scr_path_append_str(dst_path, container_name);
    scr_path_reduce(dst_path);
    char* dst_file = scr_path_strdup(dst_path);

    /* open container file for writing -- we don't truncate here because more than one
     * process may be writing to the same file */
    int fd_container = scr_open(dst_file, O_WRONLY);
    if (fd_container < 0) {
      scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
        dst_file, errno, strerror(errno), __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
#if !defined(__APPLE__)
      /* TODO:
      posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
      that tells the kernel that you don't ever need the pages
      from the file again, and it won't bother keeping them in the page cache.
      */
      posix_fadvise(fd_container, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

      /* seek to offset within container */
      off_t pos = (off_t) container_offset;
      if (lseek(fd_container, pos, SEEK_SET) == (off_t)-1) {
        /* our seek failed, return an error
         * (print the unsigned long offset so the value matches %lu) */
        scr_err("Failed to seek to byte %lu in %s @ %s:%d",
          container_offset, dst_file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      } else {
        /* copy data from file into container in chunks */
        unsigned long remaining = segment_length;
        while (remaining > 0) {
          /* read / write up to buf_size bytes at a time from file */
          unsigned long count = remaining;
          if (count > buf_size) {
            count = buf_size;
          }

          /* attempt to read count bytes from file; a read error or
           * hitting end-of-file before the segment is complete is fatal */
          int nread = scr_read_attempt(file, fd_src, buf, count);
          if (nread <= 0) {
            rc = SCR_FAILURE;
            break;
          }

          /* optionally compute crc value as we go */
          if (scr_crc_on_flush) {
            crc = crc32(crc, (const Bytef*) buf, (uInt) nread);
          }

          /* write our nread bytes out */
          int nwrite = scr_write_attempt(dst_file, fd_container, buf, nread);

          /* check for a write error or a short write */
          if (nwrite != nread) {
            /* write had a problem, stop copying and return an error */
            rc = SCR_FAILURE;
            break;
          }

          /* subtract the bytes we've processed from the number remaining */
          remaining -= (unsigned long) nread;

          /* assume a short read is an error */
          if ((unsigned long) nread < count) {
            rc = SCR_FAILURE;
            break;
          }
        }
      }

      /* close container, on success and on failure alike */
      if (scr_close(dst_file, fd_container) != SCR_SUCCESS) {
        rc = SCR_FAILURE;
      }
    }

    /* free the container file name and path on every path out of this iteration */
    scr_free(&dst_file);
    scr_path_delete(&dst_path);
  }

  /* close the source file */
  if (scr_close(file, fd_src) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* free buffer */
  scr_free(&buf);

  /* verify / set crc value */
  if (rc == SCR_SUCCESS && scr_crc_on_flush) {
    uLong crc2;
    if (scr_meta_get_crc32(meta, &crc2) == SCR_SUCCESS) {
      /* if a crc is already set in the meta data, check that we computed the same value */
      if (crc != crc2) {
        scr_err("CRC32 mismatch detected when flushing file %s @ %s:%d",
          file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }
    } else {
      /* if there is no crc set, let's set it now */
      scr_meta_set_crc32(meta, crc);
    }
  }

  return rc;
}
/* Reads in the halt file (which another program may have changed), merges
 * it into data, sets & unsets the fields requested in args, and writes the
 * result back out -- all while holding an exclusive lock on the file.
 *
 *   file - path of the halt file; created if it does not exist
 *   args - which fields to set or unset, and the values to set
 *   data - hash that receives the file contents and the requested changes
 *
 * Returns SCR_SUCCESS if the halt file was rewritten.  Returns SCR_FAILURE
 * if the file cannot be opened, rewound, written, or truncated, and the
 * lock / unlock status if locking or unlocking fails. */
int scr_halt_sync_and_set(const char* file, struct arglist* args, scr_hash* data)
{
  /* set the mode on the file to be readable/writable by all
   * (enables a sysadmin to halt a user's job via scr_halt --all) */
  mode_t old_mode = umask(0000);

  /* TODO: sleep and try the open several times if the first fails */
  /* open the halt file for reading and writing */
  int fd = scr_open(file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
  if (fd < 0) {
    scr_err("Opening file for write: scr_open(%s) errno=%d %m @ %s:%d",
      file, errno, __FILE__, __LINE__
    );
    /* restore the normal file mask */
    umask(old_mode);
    return SCR_FAILURE;
  }

  /* acquire an exclusive file lock before reading */
  int ret = scr_write_lock(file,fd);
  if (ret != SCR_SUCCESS){
    scr_close(file,fd);
    umask(old_mode);
    return ret;
  }

  /* read in the current data from the file */
  scr_hash_read_fd(file, fd, data);

  /* set / unset values in file */
  if (args->set_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
    scr_hash_set_kv(data, SCR_HALT_KEY_EXIT_REASON, args->value_reason);
  } else if (args->unset_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
  }

  if (args->set_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_CHECKPOINTS, args->value_checkpoints);
  } else if (args->unset_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
  }

  if (args->set_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_BEFORE, args->value_before);
  } else if (args->unset_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
  }

  if (args->set_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_AFTER, args->value_after);
  } else if (args->unset_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
  }

  if (args->set_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_SECONDS, args->value_seconds);
  } else if (args->unset_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
  }

  /* rewrite the file from the start; track failures so the caller does
   * not believe the halt file was updated when it was not */
  int rc = SCR_SUCCESS;

  /* wind file pointer back to the start of the file */
  if (lseek(fd, 0, SEEK_SET) == (off_t) -1) {
    /* writing now would land at the wrong offset, so skip the write */
    scr_err("Failed to seek to start of file %s errno=%d @ %s:%d",
      file, errno, __FILE__, __LINE__
    );
    rc = SCR_FAILURE;
  } else {
    /* write our updated data */
    ssize_t bytes_written = scr_hash_write_fd(file, fd, data);
    if (bytes_written < 0) {
      scr_err("Failed to write halt data to file %s @ %s:%d",
        file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else if (ftruncate(fd, (off_t) bytes_written) != 0) {
      /* truncating the file to the correct size failed
       * (the new contents may be smaller than the old ones) */
      scr_err("Failed to truncate file %s errno=%d @ %s:%d",
        file, errno, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }
  }

  /* release the file lock */
  ret = scr_unlock(file, fd);
  if (ret != SCR_SUCCESS){
    scr_close(file,fd);
    umask(old_mode);
    return ret;
  }

  /* close file */
  scr_close(file, fd);

  /* restore the normal file mask */
  umask(old_mode);

  return rc;
}
/* Entry point of the transfer program (usage: scr_transfer <transferfile>).
 *
 * Repeatedly reads the transfer file to learn the requested state and the
 * files to copy.  While in the RUNNING state it copies one chunk at a time
 * from the current source file to the current destination file, records
 * the new position in the transfer file, and sleeps between chunks as
 * dictated by bytes_per_second and percent_runtime.  While STOPPED it only
 * polls the transfer file every scr_transfer_secs seconds.  The loop runs
 * until keep_running becomes false.
 *
 * Returns 0 on normal exit, 1 on a usage error or allocation failure. */
int main (int argc, char *argv[])
{
  /* check that we were given exactly one argument
   * (the transfer file name) */
  if (argc != 2) {
    printf("Usage: scr_transfer <transferfile>\n");
    return 1;
  }

  /* record the name of the transfer file */
  scr_transfer_file = strdup(argv[1]);
  if (scr_transfer_file == NULL) {
    scr_err("scr_transfer: Copying transfer file name @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* initialize our tracking variables */
  read_params();

  /* get file io mode */
  mode_t mode_file = scr_getmode(1, 1, 0);

  /* we cache the opened file descriptors to avoid extra opens,
   * seeks, and closes; the "new" values describe what the transfer
   * file asks for, the "old" values describe what we currently have open */
  int fd_src = -1;
  int fd_dst = -1;

  char* new_file_src = NULL;
  char* old_file_src = NULL;
  char* new_file_dst = NULL;
  char* old_file_dst = NULL;

  off_t new_position = 0;
  off_t old_position = 0;

  /* start in the stopped state */
  state = STOPPED;
  set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 0);

  /* TODO: enable this value to be set from config file */
  /* TODO: page-align this buffer for faster performance */
  /* allocate our file copy buffer */
  size_t bufsize = scr_file_buf_size;
  char* buf = malloc(bufsize);
  if (buf == NULL) {
    scr_err("scr_transfer: Failed to allocate %llu bytes for file copy buffer @ %s:%d",
      (unsigned long long) bufsize, __FILE__, __LINE__
    );
    return 1;
  }

  /* nread holds the size of the last chunk read; the secs_* values
   * account for time spent running versus sleeping */
  int nread = 0;
  double secs_run   = 0.0;
  double secs_slept = 0.0;
  double secs_run_start  = scr_seconds();
  double secs_run_end    = secs_run_start;
  double secs_last_write = secs_run_start;
  scr_hash* hash = scr_hash_new();
  while (keep_running) {
    /* loop here sleeping and checking transfer file periodically
     * until state changes and / or some time elapses */
    /* reset our timer for our last write */
    double secs_remain = scr_transfer_secs;
    while (keep_running && (state == STOPPED || secs_remain > 0.0)) {
      /* remember our current state before reading transfer file */
      int old_state = state;

      /* read the transfer file, which fills in our hash and
       * also updates state and bytes_per_second */
      scr_hash_delete(&hash);
      hash = read_transfer_file();

      /* compute time we should sleep before writing more data based
       * on bandwidth and percent of runtime limits */
      if (state == RUNNING) {
        /* get the current time */
        double secs_now = scr_seconds();

        /* based on the amount we last wrote and our allocated bandwidth,
         * compute time we need to sleep before attempting our next write */
        double secs_remain_bw = 0.0;
        if (nread > 0 && bytes_per_second > 0.0) {
          double secs_to_wait_bw = (double) nread / bytes_per_second;
          double secs_waited_bw = secs_now - secs_last_write;
          secs_remain_bw = secs_to_wait_bw - secs_waited_bw;
        }

        /* based on the percentage of time we are allowed to be running,
         * compute time we need to sleep before attempting our next write */
        double secs_remain_runtime = 0.0;
        if (percent_runtime > 0.0) {
          /* stop the run clock, add to the run time,
           * and restart the run clock */
          secs_run_end = secs_now;
          secs_run += secs_run_end - secs_run_start;
          secs_run_start = secs_run_end;

          /* compute our total time, and the time we need to sleep */
          double secs_total = secs_run + secs_slept;
          secs_remain_runtime = secs_run / percent_runtime - secs_total;
        }

        /* take the maximum of these two values */
        secs_remain = secs_remain_bw;
        if (secs_remain_runtime > secs_remain) {
          secs_remain = secs_remain_runtime;
        }
      }

      /* check for a state transition */
      if (state != old_state) {
        if (state == RUNNING) {
          /* if we switched to RUNNING, kick out without sleeping and
           * reset the total run and sleep times */
          secs_remain = 0.0;
          secs_run    = 0.0;
          secs_slept  = 0.0;
        } else if (state == STOPPED) {
          /* if we switched to STOPPED, close our files if open */
          close_files(new_file_src, &fd_src, new_file_dst, &fd_dst);
          clear_parameters(&new_file_src, &new_file_dst, &new_position);
          clear_parameters(&old_file_src, &old_file_dst, &old_position);

          /* after closing our files, update our state in the transfer file */
          set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 0);
        }
      }

      /* assume we can sleep for the full remainder of the time */
      double secs = secs_remain;

      /* if we're not running, always sleep for the full time */
      if (state != RUNNING) {
        secs = scr_transfer_secs;
      }

      /* set a maximum time to sleep before we read the hash file again
       * (ensures some responsiveness) */
      if (secs > scr_transfer_secs) {
        secs = scr_transfer_secs;
      }

      /* sleep if we need to */
      if (secs > 0.0) {
        /* stop the run clock and add to the total run time */
        secs_run_end = scr_seconds();
        secs_run += secs_run_end - secs_run_start;

        /* sleep */
        usleep((unsigned long) (secs * 1000000.0));
        secs_slept += secs;
        secs_remain -= secs;

        /* restart the run clock */
        secs_run_start = scr_seconds();
      }
    }

    /* write data out */
    if (state == RUNNING) {
      /* look for a new file to transfer */
      off_t filesize = 0;
      find_file(hash, &new_file_src, &new_file_dst, &new_position, &filesize);

      /* if we got a new source file, close the old one (if open),
       * open the new file */
      if (bool_diff_files(new_file_src, old_file_src)) {
        /* close the old descriptor if it's open */
        if (fd_src >= 0) {
          scr_close(old_file_src, fd_src);
          fd_src = -1;
        }

        /* delete the old file name if we have one */
        if (old_file_src != NULL) {
          free(old_file_src);
          old_file_src = NULL;
        }

        /* reset our position counter */
        old_position = 0;

        /* open the file and remember the filename if we have one */
        if (new_file_src != NULL) {
          fd_src = scr_open(new_file_src, O_RDONLY);
          /* TODO: check for errors here */
          old_file_src = strdup(new_file_src);
          /* TODO: check for errors here */
        }
      }

      /* if we got a new destination file, close the old one (if open),
       * open the new file */
      if (bool_diff_files(new_file_dst, old_file_dst)) {
        /* close the old descriptor if it's open */
        if (fd_dst >= 0) {
          scr_close(old_file_dst, fd_dst);
          fd_dst = -1;
        }

        /* delete the old file name if we have one */
        if (old_file_dst != NULL) {
          free(old_file_dst);
          old_file_dst = NULL;
        }

        /* reset our position counter */
        old_position = 0;

        /* open the file and remember the filename if we have one */
        if (new_file_dst != NULL) {
          fd_dst = scr_open(new_file_dst, O_RDWR | O_CREAT, mode_file);
          /* TODO: check for errors here */
          old_file_dst = strdup(new_file_dst);
          /* TODO: check for errors here */
        }
      }

      /* we may have the same file, but perhaps the position changed
       * (may need to seek) */
      if (new_position != old_position) {
        if (fd_src >= 0) {
          lseek(fd_src, new_position, SEEK_SET);
          /* TODO: check for errors here */
        }

        if (fd_dst >= 0) {
          lseek(fd_dst, new_position, SEEK_SET);
          /* TODO: check for errors here */
        }

        /* remember the new position */
        old_position = new_position;
      }

      /* if we have two open files,
       * copy a chunk from source file to destination file */
      nread = 0;
      if (fd_src >= 0 && fd_dst >= 0) {
        /* compute number of bytes to read from file */
        size_t count = (size_t) (filesize - new_position);
        if (count > bufsize) {
          count = bufsize;
        }

        /* read a chunk */
        nread = scr_read(new_file_src, fd_src, buf, count);

        /* if we read data, write it out */
        if (nread > 0) {
          /* record the time of our write */
          secs_last_write = scr_seconds();

          /* write the chunk and force it out with an fsync */
          /* NOTE(review): the results of scr_write and fsync are not
           * checked, so the position below advances even if the write
           * did not succeed */
          scr_write(new_file_dst, fd_dst, buf, nread);
          fsync(fd_dst);

          /* update our position */
          new_position += (off_t) nread;
          old_position = new_position;

          /* record the updated position in the transfer file */
          update_transfer_file(new_file_src, new_file_dst, new_position);
        }

        /* if we've written all of the bytes, close the files */
        if (new_position == filesize) {
          close_files(new_file_src, &fd_src, new_file_dst, &fd_dst);
          clear_parameters(&new_file_src, &new_file_dst, &new_position);
          clear_parameters(&old_file_src, &old_file_dst, &old_position);
        }
      } else {
        /* TODO: we may have an error
         * (failed to open the source or dest file) */
        /* if we found no file to transfer, move to a STOPPED state */
        if (new_file_src == NULL) {
          state = STOPPED;
          set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 1);
        }
      }
    }
  }

  /* NOTE(review): hash and any still-open descriptors / file names are
   * not released here before returning */

  /* free our file copy buffer */
  if (buf != NULL) {
    free(buf);
    buf = NULL;
  }

  /* free the strdup'd transfer file name */
  if (scr_transfer_file != NULL) {
    free(scr_transfer_file);
    scr_transfer_file = NULL;
  }

  return 0;
}
int main(int argc, char* argv[]) { int i, j; int index = 1; /* print usage if not enough arguments were given */ if (argc < 2) { printf("Usage: scr_rebuild_xor <size> <root> <missing_xor_filename> <ordered_remaining_xor_filenames>\n"); return 1; } /* TODO: want to pass this on command line? */ /* get current working directory */ char dsetdir[SCR_MAX_FILENAME]; scr_getcwd(dsetdir, sizeof(dsetdir)); /* create and reduce path for dataset */ scr_path* path_dset = scr_path_from_str(dsetdir); scr_path_reduce(path_dset); /* allocate buffers */ char* buffer_A = malloc(buffer_size * sizeof(char)); char* buffer_B = malloc(buffer_size * sizeof(char)); if (buffer_A == NULL || buffer_B == NULL) { scr_err("Failed to allocate buffer memory @ %s:%d", __FILE__, __LINE__ ); return 1; } /* read in the size of the XOR set */ int xor_set_size = (int) strtol(argv[index++], (char **)NULL, 10); if (xor_set_size <= 0) { scr_err("Invalid XOR set size argument %s @ %s:%d", argv[index-1], __FILE__, __LINE__ ); return 1; } /* allocate memory for data structures based on the XOR set size */ int* num_files = malloc(xor_set_size * sizeof(int)); int* offsets = malloc(xor_set_size * sizeof(int)); char** xor_files = malloc(xor_set_size * sizeof(char*)); int* xor_fds = malloc(xor_set_size * sizeof(int)); scr_hash** xor_headers = malloc(xor_set_size * sizeof(scr_hash*)); if (num_files == NULL || offsets == NULL || xor_files == NULL || xor_fds == NULL || xor_headers == NULL) { scr_err("Failed to allocate buffer memory @ %s:%d", __FILE__, __LINE__ ); return 1; } /* read in the rank of the missing process (the root) */ int root = (int) strtol(argv[index++], (char **)NULL, 10); if (root < 0 || root >= xor_set_size) { scr_err("Invalid root argument %s @ %s:%d", argv[index-1], __FILE__, __LINE__ ); return 1; } /* read in the missing xor filename */ xor_files[0] = strdup(argv[index++]); if (xor_files[0] == NULL) { scr_err("Failed to dup XOR filename @ %s:%d", __FILE__, __LINE__ ); return 1; } /* read in the 
xor filenames (expected to be in order of XOR segment number) */ /* we order ranks so that root is index 0, the rank to the right of root is index 1, and so on */ for (i=0; i < xor_set_size; i++) { xor_headers[i] = scr_hash_new(); /* we'll get the XOR file name for root from the header stored in the XOR file of the partner */ if (i == root) { continue; } /* adjust the index relative to root */ j = i - root; if (j < 0) { j += xor_set_size; } /* copy the XOR file name */ xor_files[j] = strdup(argv[index++]); if (xor_files[j] == NULL) { scr_err("Failed to dup XOR filename @ %s:%d", __FILE__, __LINE__ ); return 1; } } /* open each of the xor files and read in the headers */ for (i=1; i < xor_set_size; i++) { /* open each xor file for reading */ xor_fds[i] = scr_open(xor_files[i], O_RDONLY); if (xor_fds[i] < 0) { scr_err("Opening xor segment file: scr_open(%s) errno=%d %s @ %s:%d", xor_files[i], errno, strerror(errno), __FILE__, __LINE__ ); return 1; } /* read the header from this xor file */ if (scr_hash_read_fd(xor_files[i], xor_fds[i], xor_headers[i]) < 0) { scr_err("Failed to read XOR header from %s @ %s:%d", xor_files[i], __FILE__, __LINE__ ); return 1; } } /* build header for missing XOR file */ int partner_rank = -1; if (xor_set_size >= 2) { scr_hash_merge(xor_headers[0], xor_headers[1]); /* fetch our own file list from rank to our right */ scr_hash* rhs_hash = scr_hash_get(xor_headers[1], SCR_KEY_COPY_XOR_PARTNER); scr_hash* current_hash = scr_hash_new(); scr_hash_merge(current_hash, rhs_hash); scr_hash_set(xor_headers[0], SCR_KEY_COPY_XOR_CURRENT, current_hash); /* we are the partner to the rank to our left */ scr_hash* lhs_hash = scr_hash_get(xor_headers[xor_set_size-1], SCR_KEY_COPY_XOR_CURRENT); scr_hash* partner_hash = scr_hash_new(); scr_hash_merge(partner_hash, lhs_hash); scr_hash_set(xor_headers[0], SCR_KEY_COPY_XOR_PARTNER, partner_hash); /* get global rank of partner */ if (scr_hash_util_get_int(lhs_hash, SCR_KEY_COPY_XOR_RANK, &partner_rank) != 
SCR_SUCCESS) { scr_err("Failed to read partner rank from XOR file header in %s @ %s:%d", xor_files[xor_set_size-1], __FILE__, __LINE__ ); return 1; } } /* get a pointer to the current hash for the missing rank */ scr_hash* missing_current_hash = scr_hash_get(xor_headers[0], SCR_KEY_COPY_XOR_CURRENT); /* read the rank */ int my_rank = -1; if (scr_hash_util_get_int(missing_current_hash, SCR_KEY_COPY_XOR_RANK, &my_rank) != SCR_SUCCESS) { scr_err("Failed to read rank from XOR file header in %s @ %s:%d", xor_files[0], __FILE__, __LINE__ ); return 1; } /* get the dataset */ scr_dataset* dataset = scr_hash_get(xor_headers[0], SCR_KEY_COPY_XOR_DATASET); /* read the dataset id */ int dset_id = -1; if (scr_dataset_get_id(dataset, &dset_id) != SCR_SUCCESS) { scr_err("Failed to read dataset id from XOR file header in %s @ %s:%d", xor_files[0], __FILE__, __LINE__ ); return 1; } /* read the ranks */ int num_ranks = -1; if (scr_hash_util_get_int(xor_headers[0], SCR_KEY_COPY_XOR_RANKS, &num_ranks) != SCR_SUCCESS) { scr_err("Failed to read ranks from XOR file header in %s @ %s:%d", xor_files[0], __FILE__, __LINE__ ); return 1; } /* get name of partner's fmap */ scr_path* path_partner_map = scr_path_from_str(".scr"); scr_path_append_strf(path_partner_map, "fmap.%d.scr", partner_rank); /* extract partner's flush descriptor */ scr_hash* flushdesc = scr_hash_new(); scr_filemap* partner_map = scr_filemap_new(); scr_filemap_read(path_partner_map, partner_map); scr_filemap_get_flushdesc(partner_map, dset_id, partner_rank, flushdesc); scr_filemap_delete(&partner_map); /* delete partner map path */ scr_path_delete(&path_partner_map); /* determine whether we should preserve user directories */ int preserve_dirs = 0; scr_hash_util_get_int(flushdesc, SCR_SCAVENGE_KEY_PRESERVE, &preserve_dirs); /* read the chunk size */ unsigned long chunk_size = 0; if (scr_hash_util_get_unsigned_long(xor_headers[0], SCR_KEY_COPY_XOR_CHUNK, &chunk_size) != SCR_SUCCESS) { scr_err("Failed to read chunk size from 
XOR file header in %s @ %s:%d", xor_files[0], __FILE__, __LINE__ ); return 1; } /* determine number of files each member wrote in XOR set */ for (i=0; i < xor_set_size; i++) { /* record the number of files for this rank */ scr_hash* current_hash = scr_hash_get(xor_headers[i], SCR_KEY_COPY_XOR_CURRENT); if (scr_hash_util_get_int(current_hash, SCR_KEY_COPY_XOR_FILES, &num_files[i]) != SCR_SUCCESS) { scr_err("Failed to read number of files from %s @ %s:%d", xor_files[i], __FILE__, __LINE__ ); return 1; } } /* count the total number of files and set the offsets array */ int total_num_files = 0; for (i=0; i < xor_set_size; i++) { offsets[i] = total_num_files; total_num_files += num_files[i]; } /* allocate space for a file descriptor, file name pointer, and filesize for each user file */ int* user_fds = (int*) malloc(total_num_files * sizeof(int)); char** user_files = (char**) malloc(total_num_files * sizeof(char*)); char** user_rel_files = (char**) malloc(total_num_files * sizeof(char*)); unsigned long* user_filesizes = (unsigned long*) malloc(total_num_files * sizeof(unsigned long)); if (user_fds == NULL || user_files == NULL || user_rel_files == NULL || user_filesizes == NULL) { scr_err("Failed to allocate buffer memory @ %s:%d", __FILE__, __LINE__ ); return 1; } /* get file name, file size, and open each of the user files that we have */ for (i=0; i < xor_set_size; i++) { scr_hash* current_hash = scr_hash_get(xor_headers[i], SCR_KEY_COPY_XOR_CURRENT); /* for each file belonging to this rank, get filename, filesize, and open file */ for (j=0; j < num_files[i]; j++) { int offset = offsets[i] + j; /* get the meta data for this file */ scr_meta* meta = scr_hash_get_kv_int(current_hash, SCR_KEY_COPY_XOR_FILE, j); if (meta == NULL) { scr_err("Failed to read meta data for file %d in %s @ %s:%d", j, xor_files[i], __FILE__, __LINE__ ); return 1; } /* record the filesize of this file */ if (scr_meta_get_filesize(meta, &user_filesizes[offset]) != SCR_SUCCESS) { scr_err("Failed 
to read filesize field for file %d in %s @ %s:%d", j, xor_files[i], __FILE__, __LINE__ ); return 1; } /* get filename */ char* origname; if (scr_meta_get_origname(meta, &origname) != SCR_SUCCESS) { scr_err("Failed to read original name for file %d in %s @ %s:%d", j, xor_files[i], __FILE__, __LINE__ ); return 1; } /* construct full path to user file */ scr_path* path_user_full = scr_path_from_str(origname); if (preserve_dirs) { /* get original path of file */ char* origpath; if (scr_meta_get_origpath(meta, &origpath) != SCR_SUCCESS) { scr_err("Failed to read original path for file %d in %s @ %s:%d", j, xor_files[i], __FILE__, __LINE__ ); return 1; } /* construct full path to file */ scr_path_prepend_str(path_user_full, origpath); } else { /* construct full path to file */ scr_path_prepend(path_user_full, path_dset); } /* reduce path to user file */ scr_path_reduce(path_user_full); /* make a copy of the full path */ user_files[offset] = scr_path_strdup(path_user_full); /* make a copy of relative path */ scr_path* path_user_rel = scr_path_relative(path_dset, path_user_full); user_rel_files[offset] = scr_path_strdup(path_user_rel); scr_path_delete(&path_user_rel); /* free the full path */ scr_path_delete(&path_user_full); /* open the file */ if (i == 0) { /* create directory for file */ scr_path* user_dir_path = scr_path_from_str(user_files[offset]); scr_path_reduce(user_dir_path); scr_path_dirname(user_dir_path); if (! 
scr_path_is_null(user_dir_path)) { char* user_dir = scr_path_strdup(user_dir_path); mode_t mode_dir = scr_getmode(1, 1, 1); if (scr_mkdir(user_dir, mode_dir) != SCR_SUCCESS) { scr_err("Failed to create directory for user file %s @ %s:%d", user_dir, __FILE__, __LINE__ ); return 1; } scr_free(&user_dir); } scr_path_delete(&user_dir_path); /* open missing file for writing */ mode_t mode_file = scr_getmode(1, 1, 0); user_fds[offset] = scr_open(user_files[offset], O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (user_fds[offset] < 0) { scr_err("Opening user file for writing: scr_open(%s) errno=%d %s @ %s:%d", user_files[offset], errno, strerror(errno), __FILE__, __LINE__ ); return 1; } } else { /* open existing file for reading */ user_fds[offset] = scr_open(user_files[offset], O_RDONLY); if (user_fds[offset] < 0) { scr_err("Opening user file for reading: scr_open(%s) errno=%d %s @ %s:%d", user_files[offset], errno, strerror(errno), __FILE__, __LINE__ ); return 1; } } } } /* finally, open the xor file for the missing rank */ mode_t mode_file = scr_getmode(1, 1, 0); xor_fds[0] = scr_open(xor_files[0], O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (xor_fds[0] < 0) { scr_err("Opening xor file to be reconstructed: scr_open(%s) errno=%d %s @ %s:%d", xor_files[0], errno, strerror(errno), __FILE__, __LINE__ ); return 1; } int rc = 0; /* write the header to the XOR file of the missing rank */ if (scr_hash_write_fd(xor_files[0], xor_fds[0], xor_headers[0]) < 0) { rc = 1; } /* this offset array records the current position we are in the logical file for each rank */ unsigned long* offset = malloc(xor_set_size * sizeof(unsigned long)); if (offset == NULL) { scr_err("Failed to allocate buffer memory @ %s:%d", __FILE__, __LINE__ ); return 1; } for (i=0; i < xor_set_size; i++) { offset[i] = 0; } unsigned long write_pos = 0; int chunk_id; for (chunk_id = 0; chunk_id < xor_set_size && rc == 0; chunk_id++) { size_t nread = 0; while (nread < chunk_size && rc == 0) { /* read upto buffer_size 
bytes at a time */ size_t count = chunk_size - nread; if (count > buffer_size) { count = buffer_size; } /* clear our buffer */ memset(buffer_A, 0, count); /* read a segment from each rank and XOR it into our buffer */ for (i=1; i < xor_set_size; i++) { /* read the next set of bytes for this chunk from my file into send_buf */ if (chunk_id != ((i + root) % xor_set_size)) { /* read chunk from the logical file for this rank */ if (scr_read_pad_n(num_files[i], &user_files[offsets[i]], &user_fds[offsets[i]], buffer_B, count, offset[i], &user_filesizes[offsets[i]]) != SCR_SUCCESS) { /* our read failed, set the return code to an error */ rc = 1; count = 0; } offset[i] += count; } else { /* read chunk from the XOR file for this rank */ if (scr_read_attempt(xor_files[i], xor_fds[i], buffer_B, count) != count) { /* our read failed, set the return code to an error */ rc = 1; count = 0; } } /* TODO: XORing with unsigned long would be faster here (if chunk size is multiple of this size) */ /* merge the blocks via xor operation */ for (j = 0; j < count; j++) { buffer_A[j] ^= buffer_B[j]; } } /* at this point, we have the data from the missing rank, write it out */ if (chunk_id != root) { /* write chunk to logical file for the missing rank */ if (scr_write_pad_n(num_files[0], &user_files[0], &user_fds[0], buffer_A, count, write_pos, &user_filesizes[0]) != SCR_SUCCESS) { /* our write failed, set the return code to an error */ rc = 1; } write_pos += count; } else { /* write chunk to xor file for the missing rank */ if (scr_write_attempt(xor_files[0], xor_fds[0], buffer_A, count) != count) { /* our write failed, set the return code to an error */ rc = 1; } } nread += count; } } /* close each of the user files */ for (i=0; i < total_num_files; i++) { if (scr_close(user_files[i], user_fds[i]) != SCR_SUCCESS) { rc = 1; } } /* close each of the XOR files */ for (i=0; i < xor_set_size; i++) { if (scr_close(xor_files[i], xor_fds[i]) != SCR_SUCCESS) { rc = 1; } } /* if the write failed, 
delete the files we just wrote, and return an error */ if (rc != 0) { for (j=0; j < num_files[0]; j++) { scr_file_unlink(user_files[j]); } scr_file_unlink(xor_files[0]); return 1; } /* check that filesizes are correct */ unsigned long filesize; for (j=0; j < num_files[0]; j++) { filesize = scr_file_size(user_files[j]); if (filesize != user_filesizes[j]) { /* the filesize check failed, so delete the file */ scr_file_unlink(user_files[j]); /* mark the file as incomplete */ scr_meta* meta = scr_hash_get_kv_int(missing_current_hash, SCR_KEY_COPY_XOR_FILE, j); scr_meta_set_complete(meta, 0); rc = 1; } } /* TODO: we didn't record the filesize of the XOR file for the missing rank anywhere */ /* create a filemap for this rank */ scr_filemap* map = scr_filemap_new(); if (map == NULL) { scr_err("Failed to allocate filemap @ %s:%d", __FILE__, __LINE__ ); return 1; } /* record the dataset information in the filemap */ scr_filemap_set_dataset(map, dset_id, my_rank, dataset); /* write meta data for each of the user files and add each one to the filemap */ for (j=0; j < num_files[0]; j++) { /* add user file to filemap and record meta data */ char* user_file_relative = user_rel_files[j]; scr_filemap_add_file(map, dset_id, my_rank, user_file_relative); scr_meta* meta = scr_hash_get_kv_int(missing_current_hash, SCR_KEY_COPY_XOR_FILE, j); scr_filemap_set_meta(map, dset_id, my_rank, user_file_relative, meta); } /* write meta data for xor file and add it to the filemap */ scr_filemap_add_file(map, dset_id, my_rank, xor_files[0]); unsigned long full_chunk_filesize = scr_file_size(xor_files[0]); int missing_complete = 1; scr_meta* meta_chunk = scr_meta_new(); scr_meta_set_filename(meta_chunk, xor_files[0]); scr_meta_set_filetype(meta_chunk, SCR_META_FILE_XOR); scr_meta_set_filesize(meta_chunk, full_chunk_filesize); /* TODO: remove this from meta file, for now it's needed in scr_index.c */ scr_meta_set_ranks(meta_chunk, num_ranks); scr_meta_set_complete(meta_chunk, missing_complete); 
scr_filemap_set_meta(map, dset_id, my_rank, xor_files[0], meta_chunk); /* set expected number of files for the missing rank */ int expected_num_files = scr_filemap_num_files(map, dset_id, my_rank); scr_filemap_set_expected_files(map, dset_id, my_rank, expected_num_files); /* compute, check, and store crc values with files */ for (j=0; j < num_files[0]; j++) { /* compute crc on user file */ char* user_file_relative = user_rel_files[j]; if (scr_compute_crc(map, dset_id, my_rank, user_file_relative) != SCR_SUCCESS) { /* the crc check failed, so delete the file */ scr_file_unlink(user_files[j]); rc = 1; } } if (scr_compute_crc(map, dset_id, my_rank, xor_files[0]) != SCR_SUCCESS) { /* the crc check failed, so delete the file */ scr_file_unlink(xor_files[0]); rc = 1; } /* store flush descriptor */ scr_filemap_set_flushdesc(map, dset_id, my_rank, flushdesc); /* write filemap for this rank */ scr_path* path_map = scr_path_from_str(".scr"); scr_path_append_strf(path_map, "fmap.%d.scr", my_rank); if (scr_filemap_write(path_map, map) != SCR_SUCCESS) { rc = 1; } scr_path_delete(&path_map); /* delete the map */ scr_filemap_delete(&map); scr_meta_delete(&meta_chunk); /* delete the flush/scavenge descriptor */ scr_hash_delete(&flushdesc); scr_free(&offset); for (i=0; i < total_num_files; i++) { scr_free(&user_rel_files[i]); scr_free(&user_files[i]); } scr_free(&user_filesizes); scr_free(&user_rel_files); scr_free(&user_files); scr_free(&user_fds); for (i=0; i < xor_set_size; i++) { scr_hash_delete(&xor_headers[i]); } for (i=0; i < xor_set_size; i++) { scr_free(&xor_files[i]); } scr_free(&xor_headers); scr_free(&xor_fds); scr_free(&xor_files); scr_free(&offsets); scr_free(&num_files); scr_free(&buffer_B); scr_free(&buffer_A); scr_path_delete(&path_dset); return rc; }
/* copy the contents of src_file to dst_file (both given as paths),
 * moving up to buf_size bytes per read/write.
 *
 *   src_file - file to read, must be a non-empty string
 *   dst_file - file to create or truncate and write, must be a non-empty string
 *   buf_size - size in bytes of the temporary copy buffer, must be non-zero
 *   crc      - if not NULL, receives the crc32 of the bytes read from src_file
 *
 * returns SCR_SUCCESS if all data was copied and both files closed
 * cleanly, otherwise returns SCR_FAILURE; dst_file is unlinked if the
 * failure is detected after it was opened and the copy buffer allocated */
int scr_file_copy(
  const char* src_file,
  const char* dst_file,
  unsigned long buf_size,
  uLong* crc)
{
  /* check that we got something for a source file */
  if (src_file == NULL || strcmp(src_file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that we got something for a destination file */
  if (dst_file == NULL || strcmp(dst_file, "") == 0) {
    scr_err("Invalid destination file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* a zero-sized buffer can never make progress: every read would return
   * 0 bytes, which is not "short" relative to a size of 0, so the copy
   * loop below would never detect the end of the file */
  if (buf_size == 0) {
    scr_err("Invalid buffer size of 0 bytes to copy %s @ %s:%d",
      src_file, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open src_file for reading */
  int src_fd = scr_open(src_file, O_RDONLY);
  if (src_fd < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      src_file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open dest_file for writing */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int dst_fd = scr_open(dst_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (dst_fd < 0) {
    scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
      dst_file, errno, strerror(errno), __FILE__, __LINE__
    );
    scr_close(src_file, src_fd);
    return SCR_FAILURE;
  }

  /* tell the kernel we will read the source from front to back;
   * posix_fadvise advice values are distinct constants rather than bit
   * flags, so each piece of advice must be given in its own call;
   * the call is only a hint, so its return code is ignored */
  posix_fadvise(src_fd, 0, 0, POSIX_FADV_SEQUENTIAL);

  /* allocate buffer to read in file chunks */
  char* buf = (char*) malloc(buf_size);
  if (buf == NULL) {
    scr_err("Allocating memory: malloc(%lu) errno=%d %s @ %s:%d",
      buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
    scr_close(dst_file, dst_fd);
    scr_close(src_file, src_fd);
    return SCR_FAILURE;
  }

  /* initialize crc values */
  if (crc != NULL) {
    *crc = crc32(0L, Z_NULL, 0);
  }

  int rc = SCR_SUCCESS;

  /* write chunks */
  int copying = 1;
  while (copying) {
    /* attempt to read buf_size bytes from file */
    ssize_t nread = scr_read_attempt(src_file, src_fd, buf, buf_size);

    /* check for a read error, stop copying and return an error */
    if (nread < 0) {
      rc = SCR_FAILURE;
      break;
    }

    /* if we read some bytes, write them out */
    if (nread > 0) {
      /* optionally compute crc value as we go */
      if (crc != NULL) {
        *crc = crc32(*crc, (const Bytef*) buf, (uInt) nread);
      }

      /* write our nread bytes out */
      ssize_t nwrite = scr_write_attempt(dst_file, dst_fd, buf, nread);

      /* check for a write error or a short write */
      if (nwrite != nread) {
        /* write had a problem, stop copying and return an error */
        copying = 0;
        rc = SCR_FAILURE;
      }
    }

    /* assume a short read means we hit the end of the file */
    if ((unsigned long) nread < buf_size) {
      copying = 0;
    }
  }

  /* hint that we will not need the cached pages of either file again;
   * this is advisory only, so the return codes are ignored */
  posix_fadvise(src_fd, 0, 0, POSIX_FADV_DONTNEED);
  posix_fadvise(dst_fd, 0, 0, POSIX_FADV_DONTNEED);

  /* free buffer */
  scr_free(&buf);

  /* close source and destination files */
  if (scr_close(dst_file, dst_fd) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }
  if (scr_close(src_file, src_fd) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* unlink the file if the copy failed */
  if (rc != SCR_SUCCESS) {
    unlink(dst_file);
  }

  return rc;
}
/* read the summary file and rank2file map stored under summary_dir and
 * fill in file_list with the dataset info, the files assigned to the
 * calling rank, and the path for each of those files;
 * this is a collective call over scr_comm_world and every rank returns
 * the same SCR_SUCCESS / SCR_FAILURE status for the checks that are
 * agreed on via broadcast or scr_alltrue */
static int scr_fetch_summary(
  const char* summary_dir,
  scr_hash* file_list)
{
  /* start out successful, any failure below flips this */
  int rc = SCR_SUCCESS;

  /* rank 0 verifies that the summary directory can be read */
  if (scr_my_rank_world == 0 &&
      scr_file_is_readable(summary_dir) != SCR_SUCCESS)
  {
    scr_err("Failed to access summary directory %s @ %s:%d",
      summary_dir, __FILE__, __LINE__
    );
    rc = SCR_FAILURE;
  }

  /* everyone learns the result from rank 0 and bails out on failure */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    return rc;
  }

  /* remember the dataset directory in the file list */
  scr_hash_util_set_str(file_list, SCR_KEY_PATH, summary_dir);

  /* path to the dataset directory and to its ".scr" meta directory */
  scr_path* dataset_path = scr_path_from_str(summary_dir);
  scr_path* meta_path = scr_path_dup(dataset_path);
  scr_path_append_str(meta_path, ".scr");
  scr_path_reduce(meta_path);

  /* hash that holds the contents of the summary file */
  scr_hash* header = scr_hash_new();

  /* only rank 0 touches the summary file itself */
  if (scr_my_rank_world == 0) {
    /* name of the summary file within the meta directory */
    scr_path* summary_path = scr_path_dup(meta_path);
    scr_path_append_str(summary_path, "summary.scr");
    const char* summary_file = scr_path_strdup(summary_path);

    int summary_fd = scr_open(summary_file, O_RDONLY);
    if (summary_fd < 0) {
      scr_err("Failed to open summary file %s @ %s:%d",
        summary_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* load the summary hash, a negative size signals a read error */
      ssize_t header_size = scr_hash_read_fd(summary_file, summary_fd, header);
      if (header_size < 0) {
        rc = SCR_FAILURE;
      }

      /* TODO: check that the version is correct */

      scr_close(summary_file, summary_fd);
    }

    /* release the file name string and its path */
    scr_free(&summary_file);
    scr_path_delete(&summary_path);
  }

  /* everyone learns whether rank 0 read the summary file */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    goto cleanup;
  }

  /* distribute the summary hash to all ranks */
  scr_hash_bcast(header, 0, scr_comm_world);

  /* copy the dataset description from the summary into the file list */
  scr_hash* dataset_hash = scr_hash_new();
  scr_dataset* dataset = scr_hash_get(header, SCR_SUMMARY_6_KEY_DATASET);
  scr_hash_merge(dataset_hash, dataset);
  scr_hash_set(file_list, SCR_SUMMARY_6_KEY_DATASET, dataset_hash);

  /* path to the top-level rank2file map */
  scr_path* rank2file_path = scr_path_dup(meta_path);
  scr_path_append_str(rank2file_path, "rank2file.scr");

  /* which file and offset (if any) this rank should read,
   * initially rank 0 is the only reader */
  int valid = 0;
  char* file = NULL;
  unsigned long offset = 0;
  if (scr_my_rank_world == 0) {
    valid = 1;
    file = scr_path_strdup(rank2file_path);
    offset = 0;
  }

  /* walk the rank2file map to find out who reads what */
  if (scr_fetch_rank2file_map(dataset_path, 1, &valid, &file, &offset) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* hashes used to hand file data to the ranks that own it */
  scr_hash* send = scr_hash_new();
  scr_hash* recv = scr_hash_new();

  /* readers load their section of the map */
  if (valid) {
    int map_fd = scr_open(file, O_RDONLY);
    if (map_fd < 0) {
      scr_err("Failed to open rank2file map %s @ %s:%d",
        file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* hash to hold what we read from the file */
      scr_hash* save = scr_hash_new();

      /* jump to our section and read it */
      scr_lseek(file, map_fd, offset, SEEK_SET);
      ssize_t readsize = scr_hash_read_fd(file, map_fd, save);
      if (readsize < 0) {
        scr_err("Failed to read rank2file map file %s @ %s:%d",
          file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* the map must have been written for the same number of ranks */
      int ranks = 0;
      scr_hash_util_get_int(save, SCR_SUMMARY_6_KEY_RANKS, &ranks);
      if (ranks != scr_ranks_world) {
        scr_err("Invalid number of ranks in %s, got %d expected %d @ %s:%d",
          file, ranks, scr_ranks_world, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* replace the empty send hash with the per-rank data extracted
       * from the file, then drop what is left of the file hash */
      scr_hash_delete(&send);
      send = scr_hash_extract(save, SCR_SUMMARY_6_KEY_RANK);
      scr_hash_delete(&save);

      scr_close(file, map_fd);
    }

    /* done with the file name */
    scr_free(&file);
  }

  /* stop here unless every rank got through the read step */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto cleanup_hashes;
  }

  /* deliver the data to the ranks it belongs to */
  scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT);

  /* merge everything we received into our file list, the key of each
   * element is the sending rank, which is of no interest here */
  scr_hash_elem* elem = scr_hash_elem_first(recv);
  while (elem != NULL) {
    scr_hash* elem_hash = scr_hash_elem_hash(elem);

    /* each entry is required to carry a file hash */
    scr_hash* file_hash = scr_hash_get(elem_hash, SCR_SUMMARY_6_KEY_FILE);
    if (file_hash == NULL) {
      rc = SCR_FAILURE;
    } else {
      /* TODO: parse summary file format */
      scr_hash_merge(file_list, elem_hash);
    }

    elem = scr_hash_elem_next(elem);
  }

  /* attach a PATH entry to each of our files */
  if (rc == SCR_SUCCESS) {
    scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);
    elem = scr_hash_elem_first(files);
    while (elem != NULL) {
      /* file name as recorded in the file list */
      char* name = scr_hash_elem_key(elem);

      /* dataset directory + file name, minus the last component,
       * gives the directory that holds the file */
      scr_path* path_full = scr_path_dup(dataset_path);
      scr_path_append_str(path_full, name);
      scr_path_dirname(path_full);
      char* path = scr_path_strdup(path_full);

      /* store the directory with the file's entry */
      scr_hash* hash = scr_hash_elem_hash(elem);
      scr_hash_util_set_str(hash, SCR_KEY_PATH, path);

      scr_free(&path);
      scr_path_delete(&path_full);

      elem = scr_hash_elem_next(elem);
    }
  }

  /* agree on the final status, cleanup follows either way */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
  }

cleanup_hashes:
  /* release the exchange hashes */
  scr_hash_delete(&recv);
  scr_hash_delete(&send);

  /* release the path to the rank2file map */
  scr_path_delete(&rank2file_path);

cleanup:
  /* release the summary hash */
  scr_hash_delete(&header);

  /* release the meta and dataset directory paths */
  scr_path_delete(&meta_path);
  scr_path_delete(&dataset_path);

  return rc;
}
static int scr_fetch_rank2file_map( const scr_path* dataset_path, int depth, int* ptr_valid, char** ptr_file, unsigned long* ptr_offset) { int rc = SCR_SUCCESS; /* get local variables so we don't have to deference everything */ int valid = *ptr_valid; char* file = *ptr_file; unsigned long offset = *ptr_offset; /* create a hash to hold section of file */ scr_hash* hash = scr_hash_new(); /* if we can read from file do it */ if (valid) { /* open file if we haven't already */ int fd = scr_open(file, O_RDONLY); if (fd >= 0) { /* read our segment from the file */ scr_lseek(file, fd, offset, SEEK_SET); ssize_t read_rc = scr_hash_read_fd(file, fd, hash); if (read_rc < 0) { scr_err("Failed to read from %s @ %s:%d", file, __FILE__, __LINE__ ); rc = SCR_FAILURE; } /* close the file */ scr_close(file, fd); } else { scr_err("Failed to open rank2file map %s @ %s:%d", file, __FILE__, __LINE__ ); rc = SCR_FAILURE; } } /* check for read errors */ if (! scr_alltrue(rc == SCR_SUCCESS)) { rc = SCR_FAILURE; goto cleanup; } /* create hashes to exchange data */ scr_hash* send = scr_hash_new(); scr_hash* recv = scr_hash_new(); /* copy rank data into send hash */ if (valid) { scr_hash* rank_hash = scr_hash_get(hash, SCR_SUMMARY_6_KEY_RANK); scr_hash_merge(send, rank_hash); } /* exchange hashes */ scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT); /* see if anyone sent us anything */ int newvalid = 0; char* newfile = NULL; unsigned long newoffset = 0; scr_hash_elem* elem = scr_hash_elem_first(recv); if (elem != NULL) { /* got something, so now we'll read in the next step */ newvalid = 1; /* get file name we should read */ scr_hash* elem_hash = scr_hash_elem_hash(elem); char* value; if (scr_hash_util_get_str(elem_hash, SCR_SUMMARY_6_KEY_FILE, &value) == SCR_SUCCESS) { /* return string of full path to file to caller */ scr_path* newpath = scr_path_dup(dataset_path); scr_path_append_str(newpath, value); newfile = scr_path_strdup(newpath); 
scr_path_delete(&newpath); } else { rc = SCR_FAILURE; } /* get offset we should start reading from */ if (scr_hash_util_get_bytecount(elem_hash, SCR_SUMMARY_6_KEY_OFFSET, &newoffset) != SCR_SUCCESS) { rc = SCR_FAILURE; } } /* free the send and receive hashes */ scr_hash_delete(&recv); scr_hash_delete(&send); /* get level id, and broadcast it from rank 0, * which we assume to be a reader in all steps */ int level_id = -1; if (valid) { if (scr_hash_util_get_int(hash, SCR_SUMMARY_6_KEY_LEVEL, &level_id) != SCR_SUCCESS) { rc = SCR_FAILURE; } } MPI_Bcast(&level_id, 1, MPI_INT, 0, scr_comm_world); /* check for read errors */ if (! scr_alltrue(rc == SCR_SUCCESS)) { rc = SCR_FAILURE; goto cleanup; } /* set parameters for output or next iteration, * we already took care of updating ptr_fd earlier */ if (valid) { scr_free(ptr_file); } *ptr_valid = newvalid; *ptr_file = newfile; *ptr_offset = newoffset; /* recurse if we still have levels to read */ if (level_id > 1) { rc = scr_fetch_rank2file_map(dataset_path, depth+1, ptr_valid, ptr_file, ptr_offset); } cleanup: /* free the hash */ scr_hash_delete(&hash); return rc; }
/*
 * Apply XOR redundancy scheme to dataset files.
 *
 * map - filemap listing this rank's files for the dataset; the XOR chunk
 *       file and its meta data are added to it and it is written to
 *       scr_map_file
 * c   - redundancy descriptor; its copy_state is used as the XOR state
 *       (left/right partner ranks and group map)
 * id  - dataset id
 *
 * Each rank reads its files in pieces of at most scr_mpi_buf_size bytes,
 * XORs them with data received from its left-hand partner, and either
 * forwards the result to its right-hand partner or writes it to its own
 * chunk file.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if reading the dataset files or
 * writing/closing the chunk file failed.  Failing to allocate buffers or
 * to open a file calls scr_abort.
 */
static int scr_reddesc_apply_xor(scr_filemap* map, const scr_reddesc* c, int id)
{
  int rc = SCR_SUCCESS;
  int i;

  /* the chunk size is computed by dividing by (ranks - 1), so a set with
   * fewer than two ranks cannot be encoded; bail out before dividing by zero */
  if (c->ranks < 2) {
    scr_abort(-1, "XOR redundancy requires at least 2 ranks per set, but set has %d @ %s:%d",
      c->ranks, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* get pointer to XOR state structure */
  scr_reddesc_xor* state = (scr_reddesc_xor*) c->copy_state;

  /* allocate buffer to read a piece of my file */
  char* send_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (send_buf == NULL) {
    scr_abort(-1, "Allocating memory for send buffer: malloc(%lu) errno=%d %s @ %s:%d",
      (unsigned long) scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* allocate buffer to read a piece of the received chunk file */
  char* recv_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (recv_buf == NULL) {
    scr_abort(-1, "Allocating memory for recv buffer: malloc(%lu) errno=%d %s @ %s:%d",
      (unsigned long) scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* count the number of files I have and allocate space in structures for each of them */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  int* fds                 = (int*)           SCR_MALLOC(num_files * sizeof(int));
  char** filenames         = (char**)         SCR_MALLOC(num_files * sizeof(char*));
  unsigned long* filesizes = (unsigned long*) SCR_MALLOC(num_files * sizeof(unsigned long));

  /* record partner's redundancy descriptor hash in our filemap */
  scr_hash* lhs_desc_hash = scr_hash_new();
  scr_hash* my_desc_hash  = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm);
  scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash);
  scr_hash_delete(&my_desc_hash);
  scr_hash_delete(&lhs_desc_hash);

  /* allocate a new xor file header hash */
  scr_hash* header = scr_hash_new();

  /* record the global ranks of the processes in our xor group */
  scr_hash_merge(header, state->group_map);

  /* record dataset in header */
  scr_hash* dataset = scr_hash_new();
  scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset);
  scr_hash_set(header, SCR_KEY_COPY_XOR_DATASET, dataset);

  /* open each file, get the filesize of each, and read the meta data of each */
  scr_hash* current_files = scr_hash_new();
  int file_count = 0;
  unsigned long my_bytes = 0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    filenames[file_count] = scr_hash_elem_key(file_elem);

    /* get the filesize of this file and add the byte count to the total */
    filesizes[file_count] = scr_file_size(filenames[file_count]);
    my_bytes += filesizes[file_count];

    /* read the meta data for this file and insert it into the current_files hash */
    scr_meta* file_hash = scr_meta_new();
    scr_filemap_get_meta(map, id, scr_my_rank_world, filenames[file_count], file_hash);
    scr_hash_setf(current_files, file_hash, "%d", file_count);

    /* open the file */
    fds[file_count] = scr_open(filenames[file_count], O_RDONLY);
    if (fds[file_count] < 0) {
      /* TODO: try again? */
      scr_abort(-1, "Opening checkpoint file for copying: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        filenames[file_count], errno, strerror(errno), __FILE__, __LINE__
      );
    }

    file_count++;
  }

  /* set total number of files we have, plus our rank */
  scr_hash* current_hash = scr_hash_new();
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_RANK,  scr_my_rank_world);
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_FILES, file_count);
  scr_hash_set(current_hash, SCR_KEY_COPY_XOR_FILE, current_files);

  /* exchange file info with partners and add data to our header */
  scr_hash* partner_hash = scr_hash_new();
  scr_hash_sendrecv(current_hash, state->rhs_rank, partner_hash, state->lhs_rank, c->comm);
  scr_hash_set(header, SCR_KEY_COPY_XOR_CURRENT, current_hash);
  scr_hash_set(header, SCR_KEY_COPY_XOR_PARTNER, partner_hash);

  /* allreduce to get maximum filesize */
  unsigned long max_bytes;
  MPI_Allreduce(&my_bytes, &max_bytes, 1, MPI_UNSIGNED_LONG, MPI_MAX, c->comm);

  /* TODO: use unsigned long integer arithmetic (with proper byte padding)
   * instead of char to speed things up */

  /* compute chunk size according to maximum file length and number of ranks in xor set,
   * if filesize doesn't divide evenly, then add one byte to chunk_size
   * (ranks >= 2 was verified on entry, so the divide is safe) */
  size_t chunk_size = max_bytes / (unsigned long) (c->ranks - 1);
  if ((c->ranks - 1) * chunk_size < max_bytes) {
    chunk_size++;
  }

  /* TODO: need something like this to handle 0-byte files? */
  if (chunk_size == 0) {
    chunk_size++;
  }

  /* record the dataset id and the chunk size in the xor chunk header */
  scr_hash_util_set_bytecount(header, SCR_KEY_COPY_XOR_CHUNK, chunk_size);

  /* set chunk filenames of form: xor.<group_id>_<xor_rank+1>_of_<xor_ranks>.scr,
   * bounding the write to the buffer and refusing to continue with a
   * truncated name */
  char my_chunk_file[SCR_MAX_FILENAME];
  char* dir = scr_cache_dir_hidden_get(c, id);
  int name_len = snprintf(my_chunk_file, sizeof(my_chunk_file),
    "%s/xor.%d_%d_of_%d.scr", dir, c->group_id, c->rank+1, c->ranks
  );
  if (name_len < 0 || (size_t) name_len >= sizeof(my_chunk_file)) {
    scr_abort(-1, "XOR chunk file name could not be formatted or is too long for directory %s @ %s:%d",
      dir, __FILE__, __LINE__
    );
  }
  scr_free(&dir);

  /* record chunk file in filemap before creating it */
  scr_filemap_add_file(map, id, scr_my_rank_world, my_chunk_file);
  scr_filemap_write(scr_map_file, map);

  /* open my chunk file */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int fd_chunk = scr_open(my_chunk_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (fd_chunk < 0) {
    /* TODO: try again? */
    scr_abort(-1, "Opening XOR chunk file for writing: scr_open(%s) errno=%d %s @ %s:%d",
      my_chunk_file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* write out the xor chunk header,
   * NOTE(review): assumes scr_hash_write_fd reports errors with a negative
   * return like scr_hash_read_fd does -- confirm */
  if (scr_hash_write_fd(my_chunk_file, fd_chunk, header) < 0) {
    rc = SCR_FAILURE;
  }
  scr_hash_delete(&header);

  MPI_Request request[2];
  MPI_Status  status[2];

  /* XOR Reduce_scatter */
  size_t nread = 0;
  while (nread < chunk_size) {
    /* process at most one MPI buffer worth of the chunk per pass */
    size_t count = chunk_size - nread;
    if (count > scr_mpi_buf_size) {
      count = scr_mpi_buf_size;
    }

    int chunk_id;
    for (chunk_id = c->ranks-1; chunk_id >= 0; chunk_id--) {
      /* read the next set of bytes for this chunk from my file into send_buf */
      if (chunk_id > 0) {
        int chunk_id_rel = (c->rank + c->ranks + chunk_id) % c->ranks;
        if (chunk_id_rel > c->rank) {
          chunk_id_rel--;
        }
        unsigned long offset = chunk_size * (unsigned long) chunk_id_rel + nread;
        if (scr_read_pad_n(num_files, filenames, fds,
                           send_buf, count, offset, filesizes) != SCR_SUCCESS)
        {
          rc = SCR_FAILURE;
        }
      } else {
        memset(send_buf, 0, count);
      }

      /* TODO: XORing with unsigned long would be faster here (if chunk size is multiple of this size) */
      /* merge the blocks via xor operation,
       * index with size_t to match the type of count */
      if (chunk_id < c->ranks-1) {
        size_t b;
        for (b = 0; b < count; b++) {
          send_buf[b] ^= recv_buf[b];
        }
      }

      if (chunk_id > 0) {
        /* not our chunk to write, forward it on and get the next */
        MPI_Irecv(recv_buf, (int) count, MPI_BYTE, state->lhs_rank, 0, c->comm, &request[0]);
        MPI_Isend(send_buf, (int) count, MPI_BYTE, state->rhs_rank, 0, c->comm, &request[1]);
        MPI_Waitall(2, request, status);
      } else {
        /* write send block to send chunk file */
        if (scr_write_attempt(my_chunk_file, fd_chunk, send_buf, count) != count) {
          rc = SCR_FAILURE;
        }
      }
    }

    nread += count;
  }

  /* close my chunkfile, with fsync */
  if (scr_close(my_chunk_file, fd_chunk) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* close my dataset files */
  for (i = 0; i < num_files; i++) {
    scr_close(filenames[i], fds[i]);
  }

  /* free the buffers */
  scr_free(&filesizes);
  /* in this case, we don't free each name, since we copied the pointer to the string in the filemap */
  scr_free(&filenames);
  scr_free(&fds);
  scr_align_free(&send_buf);
  scr_align_free(&recv_buf);

  /* TODO: need to check for errors */
  /* write meta file for xor chunk */
  unsigned long my_chunk_file_size = scr_file_size(my_chunk_file);
  scr_meta* meta = scr_meta_new();
  scr_meta_set_filename(meta, my_chunk_file);
  scr_meta_set_filetype(meta, SCR_META_FILE_XOR);
  scr_meta_set_filesize(meta, my_chunk_file_size);
  scr_meta_set_complete(meta, 1);
  /* TODODSET: move the ranks field elsewhere, for now it's needed by scr_index.c */
  scr_meta_set_ranks(meta, scr_ranks_world);
  scr_filemap_set_meta(map, id, scr_my_rank_world, my_chunk_file, meta);
  scr_filemap_write(scr_map_file, map);
  scr_meta_delete(&meta);

  /* if crc_on_copy is set, compute and store CRC32 value for chunk file */
  if (scr_crc_on_copy) {
    scr_compute_crc(map, id, scr_my_rank_world, my_chunk_file);
    /* TODO: would be nice to save this CRC in our partner's XOR file so we can check correctness on a rebuild */
  }

  return rc;
}