/*
 * Computes the CRC32 of a file and reconciles it with the CRC32 recorded in
 * the file's meta data in the filemap.
 *
 *   map  - filemap holding the meta data for the file
 *   id   - dataset id the file is registered under
 *   rank - rank the file is registered under
 *   file - name of the file to checksum
 *
 * If the meta data already carries a CRC32, the computed value is compared
 * against it and SCR_FAILURE is returned on a mismatch.  If no CRC32 is
 * recorded yet, the computed value is stored in the meta data and written
 * back to the filemap.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if the CRC32 could not be computed,
 * the meta data could not be read, or the values differ.
 */
static int scr_compute_crc(scr_filemap* map, int id, int rank, const char* file)
{
  /* compute crc for the file */
  uLong crc_file;
  if (scr_crc32(file, &crc_file) != SCR_SUCCESS) {
    scr_err("Failed to compute crc for file %s @ %s:%d",
      file, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* allocate a new meta data object */
  scr_meta* meta = scr_meta_new();
  if (meta == NULL) {
    scr_abort(-1, "Failed to allocate meta data object @ %s:%d",
      __FILE__, __LINE__
    );
  }

  /* read meta data from filemap */
  if (scr_filemap_get_meta(map, id, rank, file, meta) != SCR_SUCCESS) {
    /* release the meta data object on this early exit so it is not leaked */
    scr_meta_delete(&meta);
    return SCR_FAILURE;
  }

  int rc = SCR_SUCCESS;

  /* read crc value from meta data */
  uLong crc_meta;
  if (scr_meta_get_crc32(meta, &crc_meta) == SCR_SUCCESS) {
    /* a crc is already recorded: check that the values are the same */
    if (crc_file != crc_meta) {
      rc = SCR_FAILURE;
    }
  } else {
    /* no crc recorded yet: record the one we just computed in the filemap */
    scr_meta_set_crc32(meta, crc_file);
    scr_filemap_set_meta(map, id, rank, file, meta);
  }

  /* free our meta data object */
  scr_meta_delete(&meta);

  return rc;
}
/*
 * Exchanges files with partner ranks by copying: optionally sends file_send
 * to rank_send and optionally receives file_recv from rank_recv, leaving the
 * sent file in place.
 *
 *   have_outgoing - non-zero if there is a file to send
 *   file_send     - name of the file to send
 *   meta_send     - meta data of the file to send; its crc32 is set here if
 *                   scr_crc_on_copy is enabled and it has none yet
 *   rank_send     - rank in comm to send to
 *   crc32_send    - running crc32 of the sent bytes (updated if scr_crc_on_copy)
 *   have_incoming - non-zero if there is a file to receive
 *   file_recv     - name of the file to create (truncated if it exists)
 *   meta_recv     - not used by this function
 *   rank_recv     - rank in comm to receive from
 *   crc32_recv    - running crc32 of the received bytes (updated if scr_crc_on_copy)
 *   comm          - communicator used for the exchange
 *
 * Data moves in messages of up to scr_mpi_buf_size bytes; a message shorter
 * than scr_mpi_buf_size marks the end of a file on both sides.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if reading the outgoing file failed
 * (the partner then receives a truncated file).
 */
static int scr_swap_files_copy(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];
  MPI_Status  status[2];

  /* allocate MPI send buffer */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* open the file to send: read-only mode */
  int fd_send = -1;
  if (have_outgoing) {
    fd_send = scr_open(file_send, O_RDONLY);
    if (fd_send < 0) {
      scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* open the file to recv: truncate, write-only mode */
  int fd_recv = -1;
  if (have_incoming) {
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd_recv < 0) {
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
        file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks */
  int nread, nwrite;
  int sending = 0;
  if (have_outgoing) {
    sending = 1;
  }
  int receiving = 0;
  if (have_incoming) {
    receiving = 1;
  }
  while (sending || receiving) {
    /* if we are still receiving a file, post a receive before we send,
     * so the incoming chunk can be matched while our send is in flight */
    if (receiving) {
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    /* if we are still sending a file, read a chunk, send it, and wait */
    if (sending) {
      nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      if (nread < 0) {
        /* the read failed: send an empty chunk so the receiver terminates,
         * and report the failure to our caller instead of claiming success */
        nread = 0;
        rc = SCR_FAILURE;
      }
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);

      /* a short chunk is the end-of-file marker */
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    /* if we are still receiving a file,
     * wait on our receive to complete and write the data */
    if (receiving) {
      MPI_Wait(&request[0], &status[0]);
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }
      scr_write(file_recv, fd_recv, buf_recv, nwrite);

      /* a short chunk means the sender has no more data */
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close the files */
  if (have_outgoing) {
    scr_close(file_send, fd_send);
  }
  if (have_incoming) {
    scr_close(file_recv, fd_recv);
  }

  /* set crc field on our file if it hasn't been set already */
  if (scr_crc_on_copy && have_outgoing) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      scr_meta_set_crc32(meta_send, *crc32_send);
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
static int scr_swap_files_move( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* since we'll overwrite our send file in place with the recv file, * which may be larger, we need to keep track of how many bytes we've * sent and whether we've sent them all */ unsigned long filesize_send = 0; /* open our file */ int fd = -1; if (have_outgoing) { /* we'll overwrite our send file (or just read it if there is no incoming) */ filesize_send = scr_file_size(file_send); fd = scr_open(file_send, O_RDWR); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } else if (have_incoming) { /* if we're in this branch, then we only have an incoming file, * so we'll write our recv file from scratch */ mode_t mode_file = scr_getmode(1, 1, 0); fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) 
errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } int nread, nwrite; off_t read_pos = 0, write_pos = 0; while (sending || receiving) { if (receiving) { /* prepare a buffer to receive up to scr_mpi_buf_size bytes */ MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } if (sending) { /* compute number of bytes to read */ unsigned long count = filesize_send - read_pos; if (count > scr_mpi_buf_size) { count = scr_mpi_buf_size; } /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */ lseek(fd, read_pos, SEEK_SET); /* seek to read position */ nread = scr_read(file_send, fd, buf_send, count); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } read_pos += (off_t) nread; /* update read pointer */ /* send chunk (if nread is smaller than scr_mpi_buf_size, * then we've read the whole file) */ MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); /* check whether we've read the whole file */ if (filesize_send == read_pos && count < scr_mpi_buf_size) { sending = 0; } } if (receiving) { /* count the number of bytes received */ MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size, * then we've received the whole file) */ lseek(fd, write_pos, SEEK_SET); /* seek to write position */ scr_write(file_recv, fd, buf_recv, nwrite); write_pos += (off_t) nwrite; /* update write pointer */ /* if nwrite is smaller than scr_mpi_buf_size, * then assume we've received the whole file */ if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } 
/* close file and cleanup */ if (have_outgoing && have_incoming) { /* sent and received a file; close it, truncate it to corect size, rename it */ scr_close(file_send, fd); truncate(file_send, write_pos); rename(file_send, file_recv); } else if (have_outgoing) { /* only sent a file; close it, delete it, and remove its completion marker */ scr_close(file_send, fd); scr_file_unlink(file_send); } else if (have_incoming) { /* only received a file; just need to close it */ scr_close(file_recv, fd); } if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { /* we transfer this meta data across below, * so may as well update these fields so we can use them */ scr_meta_set_crc32(meta_send, *crc32_send); /* do not complete file send, we just deleted it above */ } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }
/*
 * Given a filename, its meta data, its list of segments, and a destination
 * directory, copies the file into container files.
 *
 *   file     - source file to read; must be a non-empty string
 *   meta     - meta data of the file; its crc32 is checked against, or set
 *              to, the crc32 of the bytes read when scr_crc_on_flush is set
 *   segments - hash of segments; it is sorted in ascending order here and
 *              each entry names a container, an offset in it, and a length
 *   dst_dir  - directory the container names are relative to
 *
 * The source file is read sequentially: each segment consumes the next
 * segment_length bytes and writes them at container_offset in its container.
 * Containers are opened without truncation because more than one process
 * may be writing to the same file.  Copying stops at the first failure.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE on invalid arguments, any open, seek,
 * read, write, or close failure, or a crc32 mismatch.
 */
static int scr_flush_file_to_containers(
  const char* file,
  scr_meta* meta,
  scr_hash* segments,
  const char* dst_dir)
{
  /* check that we got something for a source file */
  if (file == NULL || strcmp(file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that our other arguments are valid */
  if (meta == NULL || segments == NULL) {
    scr_err("Invalid metadata or segments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open the file for reading */
  int fd_src = scr_open(file, O_RDONLY);
  if (fd_src < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

#if !defined(__APPLE__)
  /* TODO:
  posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
  that tells the kernel that you don't ever need the pages
  from the file again, and it won't bother keeping them in the page cache.
  */
  /* NOTE(review): POSIX describes the advice argument as a single value,
   * not a bit mask; OR-ing two values may be rejected (EINVAL) -- confirm
   * the intended advice */
  posix_fadvise(fd_src, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

  /* get the buffer size we'll use to write to the file */
  unsigned long buf_size = scr_file_buf_size;

  /* allocate buffer to read in file chunks */
  char* buf = (char*) SCR_MALLOC(buf_size);

  /* initialize crc value (only consulted when scr_crc_on_flush is set) */
  uLong crc = 0;
  if (scr_crc_on_flush) {
    crc = crc32(0L, Z_NULL, 0);
  }

  int rc = SCR_SUCCESS;

  /* write out each segment, stopping at the first failure: the source
   * file is consumed sequentially, so later segments would be copied
   * from the wrong position once one segment has failed */
  scr_hash_sort_int(segments, SCR_HASH_SORT_ASCENDING);
  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(segments);
       elem != NULL && rc == SCR_SUCCESS;
       elem = scr_hash_elem_next(elem))
  {
    /* get the container info for this segment */
    scr_hash* hash = scr_hash_elem_hash(elem);

    /* get the offset into the container and the length of the segment (both in bytes) */
    char* container_name;
    unsigned long container_offset, segment_length;
    if (scr_container_get_name_offset_length(hash,
      &container_name, &container_offset, &segment_length) != SCR_SUCCESS)
    {
      scr_err("Failed to get segment offset and length @ %s:%d",
        __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

    /* build full name to destination file */
    scr_path* dst_path = scr_path_from_str(dst_dir);
    scr_path_append_str(dst_path, container_name);
    scr_path_reduce(dst_path);
    char* dst_file = scr_path_strdup(dst_path);

    /* open container file for writing -- we don't truncate here because more than one
     * process may be writing to the same file */
    int fd_container = scr_open(dst_file, O_WRONLY);
    if (fd_container < 0) {
      scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
        dst_file, errno, strerror(errno), __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
#if !defined(__APPLE__)
      /* NOTE(review): same combined-advice concern as for fd_src above */
      posix_fadvise(fd_container, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

      /* seek to offset within container */
      off_t pos = (off_t) container_offset;
      if (lseek(fd_container, pos, SEEK_SET) == (off_t)-1) {
        /* our seek failed, return an error */
        scr_err("Failed to seek to byte %lu in %s @ %s:%d",
          (unsigned long) pos, dst_file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      } else {
        /* copy data from file into container in chunks */
        unsigned long remaining = segment_length;
        while (remaining > 0) {
          /* read / write up to buf_size bytes at a time from file */
          unsigned long count = remaining;
          if (count > buf_size) {
            count = buf_size;
          }

          /* attempt to read count bytes from file */
          int nread = scr_read_attempt(file, fd_src, buf, count);

          /* check for a read error first, so the comparison against the
           * unsigned count below never sees a negative value */
          if (nread < 0) {
            rc = SCR_FAILURE;
            break;
          }

          /* if we read some bytes, write them out */
          if (nread > 0) {
            /* optionally compute crc value as we go */
            if (scr_crc_on_flush) {
              crc = crc32(crc, (const Bytef*) buf, (uInt) nread);
            }

            /* write our nread bytes out */
            int nwrite = scr_write_attempt(dst_file, fd_container, buf, nread);

            /* check for a write error or a short write */
            if (nwrite != nread) {
              /* write had a problem, stop copying and return an error */
              rc = SCR_FAILURE;
              break;
            }

            /* subtract the bytes we've processed from the number remaining */
            remaining -= (unsigned long) nread;
          }

          /* assume a short read is an error */
          if ((unsigned long) nread < count) {
            rc = SCR_FAILURE;
            break;
          }
        }
      }

      /* close container (also on the seek / copy failure paths) */
      if (scr_close(dst_file, fd_container) != SCR_SUCCESS) {
        rc = SCR_FAILURE;
      }
    }

    /* free the container file name and path on every path out of this
     * iteration, including the failure paths */
    scr_free(&dst_file);
    scr_path_delete(&dst_path);
  }

  /* close the source file */
  if (scr_close(file, fd_src) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* free buffer */
  scr_free(&buf);

  /* verify / set crc value */
  if (rc == SCR_SUCCESS) {
    uLong crc2;
    if (scr_crc_on_flush) {
      if (scr_meta_get_crc32(meta, &crc2) == SCR_SUCCESS) {
        /* if a crc is already set in the meta data, check that we computed the same value */
        if (crc != crc2) {
          scr_err("CRC32 mismatch detected when flushing file %s @ %s:%d",
            file, __FILE__, __LINE__
          );
          rc = SCR_FAILURE;
        }
      } else {
        /* if there is no crc set, let's set it now */
        scr_meta_set_crc32(meta, crc);
      }
    }
  }

  return rc;
}
/*
 * Flushes the file named in src_file to dst_dir and fills in meta based on
 * the outcome of the flush.
 *
 *   src_file - file to copy
 *   dst_dir  - directory that receives the copy
 *   meta     - meta data of the file; its crc32 is checked or set when
 *              scr_crc_on_flush is enabled, and its complete flag is set
 *              according to whether the flush succeeded
 *
 * Returns SCR_SUCCESS if the copy succeeded and no crc32 mismatch was
 * detected, SCR_FAILURE otherwise.
 */
static int scr_flush_a_file(const char* src_file, const char* dst_dir, scr_meta* meta)
{
  /* derive the destination name from the source name and dst_dir */
  scr_path* target_path = scr_path_from_str(src_file);
  scr_path_basename(target_path);
  scr_path_prepend_str(target_path, dst_dir);
  scr_path_reduce(target_path);
  char* target = scr_path_strdup(target_path);

  /* only ask the copy for a crc32 when crc-on-flush is enabled */
  uLong copy_crc;
  uLong* crc_out = scr_crc_on_flush ? &copy_crc : NULL;

  /* copy the file */
  int copy_rc = scr_file_copy(src_file, target, scr_file_buf_size, crc_out);
  int result = (copy_rc == SCR_SUCCESS) ? SCR_SUCCESS : SCR_FAILURE;
  scr_dbg(2, "scr_flush_a_file: Read and copied %s to %s with success code %d @ %s:%d",
    src_file, target, copy_rc, __FILE__, __LINE__
  );

  /* the computed crc32 is only meaningful if one was requested and the
   * copy itself succeeded */
  int have_crc = (crc_out != NULL) && (copy_rc == SCR_SUCCESS);
  if (have_crc) {
    uLong stored_crc;
    if (scr_meta_get_crc32(meta, &stored_crc) != SCR_SUCCESS) {
      /* the meta data had no crc32 yet, so record the one just computed */
      scr_meta_set_crc32(meta, copy_crc);
    } else if (copy_crc != stored_crc) {
      /* crc32 mismatch detected during the copy: mark the file invalid */
      /* TODO: unlink the copied file */
      scr_meta_set_complete(meta, 0);
      result = SCR_FAILURE;
      scr_err("scr_flush_a_file: CRC32 mismatch detected when flushing file %s to %s @ %s:%d",
        src_file, target, __FILE__, __LINE__
      );
      /* TODO: would be good to log this, but right now only rank 0 can write log entries */
    }
  }

  /* TODO: check that written filesize matches expected filesize */

  /* record the final verdict in the complete field (the meta file is not
   * updated here, since perhaps the file in cache is ok and only the
   * flush failed) */
  scr_meta_set_complete(meta, (result == SCR_SUCCESS));

  /* release the destination string and path */
  scr_free(&target);
  scr_path_delete(&target_path);

  return result;
}
/*
 * Fetches the files listed in file_list into the cache directory dir and
 * records them in the filemap.
 *
 *   file_list - hash holding the dataset (SCR_KEY_DATASET), the files to
 *               fetch (SCR_KEY_FILE), and, for container fetches, the
 *               source path (SCR_KEY_PATH)
 *   dir       - cache directory the files are fetched into
 *   map       - filemap to update; it is written to scr_map_file before
 *               each file is fetched and once more at the end
 *
 * Files carrying SCR_SUMMARY_6_KEY_NOFETCH are skipped.  For every other
 * file, meta data is built from the summary entry, the file is fetched
 * (from containers if segments are defined, otherwise as a native file),
 * and the meta data is stored in the filemap; a file that could not be
 * fetched is marked incomplete.  Finally the expected number of files for
 * this rank is recorded.
 *
 * Returns SCR_SUCCESS if every file was fetched, SCR_FAILURE otherwise.
 */
static int scr_fetch_files_list(
  const scr_hash* file_list,
  const char* dir,
  scr_filemap* map)
{
  /* assume we'll succeed in fetching our files */
  int rc = SCR_SUCCESS;

  /* assume we don't have any files to fetch */
  int my_num_files = 0;

  /* get dataset id */
  int id;
  scr_dataset* dataset = scr_hash_get(file_list, SCR_KEY_DATASET);
  scr_dataset_get_id(dataset, &id);

  /* now iterate through the file list and fetch each file */
  scr_hash_elem* file_elem = NULL;
  scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);
  for (file_elem = scr_hash_elem_first(files);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    char* file = scr_hash_elem_key(file_elem);

    /* get a pointer to the hash for this file */
    scr_hash* hash = scr_hash_elem_hash(file_elem);

    /* check whether we are supposed to fetch this file */
    /* TODO: this is a hacky way to avoid reading a redundancy file
     * back in under the assumption that it's an original file, which
     * breaks our redundancy computation due to a name conflict on
     * the file names */
    scr_hash_elem* no_fetch_hash = scr_hash_elem_get(hash, SCR_SUMMARY_6_KEY_NOFETCH);
    if (no_fetch_hash != NULL) {
      continue;
    }

    /* increment our file count */
    my_num_files++;

    /* build the destination file name */
    scr_path* path_newfile = scr_path_from_str(file);
    scr_path_basename(path_newfile);
    scr_path_prepend_str(path_newfile, dir);
    char* newfile = scr_path_strdup(path_newfile);

    /* add the file to our filemap and write it to disk before creating
     * the file, this way we have a record that it may exist before we
     * actually start to fetch it */
    scr_filemap_add_file(map, id, scr_my_rank_world, newfile);
    scr_filemap_write(scr_map_file, map);

    /* get the file size */
    unsigned long filesize = 0;
    if (scr_hash_util_get_unsigned_long(hash, SCR_KEY_SIZE, &filesize) != SCR_SUCCESS) {
      scr_err("Failed to read file size from summary data @ %s:%d",
        __FILE__, __LINE__
      );
      rc = SCR_FAILURE;

      /* free path and string */
      scr_free(&newfile);
      scr_path_delete(&path_newfile);

      break;
    }

    /* check for a complete flag */
    /* NOTE(review): this value is read but never used; the meta data
     * below is always initialized as complete -- confirm whether the
     * summary's flag was meant to be passed to scr_meta_set_complete */
    int complete = 1;
    if (scr_hash_util_get_int(hash, SCR_KEY_COMPLETE, &complete) != SCR_SUCCESS) {
      /* in summary file, the absence of a complete flag on a file
       * implies the file is complete */
      complete = 1;
    }

    /* create a new meta data object for this file */
    scr_meta* meta = scr_meta_new();

    /* set the meta data */
    scr_meta_set_filename(meta, newfile);
    scr_meta_set_filetype(meta, SCR_META_FILE_USER);
    scr_meta_set_filesize(meta, filesize);
    scr_meta_set_complete(meta, 1);
    /* TODODSET: move the ranks field elsewhere, for now it's needed
     * by scr_index.c */
    scr_meta_set_ranks(meta, scr_ranks_world);

    /* get the crc, if set, and add it to the meta data */
    uLong crc;
    if (scr_hash_util_get_crc32(hash, SCR_KEY_CRC, &crc) == SCR_SUCCESS) {
      scr_meta_set_crc32(meta, crc);
    }

    /* fetch file from containers if they are defined, otherwise fetch
     * the native file */
    scr_hash* segments = scr_hash_get(hash, SCR_SUMMARY_6_KEY_SEGMENT);
    if (segments != NULL) {
      /* get source path */
      char* from_dir;
      if (scr_hash_util_get_str(file_list, SCR_KEY_PATH, &from_dir) == SCR_SUCCESS) {
        /* fetch file from containers */
        if (scr_fetch_file_from_containers(newfile, meta, segments, from_dir) != SCR_SUCCESS) {
          /* failed to fetch file, mark it as incomplete */
          scr_meta_set_complete(meta, 0);
          rc = SCR_FAILURE;
        }
      } else {
        /* failed to find base dataset directory in file list; nothing
         * was fetched, so mark the file as incomplete just as the
         * native-file branch below does, rather than recording a
         * missing file as complete in the filemap */
        scr_meta_set_complete(meta, 0);
        rc = SCR_FAILURE;
      }
    } else {
      /* fetch native file, lookup directory for this file */
      char* from_dir;
      if (scr_hash_util_get_str(hash, SCR_KEY_PATH, &from_dir) == SCR_SUCCESS) {
        if (scr_fetch_file(newfile, from_dir, meta) != SCR_SUCCESS) {
          /* failed to fetch file, mark it as incomplete */
          scr_meta_set_complete(meta, 0);
          rc = SCR_FAILURE;
        }
      } else {
        /* failed to read source directory, mark file as incomplete */
        scr_meta_set_complete(meta, 0);
        rc = SCR_FAILURE;
      }
    }

    /* TODODSET: want to write out filemap before we start to fetch
     * each file? */

    /* record the file's meta data (complete or not) in the filemap */
    scr_filemap_set_meta(map, id, scr_my_rank_world, newfile, meta);

    /* free the meta data object */
    scr_meta_delete(&meta);

    /* free path and string */
    scr_free(&newfile);
    scr_path_delete(&path_newfile);
  }

  /* set the expected number of files for this dataset */
  scr_filemap_set_expected_files(map, id, scr_my_rank_world, my_num_files);
  scr_filemap_write(scr_map_file, map);

  return rc;
}