/* environment specific init/finalize */ int scr_env_init(void) { #ifdef SCR_RESOURCE_MANAGER_PMIX /* init pmix */ int retval = PMIx_Init(&scr_pmix_proc, NULL, 0); if (retval != PMIX_SUCCESS) { scr_err("PMIx_Init failed: rc=%d @ %s:%d", retval, __FILE__, __LINE__ ); return SCR_FAILURE; } scr_dbg(1, "PMIx_Init succeeded @ %s:%d", __FILE__, __LINE__); #endif /* SCR_MACHINE_TYPE == SCR_PMIX */ #ifdef HAVE_LIBCPPR /* attempt to init cppr */ int cppr_ret = cppr_status(); if (cppr_ret != CPPR_SUCCESS) { scr_abort(-1, "libcppr cppr_status() failed: %d '%s' @ %s:%d", cppr_ret, cppr_err_to_str(cppr_ret), __FILE__, __LINE__ ); } scr_dbg(1, "#bold CPPR is present @ %s:%d", __FILE__, __LINE__); #endif /* HAVE_LIBCPPR */ return SCR_SUCCESS; }
/*
 * Lookup name in table, insert it if it doesn't exist, and return its id.
 *
 *   table - name of the table to search / insert into
 *   name  - value of the `name` column to look up
 *   id    - output, set by scr_mysql_read_id when the row is found
 *
 * Returns SCR_SUCCESS if the id could be read (before or after the
 * insert), SCR_FAILURE otherwise.  When built without
 * HAVE_LIBMYSQLCLIENT this is a no-op that returns SCR_SUCCESS and
 * leaves *id untouched.
 */
int scr_mysql_read_write_id(const char* table, const char* name, unsigned long* id)
{
  int rc = SCR_SUCCESS;

#ifdef HAVE_LIBMYSQLCLIENT
  /* if the value is already in the database, return its id */
  rc = scr_mysql_read_id(table, name, id);
  if (rc == SCR_SUCCESS) {
    return SCR_SUCCESS;
  }

  /* didn't find the value in the db, so let's add it */

  /* escape parameter */
  char* qname = scr_mysql_quote_string(name);

  /* check that we got a valid string for our parameter */
  if (qname == NULL) {
    scr_err("Failed to escape and quote one or more arguments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* construct the query */
  char query[1024];
  int n = snprintf(query, sizeof(query),
    "INSERT IGNORE INTO `%s` (`id`,`name`) VALUES (NULL, %s) ;",
    table, qname
  );

  /* free the string as it is now encoded into the query */
  scr_free(&qname);

  /* check that we were able to construct the query ok,
   * snprintf returns a negative value on an output error and the
   * untruncated length otherwise */
  if (n < 0 || (size_t) n >= sizeof(query)) {
    scr_err("Insufficient buffer space (%lu bytes) to build query (%d bytes) @ %s:%d",
      (unsigned long) sizeof(query), n, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* execute the query */
  if (scr_db_debug >= 1) {
    scr_dbg(0, "%s", query);
  }
  if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) {
    scr_err("Insert failed, query = (%s), error = (%s) @ %s:%d",
      query, mysql_error(&scr_mysql), __FILE__, __LINE__
    );
    /* don't return failure, since another process may have just
     * beat us to the punch */
  }

  /* alright, now we should be able to read the id */
  rc = scr_mysql_read_id(table, name, id);
#endif

  return rc;
}
/*
 * Opens, reads, and computes the crc32 value for the given filename.
 *
 *   filename - path of the file to checksum
 *   crc      - output, receives the crc32 of the file contents
 *
 * Returns SCR_SUCCESS if the whole file was read, SCR_FAILURE if crc is
 * NULL, the file cannot be opened, or a read fails.
 */
int scr_crc32(const char* filename, uLong* crc)
{
  /* size of the read buffer, a compile-time constant so that the
   * buffer below is a plain array rather than a variable length array */
  enum { SCR_CRC32_BUF_SIZE = 1024 * 1024 };

  /* check that we got a variable to write our answer to */
  if (crc == NULL) {
    return SCR_FAILURE;
  }

  /* initialize our crc value */
  *crc = crc32(0L, Z_NULL, 0);

  /* open the file for reading */
  int fd = scr_open(filename, O_RDONLY);
  if (fd < 0) {
    scr_dbg(1, "Failed to open file to compute crc: %s errno=%d @ file %s:%d",
      filename, errno, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* read the file data in and compute its crc32,
   * keep going for as long as each read fills the whole buffer */
  char buf[SCR_CRC32_BUF_SIZE];
  const unsigned long buffer_size = SCR_CRC32_BUF_SIZE;
  int nread = 0;
  int filled_buffer = 0;
  do {
    nread = scr_read(filename, fd, buf, buffer_size);
    if (nread > 0) {
      *crc = crc32(*crc, (const Bytef*) buf, (uInt) nread);
    }
    filled_buffer = (nread > 0 && (unsigned long) nread == buffer_size);
  } while (filled_buffer);

  /* if we got an error, report it, close the file, and bail out */
  if (nread < 0) {
    scr_dbg(1, "Error while reading file to compute crc: %s @ file %s:%d",
      filename, __FILE__, __LINE__
    );
    close(fd);
    return SCR_FAILURE;
  }

  /* close the file */
  scr_close(filename, fd);

  return SCR_SUCCESS;
}
/*
 * Delete a file.
 *
 * Returns SCR_SUCCESS if unlink() succeeds; otherwise prints a level 2
 * debug message with errno and returns SCR_FAILURE.
 */
int scr_file_unlink(const char* file)
{
  int unlink_rc = unlink(file);
  if (unlink_rc == 0) {
    return SCR_SUCCESS;
  }

  /* unlink failed, report why */
  scr_dbg(2, "Failed to delete file: %s errno=%d %s @ file %s:%d",
    file, errno, strerror(errno), __FILE__, __LINE__
  );
  return SCR_FAILURE;
}
/*
 * Log an event.
 *
 *   type  - event type string
 *   note  - optional free-form note, may be NULL
 *   dset  - optional pointer to dataset id, may be NULL
 *   start - optional pointer to start time, may be NULL
 *   secs  - optional pointer to duration in seconds, may be NULL
 *
 * Forwards the event to scr_mysql_log_event when scr_db_enable is set
 * and always prints a level 1 debug message describing the event.
 * In the debug message a missing string is printed as "(null)", a
 * missing dataset id as -1, and a missing duration as 0.
 *
 * Returns the database logging return code, or SCR_SUCCESS if database
 * logging is disabled.
 */
int scr_log_event(const char* type, const char* note, const int* dset, const time_t* start, const double* secs)
{
  int rc = SCR_SUCCESS;
  if (scr_db_enable) {
    rc = scr_mysql_log_event(type, note, dset, start, secs);
  }

  /* convert the start time to text, tolerating a missing timestamp
   * or one that localtime/asctime cannot convert */
  const char* start_str = "(null)";
  if (start != NULL) {
    struct tm* timeinfo = localtime(start);
    if (timeinfo != NULL) {
      const char* text = asctime(timeinfo);
      if (text != NULL) {
        start_str = text;
      }
    }
  }

  /* callers pass NULL for fields they do not have, so pick printable
   * stand-ins rather than handing pointers to %d and %f */
  const char* type_str = (type != NULL) ? type : "(null)";
  const char* note_str = (note != NULL) ? note : "(null)";
  int dset_val         = (dset != NULL) ? *dset : -1;
  double secs_val      = (secs != NULL) ? *secs : 0.0;

  /* TODO cppr: check null */
  scr_dbg(1, "scr_log_event: type %s, note %s, dset %d, start %s, secs %f",
    type_str, note_str, dset_val, start_str, secs_val
  );

  return rc;
}
/*
 * Log a transfer: copy / checkpoint / fetch / flush.
 *
 *   type    - transfer type string
 *   from    - source location, may be NULL
 *   to      - destination location, may be NULL
 *   dset_id - optional pointer to dataset id, may be NULL
 *   start   - optional pointer to start time, may be NULL
 *   secs    - optional pointer to duration in seconds, may be NULL
 *   bytes   - optional pointer to number of bytes moved, may be NULL
 *
 * Forwards the record to scr_mysql_log_transfer when scr_db_enable is
 * set and always prints a level 1 debug message describing the
 * transfer.  In the debug message a missing string is printed as
 * "(null)", a missing dataset id as -1, and a missing duration or byte
 * count as 0.
 *
 * Returns the database logging return code, or SCR_SUCCESS if database
 * logging is disabled.
 */
int scr_log_transfer(const char* type, const char* from, const char* to, const int* dset_id, const time_t* start, const double* secs, const double* bytes)
{
  int rc = SCR_SUCCESS;
  if (scr_db_enable) {
    rc = scr_mysql_log_transfer(type, from, to, dset_id, start, secs, bytes);
  }

  /* convert the start time to text, tolerating a missing timestamp
   * or one that localtime/asctime cannot convert */
  const char* start_str = "(null)";
  if (start != NULL) {
    struct tm* timeinfo = localtime(start);
    if (timeinfo != NULL) {
      const char* text = asctime(timeinfo);
      if (text != NULL) {
        start_str = text;
      }
    }
  }

  /* guard every optional argument before it is dereferenced or
   * handed to a %s conversion */
  const char* type_str = (type != NULL) ? type : "(null)";
  const char* from_str = (from != NULL) ? from : "(null)";
  const char* to_str   = (to   != NULL) ? to   : "(null)";
  int dset_val         = (dset_id != NULL) ? *dset_id : -1;
  double secs_val      = (secs    != NULL) ? *secs    : 0.0;
  double bytes_val     = (bytes   != NULL) ? *bytes   : 0.0;

  /* TODO cppr: check null */
  scr_dbg(1, "scr_log_transfer: type %s, src %s, dst %s, dset %d, start %s, secs %f, bytes %f",
    type_str, from_str, to_str, dset_val, start_str, secs_val, bytes_val
  );

  return rc;
}
/*
 * Open file with specified flags and mode, retrying a few times on
 * failure.
 *
 * The optional third argument (mode_t) is read only when flags
 * contains O_CREAT (see man 2 open).  If the first open() fails, a
 * level 1 debug message is printed and the open is retried up to
 * SCR_OPEN_TRIES times with a pause of SCR_OPEN_USLEEP microseconds
 * before each retry.
 *
 * Returns the file descriptor, or a negative value if every attempt
 * failed (an error message is printed in that case).
 */
int scr_open(const char* file, int flags, ...)
{
  /* pick up the creation mode if the caller is creating the file */
  const int have_mode = (flags & O_CREAT) ? 1 : 0;
  mode_t mode = 0;
  if (have_mode) {
    va_list args;
    va_start(args, flags);
    mode = va_arg(args, mode_t);
    va_end(args);
  }

  /* first attempt */
  int fd = have_mode ? open(file, flags, mode) : open(file, flags);
  if (fd >= 0) {
    return fd;
  }

  scr_dbg(1, "Opening file: open(%s) errno=%d %s @ %s:%d",
    file, errno, strerror(errno), __FILE__, __LINE__
  );

  /* try again, sleeping before each retry */
  int remaining;
  for (remaining = SCR_OPEN_TRIES; remaining != 0 && fd < 0; remaining--) {
    usleep(SCR_OPEN_USLEEP);
    fd = have_mode ? open(file, flags, mode) : open(file, flags);
  }

  /* if we still don't have a valid file, consider it an error */
  if (fd < 0) {
    scr_err("Opening file: open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  return fd;
}
/*
 * Fsync and close a file.
 *
 *   file - name of the file, used only in messages
 *   fd   - open file descriptor to sync and close
 *
 * A failed fsync is reported as a level 2 debug message and does not
 * affect the return value.  Returns SCR_SUCCESS if close() succeeds,
 * SCR_FAILURE (after printing an error) otherwise.
 */
int scr_close(const char* file, int fd)
{
  /* flush data to disk first, a failure here is only a warning */
  int sync_rc = fsync(fd);
  if (sync_rc < 0) {
    scr_dbg(2, "Failed to fsync file descriptor: %s errno=%d %s @ file %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* now close the descriptor */
  int close_rc = close(fd);
  if (close_rc == 0) {
    return SCR_SUCCESS;
  }

  /* hit an error, print message */
  scr_err("Closing file descriptor %d for file %s: errno=%d %s @ %s:%d",
    fd, file, errno, strerror(errno), __FILE__, __LINE__
  );
  return SCR_FAILURE;
}
int main(int argc, char* argv[]) { /* print usage if not enough arguments were given */ if (argc < 2) { printf("Usage: scr_inspect_cache <cntldir>\n"); return 1; } scr_path* scr_master_map_file = scr_path_from_str(strdup(argv[1])); /* get my hostname */ if (gethostname(scr_my_hostname, sizeof(scr_my_hostname)) != 0) { scr_err("scr_inspect_cache: Call to gethostname failed @ %s:%d", __FILE__, __LINE__ ); return 1; } /* read in the master map */ scr_hash* hash = scr_hash_new(); scr_hash_read_path(scr_master_map_file, hash); /* create an empty filemap */ scr_filemap* map = scr_filemap_new(); /* for each filemap listed in the master map */ scr_hash_elem* elem; for (elem = scr_hash_elem_first(scr_hash_get(hash, "Filemap")); elem != NULL; elem = scr_hash_elem_next(elem)) { /* get the filename of this filemap */ char* file = scr_hash_elem_key(elem); /* read in the filemap */ scr_filemap* tmp_map = scr_filemap_new(); scr_path* path_file = scr_path_from_str(file); scr_filemap_read(path_file, tmp_map); scr_path_delete(&path_file); /* merge it with local 0 filemap */ scr_filemap_merge(map, tmp_map); /* delete filemap */ scr_filemap_delete(&tmp_map); } /* scan each file for each rank of each dataset */ scr_hash_elem* dset_elem; for (dset_elem = scr_filemap_first_dataset(map); dset_elem != NULL; dset_elem = scr_hash_elem_next(dset_elem)) { /* get dataset id */ int dset = scr_hash_elem_key_int(dset_elem); scr_hash_elem* rank_elem; for (rank_elem = scr_filemap_first_rank_by_dataset(map, dset); rank_elem != NULL; rank_elem = scr_hash_elem_next(rank_elem)) { /* get rank id */ int rank = scr_hash_elem_key_int(rank_elem); int missing_file = 0; int expected = scr_filemap_get_expected_files(map, dset, rank); int num = scr_filemap_num_files(map, dset, rank); if (expected == num) { /* first time through the file list, check that we have each file */ scr_hash_elem* file_elem = NULL; for (file_elem = scr_filemap_first_file(map, dset, rank); file_elem != NULL; file_elem = 
scr_hash_elem_next(file_elem)) { /* get filename */ char* file = scr_hash_elem_key(file_elem); /* check that we can read the file */ if (! scr_bool_have_file(map, dset, rank, file)) { missing_file = 1; scr_dbg(1, "File is unreadable or incomplete: Dataset %d, Rank %d, File: %s", dset, rank, file ); } } } else { missing_file = 1; } /* TODO: print partner names */ /* if we're not missing a file for rank, print this info out */ if (! missing_file) { scr_hash* desc = scr_hash_new(); scr_filemap_get_desc(map, dset, rank, desc); char* type = scr_hash_elem_get_first_val(desc, SCR_CONFIG_KEY_TYPE); char* groups_str = scr_hash_elem_get_first_val(desc, SCR_CONFIG_KEY_GROUPS); char* group_id_str = scr_hash_elem_get_first_val(desc, SCR_CONFIG_KEY_GROUP_ID); char* group_size_str = scr_hash_elem_get_first_val(desc, SCR_CONFIG_KEY_GROUP_SIZE); char* group_rank_str = scr_hash_elem_get_first_val(desc, SCR_CONFIG_KEY_GROUP_RANK); if (type != NULL && groups_str != NULL && group_id_str != NULL && group_size_str != NULL && group_rank_str != NULL) { /* we already have a group id and rank, use that to rebuild the communicator */ int groups = atoi(groups_str); int group_id = atoi(group_id_str); int group_size = atoi(group_size_str); int group_rank = atoi(group_rank_str); printf("DSET=%d RANK=%d TYPE=%s GROUPS=%d GROUP_ID=%d GROUP_SIZE=%d GROUP_RANK=%d FILES=1\n", dset, rank, type, groups, group_id, group_size, group_rank ); } } } } scr_path_delete(&scr_master_map_file); return 0; }
/*
 * Attempt to fetch the most recent checkpoint from the prefix directory
 * into cache.
 *
 *   map             - filemap to fill in on a successful fetch
 *   fetch_attempted - output, set to 1 if any fetch was attempted; the
 *                     value known to rank 0 is broadcast to all ranks
 *
 * Rank 0 reads the index file and picks the directory to fetch from:
 * the "current" directory if one is recorded, otherwise the most recent
 * complete checkpoint older than the one tried last.  Failed attempts
 * are marked in the index file and the next candidate is tried until
 * one succeeds or no candidate remains.
 *
 * Must be called by all ranks in scr_comm_world (it broadcasts on that
 * communicator).  Returns SCR_SUCCESS only if a checkpoint was fetched.
 */
int scr_fetch_sync(scr_filemap* map, int* fetch_attempted)
{
  /* we only return success if we successfully fetch a checkpoint */
  int rc = SCR_FAILURE;

  /* start timer, the value is only meaningful on rank 0 */
  double time_start = 0.0;
  if (scr_my_rank_world == 0) {
    time_start = MPI_Wtime();
  }

  /* have rank 0 read the index file */
  scr_hash* index_hash = NULL;
  int read_index_file = 0;
  if (scr_my_rank_world == 0) {
    /* create an empty hash to store our index */
    index_hash = scr_hash_new();

    /* read the index file */
    if (scr_index_read(scr_prefix_path, index_hash) == SCR_SUCCESS) {
      read_index_file = 1;
    }
  }

  /* don't enter while loop below if rank 0 failed to read index file */
  int continue_fetching = 1;
  MPI_Bcast(&read_index_file, 1, MPI_INT, 0, scr_comm_world);
  if (! read_index_file) {
    continue_fetching = 0;
  }

  /* now start fetching, we keep trying until we exhaust all valid
   * checkpoints */
  char target[SCR_MAX_FILENAME];
  int current_checkpoint_id = -1;
  while (continue_fetching) {
    /* create a new path */
    scr_path* fetch_path = scr_path_new();

    /* initialize our target directory to empty string */
    strcpy(target, "");

    /* rank 0 determines the directory to fetch from */
    if (scr_my_rank_world == 0) {
      /* read the current directory if it's set */
      char* current_str;
      if (scr_index_get_current(index_hash, &current_str) == SCR_SUCCESS) {
        size_t current_str_len = strlen(current_str) + 1;
        if (current_str_len <= sizeof(target)) {
          strcpy(target, current_str);
        } else {
          /* name does not fit, report it and leave target empty so
           * that we fall back to the most recent complete checkpoint */
          scr_dbg(1, "Current directory name is too long to fetch from: %s @ %s:%d",
            current_str, __FILE__, __LINE__
          );
        }
      }

      /* lookup the checkpoint id */
      int next_checkpoint_id = -1;
      if (strcmp(target, "") != 0) {
        /* we have a subdirectory name, lookup the checkpoint id
         * corresponding to this directory */
        scr_index_get_id_by_dir(index_hash, target, &next_checkpoint_id);
      } else {
        /* otherwise, just get the most recent complete checkpoint
         * (that's older than the current id) */
        scr_index_get_most_recent_complete(index_hash, current_checkpoint_id, &next_checkpoint_id, target);
      }
      current_checkpoint_id = next_checkpoint_id;

      /* TODODSET: need to verify that dataset is really a checkpoint
       * and keep searching if not */

      /* if we have a subdirectory (target) name, build the full fetch
       * directory */
      if (strcmp(target, "") != 0) {
        /* record that we're attempting a fetch of this checkpoint in
         * the index file */
        *fetch_attempted = 1;
        if (current_checkpoint_id != -1) {
          scr_index_mark_fetched(index_hash, current_checkpoint_id, target);
          scr_index_write(scr_prefix_path, index_hash);
        }

        /* we have a subdirectory, now build the full path */
        scr_path_append(fetch_path, scr_prefix_path);
        scr_path_append_str(fetch_path, target);
        scr_path_reduce(fetch_path);
      }
    }

    /* broadcast fetch path from rank 0 */
    scr_path_bcast(fetch_path, 0, scr_comm_world);

    /* check whether we've got a path */
    if (! scr_path_is_null(fetch_path)) {
      /* got something, attempt to fetch the checkpoint */
      int dset_id, ckpt_id;
      rc = scr_fetch_files(map, fetch_path, &dset_id, &ckpt_id);
      if (rc == SCR_SUCCESS) {
        /* set the dataset and checkpoint ids */
        scr_dataset_id    = dset_id;
        scr_checkpoint_id = ckpt_id;

        /* we succeeded in fetching this checkpoint, set current to
         * point to it, and stop fetching */
        if (scr_my_rank_world == 0) {
          scr_index_set_current(index_hash, target);
          scr_index_write(scr_prefix_path, index_hash);
        }
        continue_fetching = 0;
      } else {
        /* we tried to fetch, but we failed, mark it as failed in
         * the index file so we don't try it again */
        if (scr_my_rank_world == 0) {
          /* unset the current pointer */
          scr_index_unset_current(index_hash);
          if (current_checkpoint_id != -1 && strcmp(target, "") != 0) {
            scr_index_mark_failed(index_hash, current_checkpoint_id, target);
          }
          scr_index_write(scr_prefix_path, index_hash);
        }
      }
    } else {
      /* we ran out of valid checkpoints in the index file,
       * bail out of the loop */
      continue_fetching = 0;
    }

    /* free fetch path */
    scr_path_delete(&fetch_path);
  }

  /* delete the index hash */
  if (scr_my_rank_world == 0) {
    scr_hash_delete(&index_hash);
  }

  /* broadcast whether we actually attempted to fetch anything
   * (only rank 0 knows) */
  MPI_Bcast(fetch_attempted, 1, MPI_INT, 0, scr_comm_world);

  /* stop timer for fetch */
  if (scr_my_rank_world == 0) {
    double time_end  = MPI_Wtime();
    double time_diff = time_end - time_start;
    scr_dbg(1, "scr_fetch_files: return code %d, %f secs", rc, time_diff);
  }

  return rc;
}
/*
 * Transfers redundancy descriptors for the given dataset id so that
 * each process ends up with its own descriptor.
 *
 *   map - filemap holding the descriptors this process has in cache
 *   id  - dataset id
 *   red - output, redundancy descriptor struct built for this process
 *
 * Every process offers the descriptors it holds (keyed by the rank
 * they belong to) through scr_hash_exchange on scr_comm_world, then
 * records the first descriptor it receives in the filemap.
 *
 * Returns SCR_SUCCESS if every process obtained a descriptor,
 * SCR_FAILURE if any process saw an out-of-range rank or received
 * no descriptor.
 */
static int scr_distribute_reddescs(scr_filemap* map, int id, scr_reddesc* red)
{
  /* descriptors that we hold, keyed by the rank they belong to */
  scr_hash* outgoing = scr_hash_new();

  /* for this dataset, get list of ranks we have data for */
  int rank_count = 0;
  int* rank_list = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &rank_count, &rank_list);

  /* for each rank we have files for, check whether we also have
   * its redundancy descriptor */
  int all_ranks_valid = 1;
  int idx;
  for (idx = 0; idx < rank_count; idx++) {
    int rank = rank_list[idx];

    /* complain about ranks outside of the world */
    int in_range = (rank >= 0 && rank < scr_ranks_world);
    if (! in_range) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        rank, scr_ranks_world, __FILE__, __LINE__
      );
      all_ranks_valid = 0;
    }

    /* lookup the redundancy descriptor hash for this rank */
    scr_hash* rank_desc = scr_hash_new();
    scr_filemap_get_desc(map, id, rank, rank_desc);

    /* an empty descriptor is of no use to anyone, drop it,
     * otherwise attach it to the outgoing hash */
    if (scr_hash_size(rank_desc) <= 0) {
      scr_hash_delete(&rank_desc);
    } else {
      scr_hash_setf(outgoing, rank_desc, "%d", rank);
    }
  }

  /* free off our list of ranks */
  scr_free(&rank_list);

  /* check that we didn't find an invalid rank on any process */
  if (! scr_alltrue(all_ranks_valid)) {
    scr_hash_delete(&outgoing);
    return SCR_FAILURE;
  }

  /* create an empty hash to receive any incoming descriptors
   * and exchange descriptors with other ranks */
  scr_hash* incoming = scr_hash_new();
  scr_hash_exchange(outgoing, incoming, scr_comm_world);

  /* check that everyone can get their descriptor */
  int received = scr_hash_size(incoming);
  if (! scr_alltrue(received > 0)) {
    scr_hash_delete(&incoming);
    scr_hash_delete(&outgoing);
    scr_dbg(2, "Cannot find process that has my redundancy descriptor @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* just go with the first redundancy descriptor in our list,
   * they should all be the same */
  scr_hash_elem* first_elem = scr_hash_elem_first(incoming);
  scr_hash* my_desc = scr_hash_elem_hash(first_elem);

  /* record the descriptor in our filemap */
  scr_filemap_set_desc(map, id, scr_my_rank_world, my_desc);
  scr_filemap_write(scr_map_file, map);

  /* TODO: at this point, we could delete descriptors for other
   * ranks for this checkpoint */

  /* create our redundancy descriptor struct from the map */
  scr_reddesc_create_from_filemap(map, id, scr_my_rank_world, red);

  /* free off our send and receive hashes */
  scr_hash_delete(&incoming);
  scr_hash_delete(&outgoing);

  return SCR_SUCCESS;
}
/*
 * Flush files for dataset id from cache to the parallel file system
 * under SCR_PREFIX.
 *
 *   map - filemap describing the files in cache
 *   id  - dataset id to flush
 *
 * Returns SCR_FAILURE right away if flushing is disabled
 * (scr_flush <= 0) and SCR_SUCCESS right away if the flush file says
 * no flush is needed.  Otherwise waits on any asynchronous flush in
 * progress, runs the prepare / data / complete steps, and has rank 0
 * report timing and log the outcome.
 *
 * Calls MPI_Barrier on scr_comm_world, so all ranks must call it
 * together.  Returns SCR_SUCCESS only if all three steps succeeded.
 */
int scr_flush_sync(scr_filemap* map, int id)
{
  const int is_root = (scr_my_rank_world == 0);
  int rc = SCR_SUCCESS;

  /* if user has disabled flush, return failure */
  if (scr_flush <= 0) {
    return SCR_FAILURE;
  }

  /* if we don't need a flush, return right away with success */
  if (! scr_flush_file_need_flush(id)) {
    return SCR_SUCCESS;
  }

  /* this may take a while, so tell user what we're doing */
  if (is_root) {
    scr_dbg(1, "Initiating flush of dataset %d", id);
  }

  /* make sure all processes make it this far before progressing */
  MPI_Barrier(scr_comm_world);

  /* start timer */
  time_t timestamp_start;
  double time_start;
  if (is_root) {
    timestamp_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* if we are flushing something asynchronously, wait on it */
  if (scr_flush_async_in_progress) {
    scr_flush_async_wait(map);

    /* the flush we just waited on could be the requested dataset,
     * so perhaps we're already done */
    if (! scr_flush_file_need_flush(id)) {
      return SCR_SUCCESS;
    }
  }

  /* log the flush start */
  if (is_root && scr_log_enable) {
    time_t started = scr_log_seconds();
    scr_log_event("FLUSH STARTED", NULL, &id, &started, NULL);
  }

  /* mark in the flush file that we are flushing the dataset */
  scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_SYNC_FLUSHING);

  /* get list of files to flush, identify containers,
   * create directories, and create container files */
  scr_hash* file_list = scr_hash_new();
  int prepare_rc = scr_flush_prepare(map, id, file_list);
  if (prepare_rc != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* write the data out to files */
  scr_hash* data = scr_hash_new();
  int data_rc = scr_flush_data(file_list, data);
  if (data_rc != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* write summary file */
  int complete_rc = scr_flush_complete(id, file_list, data);
  if (complete_rc != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* get number of bytes for this dataset */
  double total_bytes = 0.0;
  if (is_root && rc == SCR_SUCCESS) {
    /* get the dataset corresponding to this id */
    scr_dataset* dataset = scr_dataset_new();
    scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset);

    /* get the number of bytes in the dataset */
    unsigned long dataset_bytes;
    if (scr_dataset_get_size(dataset, &dataset_bytes) == SCR_SUCCESS) {
      total_bytes = (double) dataset_bytes;
    }

    /* delete the dataset object */
    scr_dataset_delete(&dataset);
  }

  /* free data structures */
  scr_hash_delete(&data);
  scr_hash_delete(&file_list);

  /* remove sync flushing marker from flush file */
  scr_flush_file_location_unset(id, SCR_FLUSH_KEY_LOCATION_SYNC_FLUSHING);

  /* everything below is reporting, which only rank 0 does */
  if (! is_root) {
    return rc;
  }

  /* stop timer and compute bandwidth */
  double time_end  = MPI_Wtime();
  double time_diff = time_end - time_start;
  double bw = total_bytes / (1024.0 * 1024.0 * time_diff);
  scr_dbg(1, "scr_flush_sync: %f secs, %e bytes, %f MB/s, %f MB/s per proc",
    time_diff, total_bytes, bw, bw/scr_ranks_world
  );

  /* print a debug message on success, a failure is more serious
   * so print an error message instead */
  const char* event = NULL;
  if (rc == SCR_SUCCESS) {
    scr_dbg(1, "scr_flush_sync: Flush of dataset %d succeeded", id);
    event = "FLUSH SUCCEEDED";
  } else {
    scr_err("scr_flush_sync: Flush of dataset %d failed", id);
    event = "FLUSH FAILED";
  }

  /* log details of flush */
  if (scr_log_enable) {
    time_t finished = scr_log_seconds();
    scr_log_event(event, NULL, &id, &finished, &time_diff);
  }

  return rc;
}
/* lookup name in table and return id if found, * returns SCR_FAILURE on error or if name is not found */ int scr_mysql_read_id(const char* table, const char* name, unsigned long* id) { #ifdef HAVE_LIBMYSQLCLIENT /* escape parameter */ char* qname = scr_mysql_quote_string(name); /* check that we got valid strings for each of our parameters */ if (qname == NULL) { scr_err("Failed to escape and quote one or more arguments @ %s:%d", __FILE__, __LINE__ ); return SCR_FAILURE; } /* construct the query */ char query[1024]; int n = snprintf(query, sizeof(query), "SELECT * FROM `%s` WHERE `name` = %s ;", table, qname ); /* free the strings as they are now encoded into the query */ scr_free(&qname); /* check that we were able to construct the query ok */ if (n >= sizeof(query)) { scr_err("Insufficient buffer space (%lu bytes) to build query (%lu bytes) @ %s:%d", sizeof(query), n, __FILE__, __LINE__ ); return SCR_FAILURE; } /* execute the query */ if (scr_db_debug >= 1) { scr_dbg(0, "%s", query); } if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) { scr_err("Select failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } /* prepare the result set to be used */ MYSQL_RES* res = mysql_store_result(&scr_mysql); if (res == NULL) { scr_err("Result failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } /* get the number of rows in the result set */ my_ulonglong nrows = mysql_num_rows(res); if (nrows != 1) { mysql_free_result(res); return SCR_FAILURE; } /* finally, lookup our id */ MYSQL_ROW row = mysql_fetch_row(res); if (row == NULL) { scr_err("Row fetch failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); mysql_free_result(res); return SCR_FAILURE; } *id = strtoul(row[0], NULL, 0); /* free the result set */ mysql_free_result(res); #endif return SCR_SUCCESS; }
/*
 * Fetch files from the parallel file system into cache.
 *
 *   map           - filemap to record fetched files in
 *   fetch_path    - directory to fetch from
 *   dataset_id    - output, dataset id of the fetched dataset
 *   checkpoint_id - output, checkpoint id of the fetched dataset
 *
 * Reads the summary file in fetch_path, clears any cached copy of the
 * dataset, copies the files into the cache directory, and applies the
 * redundancy scheme.  On any failure the partially fetched dataset is
 * deleted from cache.  The output ids are written only on success.
 *
 * Calls MPI_Barrier on scr_comm_world, so all ranks must call it
 * together.  Returns SCR_SUCCESS if the dataset was fetched and the
 * redundancy scheme applied, SCR_FAILURE otherwise.
 */
static int scr_fetch_files(
  scr_filemap* map,
  scr_path* fetch_path,
  int* dataset_id,
  int* checkpoint_id)
{
  /* get fetch directory as string */
  char* fetch_dir = scr_path_strdup(fetch_path);

  /* this may take a while, so tell user what we're doing */
  if (scr_my_rank_world == 0) {
    scr_dbg(1, "Attempting fetch from %s", fetch_dir);
  }

  /* make sure all processes make it this far before progressing */
  MPI_Barrier(scr_comm_world);

  /* start timer, the values are only meaningful on rank 0 */
  time_t timestamp_start = 0;
  double time_start = 0.0;
  if (scr_my_rank_world == 0) {
    timestamp_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* log the fetch attempt */
  if (scr_my_rank_world == 0) {
    if (scr_log_enable) {
      time_t now = scr_log_seconds();
      scr_log_event("FETCH STARTED", fetch_dir, NULL, &now, NULL);
    }
  }

  /* allocate a new hash to get a list of files to fetch */
  scr_hash* file_list = scr_hash_new();

  /* read the summary file */
  if (scr_fetch_summary(fetch_dir, file_list) != SCR_SUCCESS) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Failed to read summary file @ %s:%d", __FILE__, __LINE__);
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff);
      }
    }
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* get a pointer to the dataset */
  scr_dataset* dataset = scr_hash_get(file_list, SCR_KEY_DATASET);

  /* get the dataset id */
  int id;
  if (scr_dataset_get_id(dataset, &id) != SCR_SUCCESS) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Invalid id in summary file @ %s:%d", __FILE__, __LINE__);
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff);
      }
    }
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* get the checkpoint id for this dataset */
  int ckpt_id;
  if (scr_dataset_get_ckpt(dataset, &ckpt_id) != SCR_SUCCESS) {
    /* eventually, we'll support reading of non-checkpoint datasets,
     * but we don't yet */
    scr_err("Failed to read checkpoint id from dataset @ %s:%d",
      __FILE__, __LINE__
    );
    scr_hash_delete(&file_list);
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* delete any existing files for this dataset id (do this before
   * filemap_read) */
  scr_cache_delete(map, id);

  /* get the redundancy descriptor for this id */
  scr_reddesc* c = scr_reddesc_for_checkpoint(ckpt_id, scr_nreddescs, scr_reddescs);

  /* store our redundancy descriptor hash in the filemap */
  scr_hash* my_desc_hash = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_filemap_set_desc(map, id, scr_my_rank_world, my_desc_hash);
  scr_hash_delete(&my_desc_hash);

  /* write the filemap out before creating the directory */
  scr_filemap_write(scr_map_file, map);

  /* create the cache directory */
  scr_cache_dir_create(c, id);

  /* get the cache directory */
  char cache_dir[SCR_MAX_FILENAME];
  scr_cache_dir_get(c, id, cache_dir);

  /* now we can finally fetch the actual files */
  int success = 1;
  if (scr_fetch_data(file_list, cache_dir, map) != SCR_SUCCESS) {
    success = 0;
  }

  /* free the hash holding the summary file data */
  scr_hash_delete(&file_list);

  /* check that all processes copied their file successfully */
  if (! scr_alltrue(success)) {
    /* someone failed, so let's delete the partial checkpoint */
    scr_cache_delete(map, id);

    if (scr_my_rank_world == 0) {
      scr_dbg(1, "One or more processes failed to read its files @ %s:%d",
        __FILE__, __LINE__
      );
      if (scr_log_enable) {
        double time_end = MPI_Wtime();
        double time_diff = time_end - time_start;
        time_t now = scr_log_seconds();
        scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff);
      }
    }
    scr_free(&fetch_dir);
    return SCR_FAILURE;
  }

  /* apply redundancy scheme */
  double bytes_copied = 0.0;
  int rc = scr_reddesc_apply(map, c, id, &bytes_copied);
  if (rc == SCR_SUCCESS) {
    /* record dataset and checkpoint ids */
    *dataset_id = id;
    *checkpoint_id = ckpt_id;

    /* update our flush file to indicate this checkpoint is in cache
     * as well as the parallel file system */
    /* TODO: should we place SCR_FLUSH_KEY_LOCATION_PFS before
     * scr_reddesc_apply? */
    scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_CACHE);
    scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_PFS);
    scr_flush_file_location_unset(id, SCR_FLUSH_KEY_LOCATION_FLUSHING);
  } else {
    /* something went wrong, so delete this checkpoint from the cache,
     * operate on the caller's filemap like the other cleanup paths */
    scr_cache_delete(map, id);
  }

  /* stop timer, compute bandwidth, and report performance */
  double total_bytes = bytes_copied;
  if (scr_my_rank_world == 0) {
    double time_end = MPI_Wtime();
    double time_diff = time_end - time_start;
    double bw = total_bytes / (1024.0 * 1024.0 * time_diff);
    scr_dbg(1, "scr_fetch_files: %f secs, %e bytes, %f MB/s, %f MB/s per proc",
      time_diff, total_bytes, bw, bw/scr_ranks_world
    );

    /* log data on the fetch to the database */
    if (scr_log_enable) {
      time_t now = scr_log_seconds();
      if (rc == SCR_SUCCESS) {
        scr_log_event("FETCH SUCCEEDED", fetch_dir, &id, &now, &time_diff);
      } else {
        scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff);
      }

      /* look up the cache directory again for the transfer record */
      scr_cache_dir_get(c, id, cache_dir);
      scr_log_transfer("FETCH", fetch_dir, cache_dir, &id,
        &timestamp_start, &time_diff, &total_bytes
      );
    }
  }

  /* free fetch directory string */
  scr_free(&fetch_dir);

  return rc;
}
/* checks whether specifed file exists, is readable, and is complete */ static int scr_bool_have_file(const scr_filemap* map, int dset, int rank, const char* file) { /* if no filename is given return false */ if (file == NULL || strcmp(file,"") == 0) { scr_dbg(2, "File name is null or the empty string @ %s:%d", __FILE__, __LINE__ ); return 0; } /* check that we can read the file */ if (scr_file_is_readable(file) != SCR_SUCCESS) { scr_dbg(2, "Do not have read access to file: %s @ %s:%d", file, __FILE__, __LINE__ ); return 0; } /* check that we can read meta file for the file */ scr_meta* meta = scr_meta_new(); if (scr_filemap_get_meta(map, dset, rank, file, meta) != SCR_SUCCESS) { scr_dbg(2, "Failed to read meta data for file: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } /* check that the file is complete */ if (scr_meta_is_complete(meta) != SCR_SUCCESS) { scr_dbg(2, "File is marked as incomplete: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } /* TODODSET: check that dataset id matches */ #if 0 /* check that the file really belongs to the checkpoint id we think it does */ int meta_dset = -1; if (scr_meta_get_checkpoint(meta, &meta_dset) != SCR_SUCCESS) { scr_dbg(2, "Failed to read checkpoint field in meta data: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } if (dset != meta_dset) { scr_dbg(2, "File's checkpoint ID (%d) does not match id in meta data file (%d) for %s @ %s:%d", dset, meta_dset, file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } #endif #if 0 /* check that the file really belongs to the rank we think it does */ int meta_rank = -1; if (scr_meta_get_rank(meta, &meta_rank) != SCR_SUCCESS) { scr_dbg(2, "Failed to read rank field in meta data: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } if (rank != meta_rank) { scr_dbg(2, "File's rank (%d) does not match rank in meta data file (%d) for %s @ %s:%d", rank, meta_rank, file, 
__FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } #endif #if 0 /* check that the file was written with same number of ranks we think it was */ int meta_ranks = -1; if (scr_meta_get_ranks(meta, &meta_ranks) != SCR_SUCCESS) { scr_dbg(2, "Failed to read ranks field in meta data: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } if (ranks != meta_ranks) { scr_dbg(2, "File's ranks (%d) does not match ranks in meta data file (%d) for %s @ %s:%d", ranks, meta_ranks, file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } #endif /* check that the file size matches (use strtol while reading data) */ unsigned long size = scr_file_size(file); unsigned long meta_size = 0; if (scr_meta_get_filesize(meta, &meta_size) != SCR_SUCCESS) { scr_dbg(2, "Failed to read filesize field in meta data: %s @ %s:%d", file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } if (size != meta_size) { scr_dbg(2, "Filesize is incorrect, currently %lu, expected %lu for %s @ %s:%d", size, meta_size, file, __FILE__, __LINE__ ); scr_meta_delete(&meta); return 0; } /* TODO: check that crc32 match if set (this would be expensive) */ scr_meta_delete(&meta); /* if we made it here, assume the file is good */ return 1; }
/* records an SCR event in the SCR log database */ int scr_mysql_log_event(const char* type, const char* note, const int* dset, const time_t* start, const double* secs) { #ifdef HAVE_LIBMYSQLCLIENT /* lookup the id for the type string */ int type_id = -1; if (scr_mysql_type_id(type, &type_id) == SCR_FAILURE) { scr_err("Failed to lookup id for type string %s @ %s:%d", type, __FILE__, __LINE__ ); return SCR_FAILURE; } char* qnote = scr_mysql_quote_string(note); char* qdset = scr_mysql_quote_int(dset); char* qstart = scr_mysql_quote_seconds(start); char* qsecs = scr_mysql_quote_double(secs); /* check that we got valid strings for each of our parameters */ if (qnote == NULL || qdset == NULL || qstart == NULL || qsecs == NULL) { scr_err("Failed to escape and quote one or more arguments @ %s:%d", __FILE__, __LINE__ ); return SCR_FAILURE; } /* construct the query */ char query[4096]; int n = snprintf(query, sizeof(query), "INSERT" " INTO `events`" " (`id`,`job_id`,`type_id`,`dset_id`,`start`,`time`,`note`)" " VALUES" " (NULL, %lu, %d, %s, %s, %s, %s)" " ;", scr_db_jobid, type_id, qdset, qstart, qsecs, qnote ); /* free the strings as they are now encoded into the query */ scr_free(&qnote); scr_free(&qdset); scr_free(&qstart); scr_free(&qsecs); /* check that we were able to construct the query ok */ if (n >= sizeof(query)) { scr_err("Insufficient buffer space (%lu bytes) to build query (%lu bytes) @ %s:%d", sizeof(query), n, __FILE__, __LINE__ ); return SCR_FAILURE; } /* execute the query */ if (scr_db_debug >= 1) { scr_dbg(0, "%s", query); } if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) { scr_err("Insert failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } #endif return SCR_SUCCESS; }
#if defined(SCR_RESOURCE_MANAGER_SLURM) || defined(SCR_RESOURCE_MANAGER_APRUN) || defined(SCR_RESOURCE_MANAGER_LSF)
/* if environment variable `name` is set, store a newly allocated copy of its
 * value in *jobid (NULL if the allocation fails, which is also reported);
 * *jobid is left untouched when the variable is not set */
static void scr_env_jobid_from_var(const char* name, char** jobid)
{
  const char* value = getenv(name);
  if (value == NULL) {
    return;
  }

  *jobid = strdup(value);
  if (*jobid == NULL) {
    scr_err("Failed to allocate memory to record jobid (%s) @ %s:%d",
      value, __FILE__, __LINE__
    );
  }
}
#endif

/* allocate and return a string containing the current job id,
 * the source of the id depends on the resource manager SCR was built for:
 *   SLURM - $SLURM_JOBID
 *   APRUN - $PBS_JOBID
 *   PMIX  - PMIx_Lookup of PMIX_JOBID, falling back to a hardcoded string
 *   LSF   - $LSB_JOBID
 * returns NULL if no id could be determined or memory could not be
 * allocated, the returned string comes from strdup */
char* scr_env_jobid(void)
{
  char* jobid = NULL;

#ifdef SCR_RESOURCE_MANAGER_SLURM
  /* read $SLURM_JOBID environment variable for jobid string */
  scr_env_jobid_from_var("SLURM_JOBID", &jobid);
#endif

#ifdef SCR_RESOURCE_MANAGER_APRUN
  /* read $PBS_JOBID environment variable for jobid string */
  scr_env_jobid_from_var("PBS_JOBID", &jobid);
#endif

#ifdef SCR_RESOURCE_MANAGER_PMIX
  /* todo: must replace this in the scr_env script as well */
  pmix_pdata_t *pmix_query_data = NULL;
  PMIX_PDATA_CREATE(pmix_query_data, 1);

  /* todo: pmix_pdata_destroy ?? */

  /* specify that we want our jobid from pmix */
  strncpy(pmix_query_data[0].key, PMIX_JOBID, PMIX_MAX_KEYLEN);

  /* query pmix for our job id */
  pmix_status_t retval = PMIx_Lookup(pmix_query_data, 1, NULL, 0);
  if (retval == PMIX_SUCCESS) {
    /* got it, strdup the value from pmix */
    const char* pmix_id = pmix_query_data[0].value.data.string;
    jobid = strdup(pmix_id);
    if (jobid != NULL) {
      scr_dbg(1, "PMIx_Lookup for jobid success '%s'", jobid);
    } else {
      /* do not hand a NULL pointer to a %s conversion */
      scr_err("Failed to allocate memory to record jobid (%s) @ %s:%d",
        pmix_id, __FILE__, __LINE__
      );
    }
  } else {
    /* failed to get our jobid from pmix, make one up */
    const char* pmix_hardcoded_id = "pmix_hardcoded_jobid";
    jobid = strdup(pmix_hardcoded_id);
    if (jobid != NULL) {
      scr_dbg(1, "PMIx_Lookup for jobid failed: rc=%d, using hardcoded jobid '%s'",
        retval, jobid
      );
    } else {
      scr_err("Failed to allocate memory to record jobid (%s) @ %s:%d",
        pmix_hardcoded_id, __FILE__, __LINE__
      );
    }
  }

  /* free pmix query structure */
  PMIX_PDATA_FREE(pmix_query_data, 1);
#endif

#ifdef SCR_RESOURCE_MANAGER_LSF
  /* read $LSB_JOBID environment variable for jobid string */
  scr_env_jobid_from_var("LSB_JOBID", &jobid);
#endif

  return jobid;
}
/* records an SCR file transfer (copy/fetch/flush/drain) in the SCR log database */ int scr_mysql_log_transfer(const char* type, const char* from, const char* to, const int* dset, const time_t* start, const double* secs, const double* bytes) { #ifdef HAVE_LIBMYSQLCLIENT /* lookup the id for the type string */ int type_id = -1; if (scr_mysql_type_id(type, &type_id) == SCR_FAILURE) { scr_err("Failed to lookup id for type string %s @ %s:%d", type, __FILE__, __LINE__ ); return SCR_FAILURE; } /* compute end epoch, using trucation here */ time_t* end = NULL; time_t end_val; if (start != NULL && secs != NULL) { end_val = *start + (time_t) *secs; end = &end_val; } /* compute the number of seconds and the bandwidth of the operation */ double* bw = NULL; double bw_val; if (bytes != NULL && secs != NULL && *secs > 0.0) { bw_val = *bytes / *secs; bw = &bw_val; } /* convert seconds since epoch to mysql datetime strings */ char* qfrom = scr_mysql_quote_string(from); char* qto = scr_mysql_quote_string(to); char* qdset = scr_mysql_quote_int(dset); char* qstart = scr_mysql_quote_seconds(start); char* qend = scr_mysql_quote_seconds(end); char* qsecs = scr_mysql_quote_double(secs); char* qbytes = scr_mysql_quote_double(bytes); char* qbw = scr_mysql_quote_double(bw); /* check that we got valid strings for each of our parameters */ if (qfrom == NULL || qto == NULL || qdset == NULL || qstart == NULL || qend == NULL || qsecs == NULL || qbytes == NULL || qbw == NULL) { scr_err("Failed to escape and quote one or more arguments @ %s:%d", __FILE__, __LINE__ ); return SCR_FAILURE; } /* construct the query */ char query[4096]; int n = snprintf(query, sizeof(query), "INSERT" " INTO `transfers`" " (`id`,`job_id`,`type_id`,`dset_id`,`start`,`end`,`time`,`bytes`,`bw`,`from`,`to`)" " VALUES" " (NULL, %lu, %d, %s, %s, %s, %s, %s, %s, %s, %s)" " ;", scr_db_jobid, type_id, qdset, qstart, qend, qsecs, qbytes, qbw, qfrom, qto ); /* free the strings as they are now encoded into the query */ 
scr_free(&qfrom); scr_free(&qto); scr_free(&qdset); scr_free(&qstart); scr_free(&qend); scr_free(&qsecs); scr_free(&qbytes); scr_free(&qbw); /* check that we were able to construct the query ok */ if (n >= sizeof(query)) { scr_err("Insufficient buffer space (%lu bytes) to build query (%lu bytes) @ %s:%d", sizeof(query), n, __FILE__, __LINE__ ); return SCR_FAILURE; } /* execute the query */ if (scr_db_debug >= 1) { scr_dbg(0, "%s", query); } if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) { scr_err("Insert failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } #endif return SCR_SUCCESS; }
/* copies src_file into dst_dir (keeping its base name) and updates meta to
 * reflect the outcome of the copy:
 *   - when scr_crc_on_flush is set and the copy succeeds, the crc32 computed
 *     during the copy is compared against the one in meta if present, or
 *     recorded in meta if not
 *   - the complete field of meta is set to 1 on success and 0 on failure
 * returns SCR_SUCCESS if the copy succeeded and no crc32 mismatch was
 * detected, SCR_FAILURE otherwise */
static int scr_flush_a_file(const char* src_file, const char* dst_dir, scr_meta* meta)
{
  /* destination is dst_dir joined with the base name of the source file */
  scr_path* target_path = scr_path_from_str(src_file);
  scr_path_basename(target_path);
  scr_path_prepend_str(target_path, dst_dir);
  scr_path_reduce(target_path);
  char* target = scr_path_strdup(target_path);

  /* only ask the copy to compute a crc32 if crc on flush is enabled */
  int want_crc = (scr_crc_on_flush != 0);
  uLong computed_crc;
  uLong* crc_out = want_crc ? &computed_crc : NULL;

  /* copy the file and derive our status from the result */
  int copy_rc = scr_file_copy(src_file, target, scr_file_buf_size, crc_out);
  int copy_ok = (copy_rc == SCR_SUCCESS);
  int status  = copy_ok ? SCR_SUCCESS : SCR_FAILURE;
  scr_dbg(2, "scr_flush_a_file: Read and copied %s to %s with success code %d @ %s:%d",
    src_file, target, copy_rc, __FILE__, __LINE__
  );

  /* the computed crc32 is only meaningful if it was requested
   * and the copy succeeded */
  if (want_crc && copy_ok) {
    uLong stored_crc;
    if (scr_meta_get_crc32(meta, &stored_crc) != SCR_SUCCESS) {
      /* meta data had no crc32 yet, record the one we just computed */
      scr_meta_set_crc32(meta, computed_crc);
    } else if (computed_crc != stored_crc) {
      /* crc32 in meta data disagrees with what we read during the copy */

      /* TODO: unlink the copied file */

      /* mark the file as invalid */
      scr_meta_set_complete(meta, 0);
      status = SCR_FAILURE;

      scr_err("scr_flush_a_file: CRC32 mismatch detected when flushing file %s to %s @ %s:%d",
        src_file, target, __FILE__, __LINE__
      );

      /* TODO: would be good to log this as a "CRC32 MISMATCH" event,
       * but right now only rank 0 can write log entries */
    }
  }

  /* TODO: check that written filesize matches expected filesize */

  /* complete field tracks flush success (the meta file itself is not
   * updated here, since the file in cache may be fine and only the
   * flush failed) */
  scr_meta_set_complete(meta, (status == SCR_SUCCESS));

  /* release the destination string and path */
  scr_free(&target);
  scr_path_delete(&target_path);

  return status;
}
int scr_mysql_read_job(unsigned long username_id, unsigned long jobname_id, unsigned long* id) { #ifdef HAVE_LIBMYSQLCLIENT /* TODO: need to escape parameters */ /* construct the query */ char query[1024]; int n = snprintf(query, sizeof(query), "SELECT * FROM `jobs` WHERE `username_id` = '%lu' AND `jobname_id` = '%lu' ;", username_id, jobname_id ); /* check that we were able to construct the query ok */ if (n >= sizeof(query)) { scr_err("Insufficient buffer space (%lu bytes) to build query (%lu bytes) @ %s:%d", sizeof(query), n, __FILE__, __LINE__ ); return SCR_FAILURE; } /* execute the query */ if (scr_db_debug >= 1) { scr_dbg(0, "%s", query); } if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) { scr_err("Select failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } /* prepare the result set to be used */ MYSQL_RES* res = mysql_store_result(&scr_mysql); if (res == NULL) { scr_err("Result failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); return SCR_FAILURE; } /* get the number of rows in the result set */ my_ulonglong nrows = mysql_num_rows(res); if (nrows != 1) { mysql_free_result(res); return SCR_FAILURE; } /* finally, lookup our id */ MYSQL_ROW row = mysql_fetch_row(res); if (row == NULL) { scr_err("Row fetch failed, query = (%s), error = (%s) @ %s:%d", query, mysql_error(&scr_mysql), __FILE__, __LINE__ ); mysql_free_result(res); return SCR_FAILURE; } *id = strtoul(row[0], NULL, 0); /* free the result set */ mysql_free_result(res); #endif return SCR_SUCCESS; }
/* distribute and rebuild files in cache
 *
 * walks the datasets listed in the filemap in the order chosen by
 * scr_next_dataset and, for each one, distributes its dataset descriptor,
 * redundancy descriptor, and files, then attempts to recover it; datasets
 * that fail to rebuild are deleted from cache
 *
 * on each successful rebuild, scr_dataset_id and scr_checkpoint_id are
 * raised to the rebuilt id if it is larger, and the flush file is updated
 * to mark the dataset as being in cache and not flushing
 *
 * returns SCR_SUCCESS if at least one dataset was rebuilt,
 * SCR_FAILURE otherwise */
int scr_cache_rebuild(scr_filemap* map)
{
  int rc = SCR_FAILURE;

  /* timer values are only set and read on rank 0,
   * initialize them so they are defined on every rank */
  double time_start = 0.0;
  double time_end   = 0.0;
  double time_diff  = 0.0;

  /* start timer */
  time_t time_t_start = 0;
  if (scr_my_rank_world == 0) {
    time_t_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* we set this variable to 1 if we actually try to distribute
   * files for a restart */
  int distribute_attempted = 0;

  /* clean any incomplete files from our cache */
  scr_cache_clean(map);

  /* get ordered list of datasets we have in our cache */
  int ndsets = 0;
  int* dsets = NULL;
  scr_filemap_list_datasets(map, &ndsets, &dsets);

  /* TODO: put dataset selection logic into a function */

  /* TODO: also attempt to recover datasets which we were in the
   * middle of flushing */
  int current_id;
  int dset_index = 0;
  do {
    /* get the smallest index across all processes (returned in current_id),
     * this also updates our dset_index value if appropriate */
    scr_next_dataset(ndsets, dsets, &dset_index, &current_id);

    /* if we found a dataset, try to distribute and rebuild it */
    if (current_id != -1) {
      /* remember that we made an attempt to distribute at least one dataset */
      distribute_attempted = 1;

      /* log the attempt */
      if (scr_my_rank_world == 0) {
        scr_dbg(1, "Attempting to distribute and rebuild dataset %d", current_id);
        if (scr_log_enable) {
          time_t now = scr_log_seconds();
          scr_log_event("REBUILD STARTED", NULL, &current_id, &now, NULL);
        }
      }

      /* distribute dataset descriptor for this dataset */
      int rebuild_succeeded = 0;
      if (scr_distribute_datasets(map, current_id) == SCR_SUCCESS) {
        /* distribute redundancy descriptor for this dataset */
        scr_reddesc reddesc;
        if (scr_distribute_reddescs(map, current_id, &reddesc) == SCR_SUCCESS) {
          /* create a directory for this dataset */
          scr_cache_dir_create(&reddesc, current_id);

          /* distribute the files for this dataset */
          scr_distribute_files(map, &reddesc, current_id);

          /* rebuild files for this dataset */
          int tmp_rc = scr_reddesc_recover(map, &reddesc, current_id);
          if (tmp_rc == SCR_SUCCESS) {
            /* rebuild succeeded */
            rebuild_succeeded = 1;

            /* if we rebuild any checkpoint, return success */
            rc = SCR_SUCCESS;

            /* update scr_dataset_id */
            if (current_id > scr_dataset_id) {
              scr_dataset_id = current_id;
            }

            /* TODO: dataset may not be a checkpoint */
            /* update scr_checkpoint_id */
            if (current_id > scr_checkpoint_id) {
              scr_checkpoint_id = current_id;
            }

            /* update our flush file to indicate this dataset is in cache */
            scr_flush_file_location_set(current_id, SCR_FLUSH_KEY_LOCATION_CACHE);

            /* TODO: if storing flush file in control directory on each node,
             * if we find any process that has marked the dataset as flushed,
             * marked it as flushed in every flush file */

            /* TODO: would like to restore flushing status to datasets that
             * were in the middle of a flush, but we need to better manage
             * the transfer file to do this, so for now just forget about
             * flushing this dataset */
            scr_flush_file_location_unset(current_id, SCR_FLUSH_KEY_LOCATION_FLUSHING);
          }

          /* free redundancy descriptor */
          scr_reddesc_free(&reddesc);
        }
      }

      /* if the distribute or rebuild failed, delete the dataset */
      if (! rebuild_succeeded) {
        /* log that we failed */
        if (scr_my_rank_world == 0) {
          scr_dbg(1, "Failed to rebuild dataset %d", current_id);
          if (scr_log_enable) {
            time_t now = scr_log_seconds();
            scr_log_event("REBUILD FAILED", NULL, &current_id, &now, NULL);
          }
        }

        /* TODO: there is a bug here, since scr_cache_delete needs to read
         * the redundancy descriptor from the filemap in order to delete the
         * cache directory, but we may have failed to distribute the reddescs
         * above so not every task has one */

        /* rebuild failed, delete this dataset from cache */
        scr_cache_delete(map, current_id);
      } else {
        /* rebuild worked, log success */
        if (scr_my_rank_world == 0) {
          scr_dbg(1, "Rebuilt dataset %d", current_id);
          if (scr_log_enable) {
            time_t now = scr_log_seconds();
            scr_log_event("REBUILD SUCCEEDED", NULL, &current_id, &now, NULL);
          }
        }
      }
    }
  } while (current_id != -1);

  /* stop timer and report performance */
  if (scr_my_rank_world == 0) {
    time_end = MPI_Wtime();
    time_diff = time_end - time_start;

    if (distribute_attempted) {
      if (rc == SCR_SUCCESS) {
        scr_dbg(1, "Scalable restart succeeded for checkpoint %d, took %f secs",
          scr_checkpoint_id, time_diff
        );
        if (scr_log_enable) {
          scr_log_event("RESTART SUCCEEDED", NULL, &scr_checkpoint_id, &time_t_start, &time_diff);
        }
      } else {
        /* scr_checkpoint_id is not defined */
        scr_dbg(1, "Scalable restart failed, took %f secs", time_diff);
        if (scr_log_enable) {
          scr_log_event("RESTART FAILED", NULL, NULL, &time_t_start, &time_diff);
        }
      }
    }
  }

  /* free our list of dataset ids */
  scr_free(&dsets);

  return rc;
}
/* Looks up the database id of the job identified by username and jobname,
 * inserting a new row into the `jobs` table if none exists yet.
 *
 *   username - looked up (and inserted if missing) in the `usernames` table
 *   jobname  - looked up (and inserted if missing) in the `jobnames` table
 *   start    - job start time in unix seconds since epoch, stored in the
 *              `start` column when a new row is inserted
 *   jobid    - output, set to the id of the job row
 *
 * A failed INSERT is reported but not treated as fatal, since another
 * process may have inserted the same job first; the final read decides
 * the result.
 *
 * Returns SCR_SUCCESS if the job id was obtained (or if SCR was built
 * without HAVE_LIBMYSQLCLIENT, in which case *jobid is left untouched),
 * SCR_FAILURE otherwise. */
int scr_mysql_register_job(const char* username, const char* jobname, unsigned long start, unsigned long* jobid)
{
  int rc = SCR_SUCCESS;

#ifdef HAVE_LIBMYSQLCLIENT
  /* lookup the id for our username */
  unsigned long username_id;
  rc = scr_mysql_read_write_id("usernames", username, &username_id);
  if (rc != SCR_SUCCESS) {
    scr_err("Failed to find username_id for %s @ %s:%d",
      username, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* lookup the id for our jobname */
  unsigned long jobname_id;
  rc = scr_mysql_read_write_id("jobnames", jobname, &jobname_id);
  if (rc != SCR_SUCCESS) {
    scr_err("Failed to find jobname_id for %s @ %s:%d",
      jobname, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* if this job already has a db id, return it */
  rc = scr_mysql_read_job(username_id, jobname_id, jobid);
  if (rc == SCR_SUCCESS) {
    return SCR_SUCCESS;
  }

  /* didn't find the job, so we need to insert a new record into the db */

  /* translate unix seconds since epoch into mysql datetime field */
  time_t start_time_t = (time_t) start;
  char* qsecs = scr_mysql_quote_seconds(&start_time_t);

  /* check that we got valid strings for each of our parameters */
  if (qsecs == NULL) {
    scr_err("Failed to escape and quote one or more arguments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* construct the query */
  char query[1024];
  int n = snprintf(query, sizeof(query),
    "INSERT IGNORE"
    " INTO `jobs`"
    " (`id`,`username_id`,`jobname_id`,`start`)"
    " VALUES"
    " (NULL, %lu, %lu, %s)"
    " ;",
    username_id, jobname_id, qsecs
  );

  /* free the strings as they are now encoded into the query */
  scr_free(&qsecs);

  /* check that we were able to construct the query ok,
   * snprintf returns a negative value on an encoding error and
   * the untruncated length otherwise */
  if (n < 0 || (size_t) n >= sizeof(query)) {
    scr_err("Insufficient buffer space (%lu bytes) to build query (%d bytes) @ %s:%d",
      (unsigned long) sizeof(query), n, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* execute the query */
  if (scr_db_debug >= 1) {
    scr_dbg(0, "%s", query);
  }
  if (mysql_real_query(&scr_mysql, query, (unsigned int) strlen(query))) {
    scr_err("Insert failed, query = (%s), error = (%s) @ %s:%d",
      query, mysql_error(&scr_mysql), __FILE__, __LINE__
    );
    /* don't return failure, since another process may have just beat us to the punch */
  }

  /* now the job should be in the db, so read again to get its id */
  rc = scr_mysql_read_job(username_id, jobname_id, jobid);
#endif

  return rc;
}
/* this moves all files of the specified dataset in the cache to
 * make them accessible to new rank mapping
 *
 *   map - filemap describing the files this process holds, it is updated
 *         and rewritten to scr_map_file as files are moved
 *   red - redundancy descriptor for the dataset, used to pick the cache
 *         directory for each file and to check the copy type
 *   id  - dataset id
 *
 * the exchange proceeds in rounds: each process offers the files it holds
 * for other ranks in a round number, each process picks the lowest round
 * in which its own files are offered, and files are then moved round by
 * round; files held for ranks that do not ask for them are deleted
 *
 * returns SCR_SUCCESS if the transfer completed without failure (this does
 * not ensure we have all of our files), SCR_FAILURE otherwise */
static int scr_distribute_files(scr_filemap* map, const scr_reddesc* red, int id)
{
  int i, round;
  int rc = SCR_SUCCESS;

  /* TODO: mark dataset as being distributed in filemap,
   * because if we fail in the middle of a distribute,
   * we can't trust the contents of the files anymore,
   * at which point it should be deleted */

  /* clean out any incomplete files before we start */
  scr_cache_clean(map);

  /* for this dataset, get list of ranks we have data for */
  int nranks = 0;
  int* ranks = NULL;
  scr_filemap_list_ranks_by_dataset(map, id, &nranks, &ranks);

  /* walk backwards through the list of ranks, and set our start index
   * to the rank which is the first rank that is equal to or higher
   * than our own rank -- when we assign round ids below, this offsetting
   * helps distribute the load */
  int start_index = 0;
  int invalid_rank_found = 0;
  for (i = nranks-1; i >= 0; i--) {
    int rank = ranks[i];

    /* pick the first rank whose rank id is equal to or higher than our own */
    if (rank >= scr_my_rank_world) {
      start_index = i;
    }

    /* while we're at it, check that the rank is within range */
    if (rank < 0 || rank >= scr_ranks_world) {
      scr_err("Invalid rank id %d in world of %d @ %s:%d",
        rank, scr_ranks_world, __FILE__, __LINE__
      );
      invalid_rank_found = 1;
    }
  }

  /* check that we didn't find an invalid rank on any process */
  if (! scr_alltrue(invalid_rank_found == 0)) {
    scr_free(&ranks);
    return SCR_FAILURE;
  }

  /* allocate array to record the rank we can send to in each round */
  int* have_rank_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);
  int* send_flag_by_round = (int*) SCR_MALLOC(sizeof(int) * nranks);

  /* check that we have all of the files for each rank,
   * and determine the round we can send them */
  scr_hash* send_hash = scr_hash_new();
  scr_hash* recv_hash = scr_hash_new();
  for (round = 0; round < nranks; round++) {
    /* get the rank id */
    int index = (start_index + round) % nranks;
    int rank = ranks[index];

    /* record the rank indexed by the round number */
    have_rank_by_round[round] = rank;

    /* assume we won't be sending to this rank in this round */
    send_flag_by_round[round] = 0;

    /* if we have files for this rank, specify the round we can
     * send those files in */
    if (scr_bool_have_files(map, id, rank)) {
      scr_hash_setf(send_hash, NULL, "%d %d", rank, round);
    }
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* search for the minimum round we can get our files */
  int retrieve_rank  = -1;
  int retrieve_round = -1;
  scr_hash_elem* elem = NULL;
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the rank id */
    int rank = scr_hash_elem_key_int(elem);

    /* get the round id */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int offered_round = atoi(round_str);

    /* record this round and rank number if it's less than the current round */
    if (offered_round < retrieve_round || retrieve_round == -1) {
      retrieve_round = offered_round;
      retrieve_rank  = rank;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  /* free off our list of ranks */
  scr_free(&ranks);

  /* for some redundancy schemes, we know at this point whether we
   * can recover all files */
  int can_get_files = (retrieve_rank != -1);
  if (red->copy_type != SCR_COPY_XOR && !scr_alltrue(can_get_files)) {
    /* print a debug message indicating which rank is missing files */
    if (! can_get_files) {
      scr_dbg(2, "Cannot find process that has my checkpoint files @ %s:%d",
        __FILE__, __LINE__
      );
    }

    /* release the per-round arrays before bailing out,
     * otherwise they are leaked on this path */
    scr_free(&send_flag_by_round);
    scr_free(&have_rank_by_round);
    return SCR_FAILURE;
  }

  /* get the maximum retrieve round */
  int max_rounds = 0;
  MPI_Allreduce(
    &retrieve_round, &max_rounds, 1, MPI_INT, MPI_MAX, scr_comm_world
  );

  /* tell destination which round we'll take our files in */
  send_hash = scr_hash_new();
  recv_hash = scr_hash_new();
  if (retrieve_rank != -1) {
    scr_hash_setf(send_hash, NULL, "%d %d", retrieve_rank, retrieve_round);
  }
  scr_hash_exchange(send_hash, recv_hash, scr_comm_world);

  /* determine which ranks want to fetch their files from us */
  for (elem = scr_hash_elem_first(recv_hash);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the round id */
    scr_hash* round_hash = scr_hash_elem_hash(elem);
    scr_hash_elem* round_elem = scr_hash_elem_first(round_hash);
    char* round_str = scr_hash_elem_key(round_elem);
    int requested_round = atoi(round_str);

    /* record whether this rank wants its files from us */
    if (requested_round >= 0 && requested_round < nranks) {
      send_flag_by_round[requested_round] = 1;
    }
  }

  /* done with the round hashes, free them off */
  scr_hash_delete(&recv_hash);
  scr_hash_delete(&send_hash);

  int tmp_rc = 0;

  /* run through rounds and exchange files */
  for (round = 0; round <= max_rounds; round++) {
    /* assume we don't need to send or receive any files this round */
    int send_rank = MPI_PROC_NULL;
    int recv_rank = MPI_PROC_NULL;
    int send_num  = 0;
    int recv_num  = 0;

    /* check whether I can potentially send to anyone in this round */
    if (round < nranks) {
      /* have someone's files, check whether they are asking
       * for them this round */
      if (send_flag_by_round[round]) {
        /* need to send files this round, remember to whom and how many */
        int dst_rank = have_rank_by_round[round];
        send_rank = dst_rank;
        send_num  = scr_filemap_num_files(map, id, dst_rank);
      }
    }

    /* if I'm supposed to get my files this round, set the recv_rank */
    if (retrieve_round == round) {
      recv_rank = retrieve_rank;
    }

    /* TODO: another special case is to just move files if the
     * processes are on the same node */

    /* if i'm sending to myself, just move (rename) each file */
    if (send_rank == scr_my_rank_world) {
      /* get our file list */
      int numfiles = 0;
      char** files = NULL;
      scr_filemap_list_files(map, id, send_rank, &numfiles, &files);

      /* TODO: sort files in reverse order by size */

      /* iterate over and rename each file */
      for (i = 0; i < numfiles; i++) {
        /* get the current file name */
        char* file = files[i];

        /* lookup meta data for this file */
        scr_meta* meta = scr_meta_new();
        scr_filemap_get_meta(map, id, send_rank, file, meta);

        /* get the path for this file based on its type
         * and dataset id */
        char* dir = NULL;
        if (scr_meta_check_filetype(meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
          dir = scr_cache_dir_get(red, id);
        } else {
          dir = scr_cache_dir_hidden_get(red, id);
        }

        /* build the new file name */
        scr_path* path_newfile = scr_path_from_str(file);
        scr_path_basename(path_newfile);
        scr_path_prepend_str(path_newfile, dir);
        char* newfile = scr_path_strdup(path_newfile);

        /* if the new file name is different from the old name, rename it */
        if (strcmp(file, newfile) != 0) {
          /* record the new filename to our map and write it to disk */
          scr_filemap_add_file(map, id, send_rank, newfile);
          scr_filemap_set_meta(map, id, send_rank, newfile, meta);
          scr_filemap_write(scr_map_file, map);

          /* rename the file */
          scr_dbg(2, "Round %d: rename(%s, %s)", round, file, newfile);
          tmp_rc = rename(file, newfile);
          if (tmp_rc != 0) {
            /* TODO: to cross mount points, if tmp_rc == EXDEV,
             * open new file, copy, and delete orig */
            scr_err("Moving checkpoint file: rename(%s, %s) %s errno=%d @ %s:%d",
              file, newfile, strerror(errno), errno, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* remove the old name from the filemap and write it to disk */
          scr_filemap_remove_file(map, id, send_rank, file);
          scr_filemap_write(scr_map_file, map);
        }

        /* free the path and string */
        scr_free(&newfile);
        scr_path_delete(&path_newfile);

        /* free directory string */
        scr_free(&dir);

        /* free meta data */
        scr_meta_delete(&meta);
      }

      /* free the list of filename pointers */
      scr_free(&files);
    } else {
      /* if we have files for this round, but the corresponding
       * rank doesn't need them, delete the files */
      if (round < nranks && send_rank == MPI_PROC_NULL) {
        int dst_rank = have_rank_by_round[round];
        scr_unlink_rank(map, id, dst_rank);
      }

      /* sending to and/or receiving from another node */
      if (send_rank != MPI_PROC_NULL || recv_rank != MPI_PROC_NULL) {
        /* have someone to send to or receive from */
        int have_outgoing = 0;
        int have_incoming = 0;
        if (send_rank != MPI_PROC_NULL) {
          have_outgoing = 1;
        }
        if (recv_rank != MPI_PROC_NULL) {
          have_incoming = 1;
        }

        /* first, determine how many files I will be receiving and
         * tell how many I will be sending */
        MPI_Request request[2];
        MPI_Status  status[2];
        int num_req = 0;
        if (have_incoming) {
          MPI_Irecv(
            &recv_num, 1, MPI_INT, recv_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (have_outgoing) {
          MPI_Isend(
            &send_num, 1, MPI_INT, send_rank, 0,
            scr_comm_world, &request[num_req]
          );
          num_req++;
        }
        if (num_req > 0) {
          MPI_Waitall(num_req, request, status);
        }

        /* record how many files I will receive (need to distinguish
         * between 0 files and not knowing) */
        if (have_incoming) {
          scr_filemap_set_expected_files(map, id, scr_my_rank_world, recv_num);
        }

        /* turn off send or receive flags if the file count is 0,
         * nothing else to do */
        if (send_num == 0) {
          have_outgoing = 0;
          send_rank = MPI_PROC_NULL;
        }
        if (recv_num == 0) {
          have_incoming = 0;
          recv_rank = MPI_PROC_NULL;
        }

        /* TODO: since we overwrite files in place in order to avoid
         * running out of storage space, we should sort files in order
         * of descending size for the next step */

        /* get our file list for the destination */
        int numfiles = 0;
        char** files = NULL;
        if (have_outgoing) {
          scr_filemap_list_files(map, id, send_rank, &numfiles, &files);
        }

        /* while we have a file to send or receive ... */
        while (have_incoming || have_outgoing) {
          /* get the filename */
          char* file = NULL;
          scr_meta* send_meta = NULL;
          if (have_outgoing) {
            file = files[numfiles - send_num];
            send_meta = scr_meta_new();
            scr_filemap_get_meta(map, id, send_rank, file, send_meta);
          }

          /* exchange meta data so we can determine type of incoming file */
          scr_meta* recv_meta = scr_meta_new();
          scr_hash_sendrecv(send_meta, send_rank, recv_meta, recv_rank, scr_comm_world);

          /* get the path for this file based on its type and dataset id */
          char* dir = NULL;
          if (have_incoming) {
            if (scr_meta_check_filetype(recv_meta, SCR_META_FILE_USER) == SCR_SUCCESS) {
              dir = scr_cache_dir_get(red, id);
            } else {
              dir = scr_cache_dir_hidden_get(red, id);
            }
          }

          /* exchange file names with partners,
           * building full path of incoming file */
          char file_partner[SCR_MAX_FILENAME];
          scr_swap_file_names(
            file, send_rank, file_partner, sizeof(file_partner), recv_rank,
            dir, scr_comm_world
          );

          /* free directory string */
          scr_free(&dir);

          /* free incoming meta data (we'll get this again later) */
          scr_meta_delete(&recv_meta);

          /* if we'll receive a file, record the name of our file
           * in the filemap and write it to disk */
          recv_meta = NULL;
          if (recv_rank != MPI_PROC_NULL) {
            recv_meta = scr_meta_new();
            scr_filemap_add_file(map, id, scr_my_rank_world, file_partner);
            scr_filemap_write(scr_map_file, map);
          }

          /* either sending or receiving a file this round, since we move files,
           * it will be deleted or overwritten */
          if (scr_swap_files(MOVE_FILES, file, send_meta, send_rank,
              file_partner, recv_meta, recv_rank, scr_comm_world) != SCR_SUCCESS)
          {
            scr_err("Swapping files: %s to %d, %s from %d @ %s:%d",
              file, send_rank, file_partner, recv_rank, __FILE__, __LINE__
            );
            rc = SCR_FAILURE;
          }

          /* if we received a file, record its meta data and decrement
           * our receive count */
          if (have_incoming) {
            /* record meta data for the file we received */
            scr_filemap_set_meta(map, id, scr_my_rank_world, file_partner, recv_meta);
            scr_meta_delete(&recv_meta);

            /* decrement receive count */
            recv_num--;
            if (recv_num == 0) {
              have_incoming = 0;
              recv_rank = MPI_PROC_NULL;
            }
          }

          /* if we sent a file, remove it from the filemap and decrement
           * our send count */
          if (have_outgoing) {
            /* remove file from the filemap */
            scr_filemap_remove_file(map, id, send_rank, file);
            scr_meta_delete(&send_meta);

            /* decrement our send count */
            send_num--;
            if (send_num == 0) {
              have_outgoing = 0;
              send_rank = MPI_PROC_NULL;
            }
          }

          /* update filemap on disk */
          scr_filemap_write(scr_map_file, map);
        }

        /* free our file list */
        scr_free(&files);
      }
    }
  }

  /* if we have more rounds than max rounds, delete the remainder of our files */
  for (round = max_rounds+1; round < nranks; round++) {
    /* have someone's files for this round, so delete them */
    int dst_rank = have_rank_by_round[round];
    scr_unlink_rank(map, id, dst_rank);
  }

  scr_free(&send_flag_by_round);
  scr_free(&have_rank_by_round);

  /* write out new filemap and free the memory resources */
  scr_filemap_write(scr_map_file, map);

  /* clean out any incomplete files */
  scr_cache_clean(map);

  /* TODO: if the exchange or redundancy rebuild failed,
   * we should also delete any *good* files we received */

  /* return whether distribute succeeded, it does not ensure we have
   * all of our files, only that the transfer completed without failure */
  return rc;
}
/* apply redundancy scheme to file and return number of bytes copied
 * in bytes parameter
 *
 *   map   - filemap listing this process's files for the dataset, it is
 *           updated with the expected file count and rewritten to
 *           scr_map_file
 *   c     - redundancy descriptor, its copy_type selects the scheme
 *   id    - dataset id
 *   bytes - output, set to 0 on entry and then to the sum over
 *           scr_comm_world of the sizes of each process's files
 *
 * returns SCR_SUCCESS only if every process found all of its files valid
 * and every process succeeded in applying the scheme, SCR_FAILURE otherwise
 * (bytes is left at 0 when returning early because a file is invalid) */
int scr_reddesc_apply(scr_filemap* map, const scr_reddesc* c, int id, double* bytes)
{
  /* initialize to 0 */
  *bytes = 0.0;

  /* step through each of my files for the specified dataset
   * to scan for any incomplete files */
  int valid = 1;
  double my_bytes = 0.0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    char* file = scr_hash_elem_key(file_elem);

    /* check the file */
    if (! scr_bool_have_file(map, id, scr_my_rank_world, file, scr_ranks_world)) {
      scr_dbg(2, "File determined to be invalid: %s", file);
      valid = 0;
    }

    /* add up the number of bytes on our way through */
    my_bytes += (double) scr_file_size(file);

    /* if crc_on_copy is set, compute crc and update meta file
     * (PARTNER does this during the copy) */
    if (scr_crc_on_copy && c->copy_type != SCR_COPY_PARTNER) {
      scr_compute_crc(map, id, scr_my_rank_world, file);
    }
  }

  /* determine whether everyone's files are good */
  int all_valid = scr_alltrue(valid);
  if (! all_valid) {
    if (scr_my_rank_world == 0) {
      scr_dbg(1, "Exiting copy since one or more checkpoint files is invalid");
    }
    return SCR_FAILURE;
  }

  /* start timer, these values are only set and read on rank 0,
   * initialize them so they are defined on every rank */
  time_t timestamp_start = 0;
  double time_start = 0.0;
  if (scr_my_rank_world == 0) {
    timestamp_start = scr_log_seconds();
    time_start = MPI_Wtime();
  }

  /* apply the redundancy scheme, an unrecognized copy type
   * leaves rc at SCR_FAILURE */
  int rc = SCR_FAILURE;
  switch (c->copy_type) {
  case SCR_COPY_SINGLE:
    rc = SCR_SUCCESS;
    break;
  case SCR_COPY_PARTNER:
    rc = scr_reddesc_apply_partner(map, c, id);
    break;
  case SCR_COPY_XOR:
    rc = scr_reddesc_apply_xor(map, c, id);
    break;
  default:
    break;
  }

  /* record the number of files this task wrote during this dataset
   * (need to remember when a task writes 0 files) */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  scr_filemap_set_expected_files(map, id, scr_my_rank_world, num_files);
  scr_filemap_write(scr_map_file, map);

  /* determine whether everyone succeeded in their copy */
  int valid_copy = (rc == SCR_SUCCESS);
  if (! valid_copy) {
    scr_err("scr_copy_files failed with return code %d @ %s:%d",
      rc, __FILE__, __LINE__
    );
  }
  int all_valid_copy = scr_alltrue(valid_copy);
  rc = all_valid_copy ? SCR_SUCCESS : SCR_FAILURE;

  /* add up total number of bytes */
  MPI_Allreduce(&my_bytes, bytes, 1, MPI_DOUBLE, MPI_SUM, scr_comm_world);

  /* stop timer and report performance info */
  if (scr_my_rank_world == 0) {
    double time_end  = MPI_Wtime();
    double time_diff = time_end - time_start;
    double bw = *bytes / (1024.0 * 1024.0 * time_diff);
    scr_dbg(1, "scr_reddesc_apply: %f secs, %e bytes, %f MB/s, %f MB/s per proc",
      time_diff, *bytes, bw, bw/scr_ranks_world
    );

    /* log data on the copy in the database */
    if (scr_log_enable) {
      char* dir = scr_cache_dir_get(c, id);
      scr_log_transfer("COPY", c->base, dir, &id, &timestamp_start, &time_diff, bytes);
      scr_free(&dir);
    }
  }

  return rc;
}