/* log reason and time for halting current run */ int scr_log_halt(const char* reason, const int* dset) { int rc = SCR_SUCCESS; if (scr_db_enable) { time_t now = scr_log_seconds(); rc = scr_mysql_log_event("HALT", reason, dset, &now, NULL); } return rc; }
int main (int argc, char *argv[]) { int rc = 0; /* process command line arguments */ struct arglist args; if (!processArgs(argc, argv, &args)) { scr_err("scr_log_event: Failed to process args @ %s:%d", __FILE__, __LINE__ ); return 1; } char* value = NULL; /* read in job parameters (if user didn't specify them) */ if (args.username == NULL) { if ((value = getenv("USER")) != NULL) { args.username = strdup(value); } } if (args.jobname == NULL) { if ((value = getenv("SCR_JOB_NAME")) != NULL) { args.jobname = strdup(value); } } if (args.start == 0) { args.start = scr_log_seconds(); } /* read in log parameters */ scr_param_init(); if ((value = scr_param_get("SCR_LOG_ENABLE")) != NULL) { scr_log_enable = atoi(value); } scr_param_finalize(); if (scr_log_enable) { /* init logging */ if (scr_log_init() == SCR_SUCCESS) { /* register job */ if (args.username != NULL && args.jobname != NULL) { if (scr_log_job(args.username, args.jobname, args.start) != SCR_SUCCESS) { scr_err("scr_log_event: Failed to register job, disabling logging @ %s:%d", __FILE__, __LINE__ ); scr_log_enable = 0; rc = 1; } } else { scr_err("scr_log_event: Missing username, jobname, or start time, disabling logging @ %s:%d", __FILE__, __LINE__ ); scr_log_enable = 0; rc = 1; } } else { scr_err("scr_log_event: Failed to initialize SCR logging, disabling logging @ %s:%d", __FILE__, __LINE__ ); scr_log_enable = 0; rc = 1; } } if (scr_log_enable) { if (scr_log_event(args.event_type, args.event_note, args.event_dset, args.event_start, args.event_secs) != SCR_SUCCESS) { rc = 1; } } if (scr_log_enable) { scr_log_finalize(); } return rc; }
/* distribute and rebuild files in cache */ int scr_cache_rebuild(scr_filemap* map) { int rc = SCR_FAILURE; double time_start, time_end, time_diff; /* start timer */ time_t time_t_start; if (scr_my_rank_world == 0) { time_t_start = scr_log_seconds(); time_start = MPI_Wtime(); } /* we set this variable to 1 if we actually try to distribute * files for a restart */ int distribute_attempted = 0; /* clean any incomplete files from our cache */ scr_cache_clean(map); /* get ordered list of datasets we have in our cache */ int ndsets; int* dsets; scr_filemap_list_datasets(map, &ndsets, &dsets); /* TODO: put dataset selection logic into a function */ /* TODO: also attempt to recover datasets which we were in the * middle of flushing */ int current_id; int dset_index = 0; do { /* get the smallest index across all processes (returned in current_id), * this also updates our dset_index value if appropriate */ scr_next_dataset(ndsets, dsets, &dset_index, ¤t_id); /* if we found a dataset, try to distribute and rebuild it */ if (current_id != -1) { /* remember that we made an attempt to distribute at least one dataset */ distribute_attempted = 1; /* log the attempt */ if (scr_my_rank_world == 0) { scr_dbg(1, "Attempting to distribute and rebuild dataset %d", current_id); if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("REBUILD STARTED", NULL, ¤t_id, &now, NULL); } } /* distribute dataset descriptor for this dataset */ int rebuild_succeeded = 0; if (scr_distribute_datasets(map, current_id) == SCR_SUCCESS) { /* distribute redundancy descriptor for this dataset */ scr_reddesc reddesc; if (scr_distribute_reddescs(map, current_id, &reddesc) == SCR_SUCCESS) { /* create a directory for this dataset */ scr_cache_dir_create(&reddesc, current_id); /* distribute the files for this dataset */ scr_distribute_files(map, &reddesc, current_id); /* rebuild files for this dataset */ int tmp_rc = scr_reddesc_recover(map, &reddesc, current_id); if (tmp_rc == SCR_SUCCESS) { /* rebuild succeeded */ rebuild_succeeded = 1; /* if we rebuild any checkpoint, return success */ rc = SCR_SUCCESS; /* update scr_dataset_id */ if (current_id > scr_dataset_id) { scr_dataset_id = current_id; } /* TODO: dataset may not be a checkpoint */ /* update scr_checkpoint_id */ if (current_id > scr_checkpoint_id) { scr_checkpoint_id = current_id; } /* update our flush file to indicate this dataset is in cache */ scr_flush_file_location_set(current_id, SCR_FLUSH_KEY_LOCATION_CACHE); /* TODO: if storing flush file in control directory on each node, * if we find any process that has marked the dataset as flushed, * marked it as flushed in every flush file */ /* TODO: would like to restore flushing status to datasets that * were in the middle of a flush, but we need to better manage * the transfer file to do this, so for now just forget about * flushing this dataset */ scr_flush_file_location_unset(current_id, SCR_FLUSH_KEY_LOCATION_FLUSHING); } /* free redundancy descriptor */ scr_reddesc_free(&reddesc); } } /* if the distribute or rebuild failed, delete the dataset */ if (! rebuild_succeeded) { /* log that we failed */ if (scr_my_rank_world == 0) { scr_dbg(1, "Failed to rebuild dataset %d", current_id); if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("REBUILD FAILED", NULL, ¤t_id, &now, NULL); } } /* TODO: there is a bug here, since scr_cache_delete needs to read * the redundancy descriptor from the filemap in order to delete the * cache directory, but we may have failed to distribute the reddescs * above so not every task has one */ /* rebuild failed, delete this dataset from cache */ scr_cache_delete(map, current_id); } else { /* rebuid worked, log success */ if (scr_my_rank_world == 0) { scr_dbg(1, "Rebuilt dataset %d", current_id); if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("REBUILD SUCCEEDED", NULL, ¤t_id, &now, NULL); } } } } } while (current_id != -1); /* stop timer and report performance */ if (scr_my_rank_world == 0) { time_end = MPI_Wtime(); time_diff = time_end - time_start; if (distribute_attempted) { if (rc == SCR_SUCCESS) { scr_dbg(1, "Scalable restart succeeded for checkpoint %d, took %f secs", scr_checkpoint_id, time_diff ); if (scr_log_enable) { scr_log_event("RESTART SUCCEEDED", NULL, &scr_checkpoint_id, &time_t_start, &time_diff); } } else { /* scr_checkpoint_id is not defined */ scr_dbg(1, "Scalable restart failed, took %f secs", time_diff); if (scr_log_enable) { scr_log_event("RESTART FAILED", NULL, NULL, &time_t_start, &time_diff); } } } } /* free our list of dataset ids */ scr_free(&dsets); return rc; }
/* flush files from cache to parallel file system under SCR_PREFIX */ int scr_flush_sync(scr_filemap* map, int id) { int flushed = SCR_SUCCESS; /* if user has disabled flush, return failure */ if (scr_flush <= 0) { return SCR_FAILURE; } /* if we don't need a flush, return right away with success */ if (! scr_flush_file_need_flush(id)) { return SCR_SUCCESS; } /* this may take a while, so tell user what we're doing */ if (scr_my_rank_world == 0) { scr_dbg(1, "Initiating flush of dataset %d", id); } /* make sure all processes make it this far before progressing */ MPI_Barrier(scr_comm_world); /* start timer */ time_t timestamp_start; double time_start; if (scr_my_rank_world == 0) { timestamp_start = scr_log_seconds(); time_start = MPI_Wtime(); } /* if we are flushing something asynchronously, wait on it */ if (scr_flush_async_in_progress) { scr_flush_async_wait(map); /* the flush we just waited on could be the requested dataset, * so perhaps we're already done */ if (! scr_flush_file_need_flush(id)) { return SCR_SUCCESS; } } /* log the flush start */ if (scr_my_rank_world == 0) { if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("FLUSH STARTED", NULL, &id, &now, NULL); } } /* mark in the flush file that we are flushing the dataset */ scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_SYNC_FLUSHING); /* get list of files to flush, identify containers, * create directories, and create container files */ scr_hash* file_list = scr_hash_new(); if (scr_flush_prepare(map, id, file_list) != SCR_SUCCESS) { flushed = SCR_FAILURE; } /* write the data out to files */ scr_hash* data = scr_hash_new(); if (scr_flush_data(file_list, data) != SCR_SUCCESS) { flushed = SCR_FAILURE; } /* write summary file */ if (scr_flush_complete(id, file_list, data) != SCR_SUCCESS) { flushed = SCR_FAILURE; } /* get number of bytes for this dataset */ double total_bytes = 0.0; if (scr_my_rank_world == 0) { if (flushed == SCR_SUCCESS) { /* get the dataset corresponding to this id */ scr_dataset* dataset = scr_dataset_new(); scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset); /* get the number of bytes in the dataset */ unsigned long dataset_bytes; if (scr_dataset_get_size(dataset, &dataset_bytes) == SCR_SUCCESS) { total_bytes = (double) dataset_bytes; } /* delete the dataset object */ scr_dataset_delete(&dataset); } } /* free data structures */ scr_hash_delete(&data); scr_hash_delete(&file_list); /* remove sync flushing marker from flush file */ scr_flush_file_location_unset(id, SCR_FLUSH_KEY_LOCATION_SYNC_FLUSHING); /* stop timer, compute bandwidth, and report performance */ if (scr_my_rank_world == 0) { /* stop timer and compute bandwidth */ double time_end = MPI_Wtime(); double time_diff = time_end - time_start; double bw = total_bytes / (1024.0 * 1024.0 * time_diff); scr_dbg(1, "scr_flush_sync: %f secs, %e bytes, %f MB/s, %f MB/s per proc", time_diff, total_bytes, bw, bw/scr_ranks_world ); /* log messages about flush */ if (flushed == SCR_SUCCESS) { /* the flush worked, print a debug message */ scr_dbg(1, "scr_flush_sync: Flush of dataset %d succeeded", id); /* log details of flush */ if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("FLUSH SUCCEEDED", NULL, &id, &now, &time_diff); } } else { /* the flush failed, this is more serious so print an error message */ scr_err("scr_flush_sync: Flush of dataset %d failed", id); /* log details of flush */ if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("FLUSH FAILED", NULL, &id, &now, &time_diff); } } } return flushed; }
/* fetch files from parallel file system */ static int scr_fetch_files( scr_filemap* map, scr_path* fetch_path, int* dataset_id, int* checkpoint_id) { /* get fetch directory as string */ char* fetch_dir = scr_path_strdup(fetch_path); /* this may take a while, so tell user what we're doing */ if (scr_my_rank_world == 0) { scr_dbg(1, "Attempting fetch from %s", fetch_dir); } /* make sure all processes make it this far before progressing */ MPI_Barrier(scr_comm_world); /* start timer */ time_t timestamp_start; double time_start; if (scr_my_rank_world == 0) { timestamp_start = scr_log_seconds(); time_start = MPI_Wtime(); } /* log the fetch attempt */ if (scr_my_rank_world == 0) { if (scr_log_enable) { time_t now = scr_log_seconds(); scr_log_event("FETCH STARTED", fetch_dir, NULL, &now, NULL); } } /* allocate a new hash to get a list of files to fetch */ scr_hash* file_list = scr_hash_new(); /* read the summary file */ if (scr_fetch_summary(fetch_dir, file_list) != SCR_SUCCESS) { if (scr_my_rank_world == 0) { scr_dbg(1, "Failed to read summary file @ %s:%d", __FILE__, __LINE__); if (scr_log_enable) { double time_end = MPI_Wtime(); double time_diff = time_end - time_start; time_t now = scr_log_seconds(); scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff); } } scr_hash_delete(&file_list); scr_free(&fetch_dir); return SCR_FAILURE; } /* get a pointer to the dataset */ scr_dataset* dataset = scr_hash_get(file_list, SCR_KEY_DATASET); /* get the dataset id */ int id; if (scr_dataset_get_id(dataset, &id) != SCR_SUCCESS) { if (scr_my_rank_world == 0) { scr_dbg(1, "Invalid id in summary file @ %s:%d", __FILE__, __LINE__); if (scr_log_enable) { double time_end = MPI_Wtime(); double time_diff = time_end - time_start; time_t now = scr_log_seconds(); scr_log_event("FETCH FAILED", fetch_dir, NULL, &now, &time_diff); } } scr_hash_delete(&file_list); scr_free(&fetch_dir); return SCR_FAILURE; } /* get the checkpoint id for this dataset */ int ckpt_id; if (scr_dataset_get_ckpt(dataset, &ckpt_id) != SCR_SUCCESS) { /* eventually, we'll support reading of non-checkpoint datasets, * but we don't yet */ scr_err("Failed to read checkpoint id from dataset @ %s:%d", __FILE__, __LINE__ ); scr_hash_delete(&file_list); scr_free(&fetch_dir); return SCR_FAILURE; } /* delete any existing files for this dataset id (do this before * filemap_read) */ scr_cache_delete(map, id); /* get the redundancy descriptor for this id */ scr_reddesc* c = scr_reddesc_for_checkpoint(ckpt_id, scr_nreddescs, scr_reddescs); /* store our redundancy descriptor hash in the filemap */ scr_hash* my_desc_hash = scr_hash_new(); scr_reddesc_store_to_hash(c, my_desc_hash); scr_filemap_set_desc(map, id, scr_my_rank_world, my_desc_hash); scr_hash_delete(&my_desc_hash); /* write the filemap out before creating the directory */ scr_filemap_write(scr_map_file, map); /* create the cache directory */ scr_cache_dir_create(c, id); /* get the cache directory */ char cache_dir[SCR_MAX_FILENAME]; scr_cache_dir_get(c, id, cache_dir); /* now we can finally fetch the actual files */ int success = 1; if (scr_fetch_data(file_list, cache_dir, map) != SCR_SUCCESS) { success = 0; } /* free the hash holding the summary file data */ scr_hash_delete(&file_list); /* check that all processes copied their file successfully */ if (! scr_alltrue(success)) { /* someone failed, so let's delete the partial checkpoint */ scr_cache_delete(map, id); if (scr_my_rank_world == 0) { scr_dbg(1, "One or more processes failed to read its files @ %s:%d", __FILE__, __LINE__ ); if (scr_log_enable) { double time_end = MPI_Wtime(); double time_diff = time_end - time_start; time_t now = scr_log_seconds(); scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff); } } scr_free(&fetch_dir); return SCR_FAILURE; } /* apply redundancy scheme */ double bytes_copied = 0.0; int rc = scr_reddesc_apply(map, c, id, &bytes_copied); if (rc == SCR_SUCCESS) { /* record dataset and checkpoint ids */ *dataset_id = id; *checkpoint_id = ckpt_id; /* update our flush file to indicate this checkpoint is in cache * as well as the parallel file system */ /* TODO: should we place SCR_FLUSH_KEY_LOCATION_PFS before * scr_reddesc_apply? */ scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_CACHE); scr_flush_file_location_set(id, SCR_FLUSH_KEY_LOCATION_PFS); scr_flush_file_location_unset(id, SCR_FLUSH_KEY_LOCATION_FLUSHING); } else { /* something went wrong, so delete this checkpoint from the cache */ scr_cache_delete(scr_map, id); } /* stop timer, compute bandwidth, and report performance */ double total_bytes = bytes_copied; if (scr_my_rank_world == 0) { double time_end = MPI_Wtime(); double time_diff = time_end - time_start; double bw = total_bytes / (1024.0 * 1024.0 * time_diff); scr_dbg(1, "scr_fetch_files: %f secs, %e bytes, %f MB/s, %f MB/s per proc", time_diff, total_bytes, bw, bw/scr_ranks_world ); /* log data on the fetch to the database */ if (scr_log_enable) { time_t now = scr_log_seconds(); if (rc == SCR_SUCCESS) { scr_log_event("FETCH SUCCEEDED", fetch_dir, &id, &now, &time_diff); } else { scr_log_event("FETCH FAILED", fetch_dir, &id, &now, &time_diff); } char cache_dir[SCR_MAX_FILENAME]; scr_cache_dir_get(c, id, cache_dir); scr_log_transfer("FETCH", fetch_dir, cache_dir, &id, ×tamp_start, &time_diff, &total_bytes ); } } /* free fetch direcotry string */ scr_free(&fetch_dir); return rc; }
/* apply redundancy scheme to file and return number of bytes copied * in bytes parameter */ int scr_reddesc_apply( scr_filemap* map, const scr_reddesc* c, int id, double* bytes) { /* initialize to 0 */ *bytes = 0.0; /* step through each of my files for the specified dataset * to scan for any incomplete files */ int valid = 1; double my_bytes = 0.0; scr_hash_elem* file_elem; for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world); file_elem != NULL; file_elem = scr_hash_elem_next(file_elem)) { /* get the filename */ char* file = scr_hash_elem_key(file_elem); /* check the file */ if (! scr_bool_have_file(map, id, scr_my_rank_world, file, scr_ranks_world)) { scr_dbg(2, "File determined to be invalid: %s", file); valid = 0; } /* add up the number of bytes on our way through */ my_bytes += (double) scr_file_size(file); /* if crc_on_copy is set, compute crc and update meta file * (PARTNER does this during the copy) */ if (scr_crc_on_copy && c->copy_type != SCR_COPY_PARTNER) { scr_compute_crc(map, id, scr_my_rank_world, file); } } /* determine whether everyone's files are good */ int all_valid = scr_alltrue(valid); if (! all_valid) { if (scr_my_rank_world == 0) { scr_dbg(1, "Exiting copy since one or more checkpoint files is invalid"); } return SCR_FAILURE; } /* start timer */ time_t timestamp_start; double time_start; if (scr_my_rank_world == 0) { timestamp_start = scr_log_seconds(); time_start = MPI_Wtime(); } /* apply the redundancy scheme */ int rc = SCR_FAILURE; switch (c->copy_type) { case SCR_COPY_SINGLE: rc = SCR_SUCCESS; break; case SCR_COPY_PARTNER: rc = scr_reddesc_apply_partner(map, c, id); break; case SCR_COPY_XOR: rc = scr_reddesc_apply_xor(map, c, id); break; } /* record the number of files this task wrote during this dataset * (need to remember when a task writes 0 files) */ int num_files = scr_filemap_num_files(map, id, scr_my_rank_world); scr_filemap_set_expected_files(map, id, scr_my_rank_world, num_files); scr_filemap_write(scr_map_file, map); /* determine whether everyone succeeded in their copy */ int valid_copy = (rc == SCR_SUCCESS); if (! valid_copy) { scr_err("scr_copy_files failed with return code %d @ %s:%d", rc, __FILE__, __LINE__ ); } int all_valid_copy = scr_alltrue(valid_copy); rc = all_valid_copy ? SCR_SUCCESS : SCR_FAILURE; /* add up total number of bytes */ MPI_Allreduce(&my_bytes, bytes, 1, MPI_DOUBLE, MPI_SUM, scr_comm_world); /* stop timer and report performance info */ if (scr_my_rank_world == 0) { double time_end = MPI_Wtime(); double time_diff = time_end - time_start; double bw = *bytes / (1024.0 * 1024.0 * time_diff); scr_dbg(1, "scr_reddesc_apply: %f secs, %e bytes, %f MB/s, %f MB/s per proc", time_diff, *bytes, bw, bw/scr_ranks_world ); /* log data on the copy in the database */ if (scr_log_enable) { char* dir = scr_cache_dir_get(c, id); scr_log_transfer("COPY", c->base, dir, &id, ×tamp_start, &time_diff, bytes); scr_free(&dir); } } return rc; }