/*
 * Write a human-readable summary of a finished workflow run to an optional
 * file and/or send it by email via "sendmail -t".
 *
 * d                - the completed dag (read-only here).
 * filename         - path for the summary file, or NULL to skip.
 * email_summary_to - recipient address, or NULL to skip email.
 * runtime          - total wall-clock runtime in microseconds.
 * time_completed   - completion timestamp (microseconds).
 * argc/argv        - the original command line, echoed into the summary.
 * dagfile          - path of the workflow file being summarized.
 * remote_queue     - batch queue used to stat output files for their sizes.
 * abort_flag/failed_flag - final workflow status selectors.
 */
void makeflow_summary_create(struct dag *d, const char *filename, const char *email_summary_to, timestamp_t runtime, timestamp_t time_completed, int argc, char *argv[], const char *dagfile, struct batch_queue *remote_queue, int abort_flag, int failed_flag)
{
	char buffer[50];

	FILE *summary_file = NULL;
	FILE *summary_email = NULL;

	if(filename)
		summary_file = fopen(filename, "w");

	if(email_summary_to) {
		summary_email = popen("sendmail -t", "w");
		if(summary_email) {
			fprintf(summary_email, "To: %s\n", email_summary_to);
			timestamp_fmt(buffer, 50, "%c", time_completed);
			fprintf(summary_email, "Subject: Makeflow Run Summary - %s \n", buffer);
		} else {
			/* Previously a failed popen led to fprintf(NULL,...) and a crash. */
			fprintf(stderr, "makeflow: could not run sendmail to email summary: %s\n", strerror(errno));
		}
	}

	/* Echo the original command line. */
	int i;
	for(i = 0; i < argc; i++)
		summarize(summary_file, summary_email, "%s ", argv[i]);
	summarize(summary_file, summary_email, "\n");

	if(abort_flag)
		summarize(summary_file, summary_email, "Workflow aborted:\t ");
	else if(failed_flag)
		summarize(summary_file, summary_email, "Workflow failed:\t ");
	else
		summarize(summary_file, summary_email, "Workflow completed:\t ");
	timestamp_fmt(buffer, 50, "%c\n", time_completed);
	summarize(summary_file, summary_email, "%s", buffer);

	/* Convert the microsecond runtime into H:MM:SS. */
	int seconds = runtime / 1000000;
	int hours = seconds / 3600;
	int minutes = (seconds - hours * 3600) / 60;
	seconds = seconds - hours * 3600 - minutes * 60;

	summarize(summary_file, summary_email, "Total runtime:\t\t %d:%02d:%02d\n", hours, minutes, seconds);
	summarize(summary_file, summary_email, "Workflow file:\t\t %s\n", dagfile);

	struct dag_node *n;
	struct dag_file *f;
	const char *fn;
	dag_node_state_t state;
	struct list *output_files = list_create();
	struct list *failed_tasks = list_create();
	int total_tasks = itable_size(d->node_table);
	int tasks_completed = 0;
	int tasks_aborted = 0;
	int tasks_unrun = 0;

	for(n = d->nodes; n; n = n->next) {
		state = n->state;
		if(state == DAG_NODE_STATE_FAILED) {
			/* Deduplicate failed commands by the command string itself.
			 * (The old code compared against an uninitialized `fn` here.) */
			if(!list_find(failed_tasks, (int (*)(void *, const void *)) string_equal, (void *) n->command))
				list_push_tail(failed_tasks, (void *) n->command);
		} else if(state == DAG_NODE_STATE_ABORTED) {
			tasks_aborted++;
		} else if(state == DAG_NODE_STATE_COMPLETE) {
			tasks_completed++;
			list_first_item(n->source_files);
			while((f = list_next_item(n->source_files))) {
				fn = f->filename;
				if(!list_find(output_files, (int (*)(void *, const void *)) string_equal, (void *) fn))
					list_push_tail(output_files, (void *) fn);
			}
		} else {
			tasks_unrun++;
		}
	}

	summarize(summary_file, summary_email, "Number of tasks:\t %d\n", total_tasks);
	summarize(summary_file, summary_email, "Completed tasks:\t %d/%d\n", tasks_completed, total_tasks);
	if(tasks_aborted != 0)
		summarize(summary_file, summary_email, "Aborted tasks:\t %d/%d\n", tasks_aborted, total_tasks);
	if(tasks_unrun != 0)
		summarize(summary_file, summary_email, "Tasks not run:\t\t %d/%d\n", tasks_unrun, total_tasks);
	if(list_size(failed_tasks) > 0)
		summarize(summary_file, summary_email, "Failed tasks:\t\t %d/%d\n", list_size(failed_tasks), total_tasks);
	for(list_first_item(failed_tasks); (fn = list_next_item(failed_tasks)) != NULL;)
		summarize(summary_file, summary_email, "\t%s\n", fn);

	if(list_size(output_files) > 0) {
		summarize(summary_file, summary_email, "Output files:\n");
		for(list_first_item(output_files); (fn = list_next_item(output_files)) != NULL;) {
			struct stat buf;
			if(batch_fs_stat(remote_queue, fn, &buf) >= 0) {
				const char *size = string_metric(buf.st_size, -1, NULL);
				summarize(summary_file, summary_email, "\t%s\t%s\n", fn, size);
			} else {
				/* Stat failed: report the name without a bogus size
				 * (previously buf.st_size was read uninitialized). */
				summarize(summary_file, summary_email, "\t%s\n", fn);
			}
		}
	}

	/* NOTE(review): list_free() frees each stored pointer, but output_files
	 * holds filenames owned by the dag and failed_tasks holds node commands;
	 * this matches the original code, but confirm ownership before reuse. */
	list_free(output_files);
	list_delete(output_files);
	list_free(failed_tasks);
	list_delete(failed_tasks);

	if(filename) {
		fprintf(stderr, "writing summary to %s.\n", filename);
		fclose(summary_file);
	}

	if(email_summary_to) {
		fprintf(stderr, "emailing summary to %s.\n", email_summary_to);
		/* Streams opened with popen() must be closed with pclose(),
		 * not fclose(), so the child process is reaped. */
		pclose(summary_email);
	}
}
/** Recover workflow state from an existing transaction log, then reopen the
 * log for appending. On the first run (no log yet) and in verbose mode, the
 * full node structure is written to the log as comment records.
 *
 * The clean_mode variable was added so that we could better print out error
 * messages appropriate to the situation. Currently it is only used to silence
 * node rerun checking.
 */
void makeflow_log_recover(struct dag *d, const char *filename, int verbose_mode, struct batch_queue *queue, makeflow_clean_depth clean_mode)
{
	char *line, *name, file[MAX_BUFFER_SIZE];
	int nodeid, state, jobid, file_state;
	int first_run = 1;
	struct dag_node *n;
	struct dag_file *f;
	struct stat buf;
	timestamp_t previous_completion_time;

	/* Replay an existing log, if any, restoring file and node states. */
	d->logfile = fopen(filename, "r");
	if(d->logfile) {
		int linenum = 0;
		first_run = 0;

		printf("recovering from log file %s...\n", filename);

		while((line = get_line(d->logfile))) {
			linenum++;

			/* File-state record: "# <state> <filename> <timestamp-usec>".
			 * NOTE(review): the unbounded %s can overflow file[] if the log
			 * holds a path >= MAX_BUFFER_SIZE; consider a width specifier. */
			if(sscanf(line, "# %d %s %" SCNu64 "", &file_state, file, &previous_completion_time) == 3) {
				f = dag_file_lookup_or_create(d, file);
				f->state = file_state;
				if(file_state == DAG_FILE_STATE_EXISTS) {
					d->completed_files += 1;
					f->creation_logged = (time_t) (previous_completion_time / 1000000);
				} else if(file_state == DAG_FILE_STATE_DELETE) {
					d->deleted_files += 1;
				}
				continue;
			}

			/* Any other comment line is ignored. */
			if(line[0] == '#')
				continue;

			/* Node-state record: "<timestamp-usec> <nodeid> <state> <jobid> ...". */
			if(sscanf(line, "%" SCNu64 " %d %d %d", &previous_completion_time, &nodeid, &state, &jobid) == 4) {
				n = itable_lookup(d->node_table, nodeid);
				if(n) {
					n->state = state;
					n->jobid = jobid;
					/* Log timestamp is in microseconds; we need seconds for diff. */
					n->previous_completion = (time_t) (previous_completion_time / 1000000);
					continue;
				}
			}

			fprintf(stderr, "makeflow: %s appears to be corrupted on line %d\n", filename, linenum);
			exit(1);
		}
		fclose(d->logfile);
	}

	/* Reopen the log for appending, line-buffered so records land promptly. */
	d->logfile = fopen(filename, "a");
	if(!d->logfile) {
		fprintf(stderr, "makeflow: couldn't open logfile %s: %s\n", filename, strerror(errno));
		exit(1);
	}
	if(setvbuf(d->logfile, NULL, _IOLBF, BUFSIZ) != 0) {
		fprintf(stderr, "makeflow: couldn't set line buffer on logfile %s: %s\n", filename, strerror(errno));
		exit(1);
	}

	/* On a fresh log, optionally dump the dag structure as comment records. */
	if(first_run && verbose_mode) {
		struct dag_file *sf;
		struct dag_node *p;

		for(n = d->nodes; n; n = n->next) {
			/* Record node information to log */
			fprintf(d->logfile, "# NODE\t%d\t%s\n", n->nodeid, n->command);

			/* Record the node category to the log */
			fprintf(d->logfile, "# CATEGORY\t%d\t%s\n", n->nodeid, n->category->label);
			fprintf(d->logfile, "# SYMBOL\t%d\t%s\n", n->nodeid, n->category->label);	/* also write the SYMBOL as alias of CATEGORY, deprecated. */

			/* Record node parents to log */
			fprintf(d->logfile, "# PARENTS\t%d", n->nodeid);
			list_first_item(n->source_files);
			while((sf = list_next_item(n->source_files))) {
				p = sf->created_by;
				if(p)
					fprintf(d->logfile, "\t%d", p->nodeid);
			}
			fputc('\n', d->logfile);

			/* Record node inputs to log */
			fprintf(d->logfile, "# SOURCES\t%d", n->nodeid);
			list_first_item(n->source_files);
			while((sf = list_next_item(n->source_files))) {
				fprintf(d->logfile, "\t%s", sf->filename);
			}
			fputc('\n', d->logfile);

			/* Record node outputs to log */
			fprintf(d->logfile, "# TARGETS\t%d", n->nodeid);
			list_first_item(n->target_files);
			while((sf = list_next_item(n->target_files))) {
				fprintf(d->logfile, "\t%s", sf->filename);
			}
			fputc('\n', d->logfile);

			/* Record translated command to log */
			fprintf(d->logfile, "# COMMAND\t%d\t%s\n", n->nodeid, n->command);
		}
	}

	dag_count_states(d);

	/* Check for log consistency: every file the log claims exists must still
	 * be present and unmodified since it was created. */
	if(!first_run) {
		hash_table_firstkey(d->files);
		while(hash_table_nextkey(d->files, &name, (void **) &f)) {
			if(!dag_file_should_exist(f) || dag_file_is_source(f))
				continue;

			/* Only inspect buf after a successful stat of this file.
			 * (The old code read buf.st_mode / buf.st_mtime even when
			 * batch_fs_stat had not been called or had failed.) */
			if(batch_fs_stat(queue, f->filename, &buf) < 0) {
				fprintf(stderr, "makeflow: %s is reported as existing, but does not exist.\n", f->filename);
				makeflow_log_file_state_change(d, f, DAG_FILE_STATE_UNKNOWN);
				continue;
			}

			/* Directories are not checked for modification times. */
			if(S_ISDIR(buf.st_mode))
				continue;

			if(difftime(buf.st_mtime, f->creation_logged) > 0) {
				fprintf(stderr, "makeflow: %s is reported as existing, but has been modified (%" SCNu64 " ,%" SCNu64 ").\n", f->filename, (uint64_t) buf.st_mtime, (uint64_t) f->creation_logged);
				makeflow_clean_file(d, queue, f, 0);
				makeflow_log_file_state_change(d, f, DAG_FILE_STATE_UNKNOWN);
			}
		}
	}

	/* When cleaning, suppress the rerun chatter. */
	int silent = 0;
	if(clean_mode != MAKEFLOW_CLEAN_NONE)
		silent = 1;

	/* Decide rerun tasks */
	if(!first_run) {
		struct itable *rerun_table = itable_create(0);
		for(n = d->nodes; n; n = n->next) {
			makeflow_node_decide_rerun(rerun_table, d, n, silent);
		}
		itable_delete(rerun_table);
	}

	/* Update file reference counts from nodes already completed in the log. */
	for(n = d->nodes; n; n = n->next) {
		if(n->state == DAG_NODE_STATE_COMPLETE) {
			struct dag_file *df;
			list_first_item(n->source_files);
			while((df = list_next_item(n->source_files)))
				df->ref_count--;
		}
	}
}