void makeflow_wrapper_delete(struct makeflow_wrapper *w)
{
	if(w->command)
		free(w->command);

	list_free(w->input_files);
	list_delete(w->input_files);

	list_free(w->output_files);
	list_delete(w->output_files);

	if(w->uses_remote_rename) {
		uint64_t f;
		char *remote;
		itable_firstkey(w->remote_names);
		while(itable_nextkey(w->remote_names, &f, (void **) &remote)) {
			free(remote);
		}
	}
	itable_delete(w->remote_names);

	hash_table_delete(w->remote_names_inv);

	free(w);
}
void batch_queue_delete(struct batch_queue *q)
{
	if(q) {
		if(q->options_text)
			free(q->options_text);
		if(q->job_table)
			itable_delete(q->job_table);
		if(q->output_table)
			itable_delete(q->output_table);
		if(q->logfile)
			free(q->logfile);
		if(q->work_queue)
			work_queue_delete(q->work_queue);
		free(q);
	}
}
void histogram_delete(struct histogram *h)
{
	histogram_clear(h);
	if(h->buckets) {
		itable_delete(h->buckets);
	}
	free(h);
}
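/*
 * Minimal lifecycle sketch for the histogram above.  The
 * histogram_create() bucket-size argument and the histogram_insert()
 * signature are assumptions based on the create/insert/delete pairs
 * used elsewhere in this code, not verified against histogram.h.
 */
static void histogram_usage_sketch(void)
{
	struct histogram *h = histogram_create(10.0);	/* assumed: bucket width */
	histogram_insert(h, 42.0);			/* assumed signature */
	histogram_insert(h, 47.0);			/* 42.0 and 47.0 land in the same bucket */
	histogram_delete(h);				/* clears buckets, deletes the table, frees h */
}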
void batch_queue_delete(struct batch_queue *q)
{
	if(q) {
		char *key;
		char *value;

		debug(D_BATCH, "deleting queue %p", q);

		q->module->free(q);

		/* Free the values owned by each table, then the tables themselves. */
		hash_table_firstkey(q->options);
		while(hash_table_nextkey(q->options, &key, (void **) &value))
			free(value);
		hash_table_delete(q->options);

		hash_table_firstkey(q->features);
		while(hash_table_nextkey(q->features, &key, (void **) &value))
			free(value);
		hash_table_delete(q->features);

		itable_delete(q->job_table);
		itable_delete(q->output_table);
		free(q);
	}
}
static int itable_double_buckets(struct itable *h)
{
	struct itable *hn = itable_create(2 * h->bucket_count);
	if(!hn)
		return 0;

	/* Move pairs to new hash */
	uint64_t key;
	void *value;
	itable_firstkey(h);
	while(itable_nextkey(h, &key, &value))
		if(!itable_insert(hn, key, value)) {
			itable_delete(hn);
			return 0;
		}

	/* Delete all old pairs */
	struct entry *e, *f;
	int i;
	for(i = 0; i < h->bucket_count; i++) {
		e = h->buckets[i];
		while(e) {
			f = e->next;
			free(e);
			e = f;
		}
	}

	/* Make the old point to the new */
	free(h->buckets);
	h->buckets = hn->buckets;
	h->bucket_count = hn->bucket_count;
	h->size = hn->size;

	/* Delete reference to new, so old is safe */
	free(hn);

	return 1;
}
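/*
 * Hedged sketch of the caller side: the insert path presumably invokes
 * itable_double_buckets() once the load factor crosses a threshold.
 * The ASSUMED_MAX_LOAD value and the wrapper below are illustrative
 * assumptions, not the actual itable.c implementation.  Note that the
 * rehash loop above calls itable_insert() on a half-empty new table,
 * so it cannot recurse into another doubling.
 */
#define ASSUMED_MAX_LOAD 0.75

static int itable_insert_with_growth(struct itable *h, uint64_t key, const void *value)
{
	if((double) h->size / h->bucket_count > ASSUMED_MAX_LOAD) {
		/* If doubling fails, fall through and insert into the old, fuller table. */
		itable_double_buckets(h);
	}
	return itable_insert(h, key, value);
}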
void mpi_queue_delete(struct mpi_queue *q)
{
	if(q) {
		UINT64_T key;
		void *value;

		list_free(q->ready_list);
		list_delete(q->ready_list);
		list_free(q->complete_list);
		list_delete(q->complete_list);

		itable_firstkey(q->active_list);
		while(itable_nextkey(q->active_list, &key, &value)) {
			free(value);
			itable_remove(q->active_list, key);
		}
		itable_delete(q->active_list);

		link_close(q->master_link);
		free(q);
	}
}
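/*
 * The drain-and-free loop above also appears in makeflow_wrapper_delete().
 * A hypothetical helper (itable_free_values is not a cctools function)
 * that factors the pattern out.  Skipping the per-key itable_remove()
 * also avoids mutating the table mid-iteration; itable_delete() reclaims
 * the entries afterwards.
 */
static void itable_free_values(struct itable *t)
{
	uint64_t key;
	void *value;

	itable_firstkey(t);
	while(itable_nextkey(t, &key, &value)) {
		free(value);
	}
}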
static void mainloop( struct batch_queue *queue )
{
	int workers_submitted = 0;
	struct itable *job_table = itable_create(0);

	struct list *masters_list = NULL;
	struct list *foremen_list = NULL;

	int64_t factory_timeout_start = time(0);

	while(!abort_flag) {

		if(config_file && !read_config_file(config_file)) {
			debug(D_NOTICE, "Error re-reading '%s'. Using previous values.", config_file);
		} else {
			set_worker_resources_options( queue );
			batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
		}

		submission_regex = foremen_regex ? foremen_regex : project_regex;

		if(using_catalog) {
			masters_list = work_queue_catalog_query(catalog_host,catalog_port,project_regex);
		} else {
			masters_list = do_direct_query(master_host,master_port);
		}

		if(masters_list && list_size(masters_list) > 0) {
			factory_timeout_start = time(0);
		} else {
			// check whether the factory timeout is triggered; factory_timeout is 0 if the flag isn't set
			if(factory_timeout > 0) {
				if(time(0) - factory_timeout_start > factory_timeout) {
					fprintf(stderr, "There have been no masters for longer than the factory timeout, exiting\n");
					abort_flag=1;
					break;
				}
			}
		}

		debug(D_WQ,"evaluating master list...");
		int workers_needed    = count_workers_needed(masters_list, 0);
		int workers_connected = count_workers_connected(masters_list);
		debug(D_WQ,"%d total workers needed across %d masters", workers_needed, masters_list ? list_size(masters_list) : 0);

		if(foremen_regex) {
			debug(D_WQ,"evaluating foremen list...");
			foremen_list = work_queue_catalog_query(catalog_host,catalog_port,foremen_regex);

			/* Add workers on foremen.  Also, subtract foremen from workers
			 * connected, as they were not deployed by the pool. */
			workers_needed    += count_workers_needed(foremen_list, 1);
			workers_connected += MAX(count_workers_connected(foremen_list) - list_size(foremen_list), 0);

			debug(D_WQ,"%d total workers needed across %d foremen",workers_needed,list_size(foremen_list));
		}

		debug(D_WQ,"raw workers needed: %d", workers_needed);

		if(workers_needed > workers_max) {
			debug(D_WQ,"applying maximum of %d workers",workers_max);
			workers_needed = workers_max;
		}

		if(workers_needed < workers_min) {
			debug(D_WQ,"applying minimum of %d workers",workers_min);
			workers_needed = workers_min;
		}

		int new_workers_needed = workers_needed - workers_submitted;

		if(workers_per_cycle > 0 && new_workers_needed > workers_per_cycle) {
			debug(D_WQ,"applying maximum workers per cycle of %d",workers_per_cycle);
			new_workers_needed = workers_per_cycle;
		}

		if(workers_per_cycle > 0 && workers_submitted > new_workers_needed + workers_connected) {
			debug(D_WQ,"waiting for %d previously submitted workers to connect", workers_submitted - workers_connected);
			new_workers_needed = 0;
		}

		debug(D_WQ,"workers needed: %d",    workers_needed);
		debug(D_WQ,"workers submitted: %d", workers_submitted);
		debug(D_WQ,"workers requested: %d", new_workers_needed);

		print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed, workers_connected);

		update_blacklisted_workers(queue, masters_list);

		if(new_workers_needed>0) {
			debug(D_WQ,"submitting %d new workers to reach target",new_workers_needed);
			workers_submitted += submit_workers(queue,job_table,new_workers_needed);
		} else if(new_workers_needed<0) {
			debug(D_WQ,"too many workers, will wait for some to exit");
		} else {
			debug(D_WQ,"target number of workers is reached.");
		}

		debug(D_WQ,"checking for exited workers...");
		time_t stoptime = time(0)+5;

		while(1) {
			struct batch_job_info info;
			batch_job_id_t jobid;
			jobid = batch_job_wait_timeout(queue,&info,stoptime);
			if(jobid>0) {
				if(itable_lookup(job_table,jobid)) {
					itable_remove(job_table,jobid);
					debug(D_WQ,"worker job %"PRId64" exited",jobid);
					workers_submitted--;
				} else {
					// it may have been a job from a previous run.
				}
			} else {
				break;
			}
		}

		delete_projects_list(masters_list);
		delete_projects_list(foremen_list);

		sleep(factory_period);
	}

	remove_all_workers(queue,job_table);
	itable_delete(job_table);
}
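/*
 * A worked trace of the clamping arithmetic above (numbers are
 * illustrative, not from a real run): suppose count_workers_needed()
 * returns 100 with workers_max = 50, workers_submitted = 30,
 * workers_connected = 25, and workers_per_cycle = 10.  The maximum
 * clamp lowers workers_needed to 50, so new_workers_needed = 50 - 30
 * = 20; the per-cycle cap lowers that to 10; and since
 * workers_submitted (30) is not greater than new_workers_needed +
 * workers_connected (35), submission proceeds and workers_submitted
 * grows to 40.
 */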
static void mainloop( struct batch_queue *queue, const char *project_regex, const char *foremen_regex )
{
	int workers_submitted = 0;
	struct itable *job_table = itable_create(0);

	struct list *masters_list = NULL;
	struct list *foremen_list = NULL;

	const char *submission_regex = foremen_regex ? foremen_regex : project_regex;

	while(!abort_flag) {
		masters_list = work_queue_catalog_query(catalog_host,catalog_port,project_regex);

		debug(D_WQ,"evaluating master list...");
		int workers_needed = count_workers_needed(masters_list, 0);
		debug(D_WQ,"%d total workers needed across %d masters", workers_needed, masters_list ? list_size(masters_list) : 0);

		if(foremen_regex) {
			debug(D_WQ,"evaluating foremen list...");
			foremen_list = work_queue_catalog_query(catalog_host,catalog_port,foremen_regex);
			workers_needed += count_workers_needed(foremen_list, 1);
			debug(D_WQ,"%d total workers needed across %d foremen",workers_needed,list_size(foremen_list));
		}

		debug(D_WQ,"raw workers needed: %d", workers_needed);

		if(workers_needed > workers_max) {
			debug(D_WQ,"applying maximum of %d workers",workers_max);
			workers_needed = workers_max;
		}

		if(workers_needed < workers_min) {
			debug(D_WQ,"applying minimum of %d workers",workers_min);
			workers_needed = workers_min;
		}

		int new_workers_needed = workers_needed - workers_submitted;

		debug(D_WQ,"workers needed: %d",workers_needed);
		debug(D_WQ,"workers in queue: %d",workers_submitted);

		print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed);

		if(new_workers_needed>0) {
			debug(D_WQ,"submitting %d new workers to reach target",new_workers_needed);
			workers_submitted += submit_workers(queue,job_table,new_workers_needed,submission_regex);
		} else if(new_workers_needed<0) {
			debug(D_WQ,"too many workers, will wait for some to exit");
		} else {
			debug(D_WQ,"target number of workers is reached.");
		}

		debug(D_WQ,"checking for exited workers...");
		time_t stoptime = time(0)+5;

		while(1) {
			struct batch_job_info info;
			batch_job_id_t jobid;
			jobid = batch_job_wait_timeout(queue,&info,stoptime);
			if(jobid>0) {
				if(itable_lookup(job_table,jobid)) {
					itable_remove(job_table,jobid);
					debug(D_WQ,"worker job %"PRId64" exited",jobid);
					workers_submitted--;
				} else {
					// it may have been a job from a previous run.
				}
			} else {
				break;
			}
		}

		delete_projects_list(masters_list);
		delete_projects_list(foremen_list);

		sleep(30);
	}

	remove_all_workers(queue,job_table);
	itable_delete(job_table);
}
struct cluster *nearest_neighbor_clustering(struct list *initial_clusters, double (*cmp)(struct cluster *, struct cluster *))
{
	struct cluster *top, *closest, *subtop;
	struct list   *stack;
	struct itable *active_clusters;
	double dclosest, dsubtop;

	int merge = 0;

	list_first_item(initial_clusters);
	top = list_next_item(initial_clusters);

	/* Return immediately if top is NULL, or there is a unique
	 * initial cluster */
	if(list_size(initial_clusters) < 2)
		return top;

	stack = list_create(0);
	list_push_head(stack, top);

	/* Add all of the initial clusters as active clusters. */
	active_clusters = itable_create(0);
	while( (top = list_next_item(initial_clusters)) )
		itable_insert(active_clusters, (uintptr_t) top, (void *) top);

	do {
		/* closest might be NULL if all of the clusters are in
		 * the stack now. subtop might be NULL if top was the
		 * only cluster in the stack */
		top     = list_pop_head( stack );
		closest = cluster_nearest_neighbor(active_clusters, top, cmp);
		subtop  = list_peek_head( stack );

		dclosest = -1;
		dsubtop  = -1;

		if(closest)
			dclosest = cluster_ward_distance(top, closest);

		if(subtop)
			dsubtop = cluster_ward_distance(top, subtop);

		/* The nearest neighbor of top is either one of the
		 * remaining active clusters, or the second topmost
		 * cluster in the stack */
		if( closest && subtop ) {
			/* Use pointer address to systematically break ties. */
			if(dclosest < dsubtop || ((dclosest == dsubtop) && (uintptr_t)closest < (uintptr_t)subtop))
				merge = 0;
			else
				merge = 1;
		} else if( subtop ) {
			merge = 1;
		} else if( closest ) {
			merge = 0;
		} else {
			fatal("Zero clusters?\n"); //We should never reach here.
		}

		if(merge) {
			/* If the two topmost clusters in the stack are
			 * mutual nearest neighbors, merge them into a single
			 * cluster */
			subtop = list_pop_head( stack );
			list_push_head(stack, cluster_merge(top, subtop));
		} else {
			/* Otherwise, push the nearest neighbor of top to the
			 * stack */
			itable_remove(active_clusters, (uintptr_t) closest);
			list_push_head(stack, top);
			list_push_head(stack, closest);
		}

		debug(D_DEBUG, "stack: %d active: %d closest: %lf subtop: %lf\n",
				list_size(stack), itable_size(active_clusters), dclosest, dsubtop);

		/* If there are no more active_clusters, but there is not
		 * a single cluster in the stack, we try again,
		 * converting the clusters in the stack into new active
		 * clusters. */
		if(itable_size(active_clusters) == 0 && list_size(stack) > 3) {
			itable_delete(active_clusters);
			return nearest_neighbor_clustering(stack, cmp);
		}

	} while( !(itable_size(active_clusters) == 0 && list_size(stack) == 1) );

	/* top is now the root of a cluster hierarchy, of
	 * cluster->right, cluster->left. */
	top = list_pop_head(stack);

	list_delete(stack);
	itable_delete(active_clusters);

	return top;
}
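/*
 * Hedged usage sketch: the cmp callback has the same shape as
 * cluster_ward_distance(), which the function above already uses for
 * its own distance checks, so the simplest invocation just passes that
 * in.  How the struct list of singleton clusters is produced depends on
 * the rest of the module and is left to the caller here.
 */
static struct cluster *cluster_all(struct list *initial_clusters)
{
	/* Run the nearest-neighbor-chain merge until one root remains.
	 * The resulting hierarchy hangs off root->left and root->right,
	 * per the comment at the end of the function above. */
	return nearest_neighbor_clustering(initial_clusters, cluster_ward_distance);
}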
/** The clean_mode variable was added so that we could better print out
 * error messages that apply in the situation. Currently it is only used
 * to silence node rerun checking. */
void makeflow_log_recover(struct dag *d, const char *filename, int verbose_mode, struct batch_queue *queue, makeflow_clean_depth clean_mode)
{
	char *line, *name, file[MAX_BUFFER_SIZE];
	int nodeid, state, jobid, file_state;
	int first_run = 1;
	struct dag_node *n;
	struct dag_file *f;
	struct stat buf;
	timestamp_t previous_completion_time;

	d->logfile = fopen(filename, "r");
	if(d->logfile) {
		int linenum = 0;
		first_run = 0;

		printf("recovering from log file %s...\n",filename);

		while((line = get_line(d->logfile))) {
			linenum++;

			if(sscanf(line, "# %d %s %" SCNu64 "", &file_state, file, &previous_completion_time) == 3) {
				f = dag_file_lookup_or_create(d, file);
				f->state = file_state;
				if(file_state == DAG_FILE_STATE_EXISTS){
					d->completed_files += 1;
					f->creation_logged = (time_t) (previous_completion_time / 1000000);
				} else if(file_state == DAG_FILE_STATE_DELETE){
					d->deleted_files += 1;
				}
				continue;
			}
			if(line[0] == '#')
				continue;
			if(sscanf(line, "%" SCNu64 " %d %d %d", &previous_completion_time, &nodeid, &state, &jobid) == 4) {
				n = itable_lookup(d->node_table, nodeid);
				if(n) {
					n->state = state;
					n->jobid = jobid;
					/* Log timestamp is in microseconds, we need seconds for diff. */
					n->previous_completion = (time_t) (previous_completion_time / 1000000);
					continue;
				}
			}

			fprintf(stderr, "makeflow: %s appears to be corrupted on line %d\n", filename, linenum);
			exit(1);
		}
		fclose(d->logfile);
	}

	d->logfile = fopen(filename, "a");
	if(!d->logfile) {
		fprintf(stderr, "makeflow: couldn't open logfile %s: %s\n", filename, strerror(errno));
		exit(1);
	}
	if(setvbuf(d->logfile, NULL, _IOLBF, BUFSIZ) != 0) {
		fprintf(stderr, "makeflow: couldn't set line buffer on logfile %s: %s\n", filename, strerror(errno));
		exit(1);
	}

	if(first_run && verbose_mode) {
		struct dag_file *f;
		struct dag_node *p;

		for(n = d->nodes; n; n = n->next) {
			/* Record node information to log */
			fprintf(d->logfile, "# NODE\t%d\t%s\n", n->nodeid, n->command);

			/* Record the node category to the log */
			fprintf(d->logfile, "# CATEGORY\t%d\t%s\n", n->nodeid, n->category->label);
			/* also write the SYMBOL as alias of CATEGORY, deprecated. */
			fprintf(d->logfile, "# SYMBOL\t%d\t%s\n", n->nodeid, n->category->label);

			/* Record node parents to log */
			fprintf(d->logfile, "# PARENTS\t%d", n->nodeid);
			list_first_item(n->source_files);
			while( (f = list_next_item(n->source_files)) ) {
				p = f->created_by;
				if(p)
					fprintf(d->logfile, "\t%d", p->nodeid);
			}
			fputc('\n', d->logfile);

			/* Record node inputs to log */
			fprintf(d->logfile, "# SOURCES\t%d", n->nodeid);
			list_first_item(n->source_files);
			while( (f = list_next_item(n->source_files)) ) {
				fprintf(d->logfile, "\t%s", f->filename);
			}
			fputc('\n', d->logfile);

			/* Record node outputs to log */
			fprintf(d->logfile, "# TARGETS\t%d", n->nodeid);
			list_first_item(n->target_files);
			while( (f = list_next_item(n->target_files)) ) {
				fprintf(d->logfile, "\t%s", f->filename);
			}
			fputc('\n', d->logfile);

			/* Record translated command to log */
			fprintf(d->logfile, "# COMMAND\t%d\t%s\n", n->nodeid, n->command);
		}
	}

	dag_count_states(d);

	// Check for log consistency
	if(!first_run) {
		hash_table_firstkey(d->files);
		while(hash_table_nextkey(d->files, &name, (void **) &f)) {
			if(dag_file_should_exist(f) && !dag_file_is_source(f) && !(batch_fs_stat(queue, f->filename, &buf) >= 0)){
				fprintf(stderr, "makeflow: %s is reported as existing, but does not exist.\n", f->filename);
				makeflow_log_file_state_change(d, f, DAG_FILE_STATE_UNKNOWN);
				continue;
			}
			if(S_ISDIR(buf.st_mode))
				continue;
			if(dag_file_should_exist(f) && !dag_file_is_source(f) && difftime(buf.st_mtime, f->creation_logged) > 0) {
				fprintf(stderr, "makeflow: %s is reported as existing, but has been modified (%" SCNu64 " ,%" SCNu64 ").\n",
						f->filename, (uint64_t)buf.st_mtime, (uint64_t)f->creation_logged);
				makeflow_clean_file(d, queue, f, 0);
				makeflow_log_file_state_change(d, f, DAG_FILE_STATE_UNKNOWN);
			}
		}
	}

	int silent = 0;
	if(clean_mode != MAKEFLOW_CLEAN_NONE)
		silent = 1;

	// Decide rerun tasks
	if(!first_run) {
		struct itable *rerun_table = itable_create(0);
		for(n = d->nodes; n; n = n->next) {
			makeflow_node_decide_rerun(rerun_table, d, n, silent);
		}
		itable_delete(rerun_table);
	}

	// Update file reference counts from nodes in log
	for(n = d->nodes; n; n = n->next) {
		if(n->state == DAG_NODE_STATE_COMPLETE) {
			struct dag_file *f;
			list_first_item(n->source_files);
			while((f = list_next_item(n->source_files)))
				f->ref_count += -1;
		}
	}
}
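/*
 * For reference, an illustrative excerpt of the log this function reads
 * and writes, reconstructed from the fprintf/sscanf format strings
 * above.  Node ids, state values, timestamps, and filenames are made
 * up; fields are tab-separated:
 *
 *   # NODE	1	bwa mem ref.fa in.fq > out.sam
 *   # CATEGORY	1	default
 *   # SYMBOL	1	default
 *   # PARENTS	1	0
 *   # SOURCES	1	ref.fa	in.fq
 *   # TARGETS	1	out.sam
 *   # COMMAND	1	bwa mem ref.fa in.fq > out.sam
 *   1500000000000000 1 2 42		(timestamp, nodeid, state, jobid)
 *   # 1 out.sam 1500000000000000	(file state, filename, timestamp)
 */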