static void mainloop( struct batch_queue *queue, const char *project_regex, const char *foremen_regex ) { int workers_submitted = 0; struct itable *job_table = itable_create(0); struct list *masters_list = NULL; struct list *foremen_list = NULL; const char *submission_regex = foremen_regex ? foremen_regex : project_regex; while(!abort_flag) { masters_list = work_queue_catalog_query(catalog_host,catalog_port,project_regex); debug(D_WQ,"evaluating master list..."); int workers_needed = count_workers_needed(masters_list, 0); debug(D_WQ,"%d total workers needed across %d masters", workers_needed, masters_list ? list_size(masters_list) : 0); if(foremen_regex) { debug(D_WQ,"evaluating foremen list..."); foremen_list = work_queue_catalog_query(catalog_host,catalog_port,foremen_regex); workers_needed += count_workers_needed(foremen_list, 1); debug(D_WQ,"%d total workers needed across %d foremen",workers_needed,list_size(foremen_list)); } debug(D_WQ,"raw workers needed: %d", workers_needed); if(workers_needed > workers_max) { debug(D_WQ,"applying maximum of %d workers",workers_max); workers_needed = workers_max; } if(workers_needed < workers_min) { debug(D_WQ,"applying minimum of %d workers",workers_min); workers_needed = workers_min; } int new_workers_needed = workers_needed - workers_submitted; debug(D_WQ,"workers needed: %d",workers_needed); debug(D_WQ,"workers in queue: %d",workers_submitted); print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed); if(new_workers_needed>0) { debug(D_WQ,"submitting %d new workers to reach target",new_workers_needed); workers_submitted += submit_workers(queue,job_table,new_workers_needed,submission_regex); } else if(new_workers_needed<0) { debug(D_WQ,"too many workers, will wait for some to exit"); } else { debug(D_WQ,"target number of workers is reached."); } debug(D_WQ,"checking for exited workers..."); time_t stoptime = time(0)+5; while(1) { struct batch_job_info info; batch_job_id_t jobid; jobid 
= batch_job_wait_timeout(queue,&info,stoptime); if(jobid>0) { if(itable_lookup(job_table,jobid)) { itable_remove(job_table,jobid); debug(D_WQ,"worker job %"PRId64" exited",jobid); workers_submitted--; } else { // it may have been a job from a previous run. } } else { break; } } delete_projects_list(masters_list); delete_projects_list(foremen_list); sleep(30); } remove_all_workers(queue,job_table); itable_delete(job_table); }
static void mainloop( struct batch_queue *queue ) { int workers_submitted = 0; struct itable *job_table = itable_create(0); struct list *masters_list = NULL; struct list *foremen_list = NULL; int64_t factory_timeout_start = time(0); while(!abort_flag) { if(config_file && !read_config_file(config_file)) { debug(D_NOTICE, "Error re-reading '%s'. Using previous values.", config_file); } else { set_worker_resources_options( queue ); batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL); } submission_regex = foremen_regex ? foremen_regex : project_regex; if(using_catalog) { masters_list = work_queue_catalog_query(catalog_host,catalog_port,project_regex); } else { masters_list = do_direct_query(master_host,master_port); } if(masters_list && list_size(masters_list) > 0) { factory_timeout_start = time(0); } else { // check to see if factory timeout is triggered, factory timeout will be 0 if flag isn't set if(factory_timeout > 0) { if(time(0) - factory_timeout_start > factory_timeout) { fprintf(stderr, "There have been no masters for longer then the factory timeout, exiting\n"); abort_flag=1; break; } } } debug(D_WQ,"evaluating master list..."); int workers_needed = count_workers_needed(masters_list, 0); int workers_connected = count_workers_connected(masters_list); debug(D_WQ,"%d total workers needed across %d masters", workers_needed, masters_list ? list_size(masters_list) : 0); if(foremen_regex) { debug(D_WQ,"evaluating foremen list..."); foremen_list = work_queue_catalog_query(catalog_host,catalog_port,foremen_regex); /* add workers on foremen. Also, subtract foremen from workers * connected, as they were not deployed by the pool. 
*/ workers_needed += count_workers_needed(foremen_list, 1); workers_connected += MAX(count_workers_connected(foremen_list) - list_size(foremen_list), 0); debug(D_WQ,"%d total workers needed across %d foremen",workers_needed,list_size(foremen_list)); } debug(D_WQ,"raw workers needed: %d", workers_needed); if(workers_needed > workers_max) { debug(D_WQ,"applying maximum of %d workers",workers_max); workers_needed = workers_max; } if(workers_needed < workers_min) { debug(D_WQ,"applying minimum of %d workers",workers_min); workers_needed = workers_min; } int new_workers_needed = workers_needed - workers_submitted; if(workers_per_cycle > 0 && new_workers_needed > workers_per_cycle) { debug(D_WQ,"applying maximum workers per cycle of %d",workers_per_cycle); new_workers_needed = workers_per_cycle; } if(workers_per_cycle > 0 && workers_submitted > new_workers_needed + workers_connected) { debug(D_WQ,"waiting for %d previously submitted workers to connect", workers_submitted - workers_connected); new_workers_needed = 0; } debug(D_WQ,"workers needed: %d", workers_needed); debug(D_WQ,"workers submitted: %d", workers_submitted); debug(D_WQ,"workers requested: %d", new_workers_needed); print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed, workers_connected); update_blacklisted_workers(queue, masters_list); if(new_workers_needed>0) { debug(D_WQ,"submitting %d new workers to reach target",new_workers_needed); workers_submitted += submit_workers(queue,job_table,new_workers_needed); } else if(new_workers_needed<0) { debug(D_WQ,"too many workers, will wait for some to exit"); } else { debug(D_WQ,"target number of workers is reached."); } debug(D_WQ,"checking for exited workers..."); time_t stoptime = time(0)+5; while(1) { struct batch_job_info info; batch_job_id_t jobid; jobid = batch_job_wait_timeout(queue,&info,stoptime); if(jobid>0) { if(itable_lookup(job_table,jobid)) { itable_remove(job_table,jobid); debug(D_WQ,"worker job %"PRId64" 
exited",jobid); workers_submitted--; } else { // it may have been a job from a previous run. } } else { break; } } delete_projects_list(masters_list); delete_projects_list(foremen_list); sleep(factory_period); } remove_all_workers(queue,job_table); itable_delete(job_table); }
/*
Wait for a job in queue q and report its details through info.

This is a convenience wrapper: it forwards to batch_job_wait_timeout with a
stoptime of 0 and returns that call's result unchanged.
NOTE(review): a stoptime of 0 presumably means "no deadline" -- confirm
against batch_job_wait_timeout.
*/
batch_job_id_t batch_job_wait(struct batch_queue * q, struct batch_job_info * info)
{
	batch_job_id_t finished_job = batch_job_wait_timeout(q, info, 0);

	return finished_job;
}