/* Record the requested worker resources on the batch queue and build the
 * corresponding --cores/--memory/--disk/--gpus arguments for the worker
 * command line.  Only options that were actually given are included. */

static void set_worker_resources_options( struct batch_queue *queue )
{
	batch_queue_set_option(queue, "cores",  num_cores_option);
	batch_queue_set_option(queue, "memory", num_memory_option);
	batch_queue_set_option(queue, "disk",   num_disk_option);
	batch_queue_set_option(queue, "gpus",   num_gpus_option);

	resource_args = string_format("%s%s%s%s%s%s%s%s",
		num_cores_option  ? " --cores="  : "", num_cores_option  ? num_cores_option  : "",
		num_memory_option ? " --memory=" : "", num_memory_option ? num_memory_option : "",
		num_disk_option   ? " --disk="   : "", num_disk_option   ? num_disk_option   : "",
		num_gpus_option   ? " --gpus="   : "", num_gpus_option   ? num_gpus_option   : "");
}
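/* Set up a "dry run" batch queue: rather than submitting real jobs, the queue
 * records equivalent shell commands in a "%s.sh" log, starting from the
 * current working directory. */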
static int batch_queue_dryrun_create (struct batch_queue *q)
{
	char *cwd = path_getcwd();
	batch_queue_set_feature(q, "local_job_queue", NULL);
	batch_queue_set_feature(q, "batch_log_name", "%s.sh");
	batch_queue_set_option(q, "cwd", cwd);
	return 0;
}
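/* Collect the "workers_blacklisted" field of every master's record (either a
 * single string or an array of strings) into one space-separated list and pass
 * it to the batch queue as the "workers-blacklisted" option.  The option is
 * cleared when no master reports any blacklisted workers. */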
static void update_blacklisted_workers( struct batch_queue *queue, struct list *masters_list )
{
	if(!masters_list || list_size(masters_list) < 1)
		return;

	buffer_t b;
	struct jx *j;
	buffer_init(&b);

	const char *sep = "";

	list_first_item(masters_list);
	while((j = list_next_item(masters_list))) {
		struct jx *blacklisted = jx_lookup(j, "workers_blacklisted");
		if(!blacklisted) {
			continue;
		}

		if(jx_istype(blacklisted, JX_STRING)) {
			buffer_printf(&b, "%s%s", sep, blacklisted->u.string_value);
			sep = " ";
		}

		if(jx_istype(blacklisted, JX_ARRAY)) {
			struct jx *item;
			for(void *i = NULL; (item = jx_iterate_array(blacklisted, &i));) {
				if(jx_istype(item, JX_STRING)) {
					buffer_printf(&b, "%s%s", sep, item->u.string_value);
					sep = " ";
				}
			}
		}
	}

	if(buffer_pos(&b) > 0) {
		batch_queue_set_option(queue, "workers-blacklisted", buffer_tostring(&b));
	} else {
		batch_queue_set_option(queue, "workers-blacklisted", NULL);
	}

	buffer_free(&b);
}
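/* Dry-run implementation of chdir: append an equivalent "cd" command to the
 * shell script log and remember the new working directory in the "cwd"
 * option.  Returns 0 on success, -1 if the log file cannot be opened. */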
static int batch_fs_dryrun_chdir (struct batch_queue *q, const char *path)
{
	FILE *log;
	if((log = fopen(q->logfile, "a"))) {
		char *escaped_path = string_escape_shell(path);
		batch_queue_set_option(q, "cwd", xxstrdup(path));
		fprintf(log, "cd %s\n", escaped_path);
		fclose(log);
		free(escaped_path);
		return 0;
	} else {
		return -1;
	}
}
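/* Parse the command line options, validate the configuration, set up the
 * scratch directory and the batch queue, then hand control to mainloop(). */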
int main(int argc, char *argv[])
{
	batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN;

	catalog_host = CATALOG_HOST;
	catalog_port = CATALOG_PORT;

	batch_submit_options = getenv("BATCH_OPTIONS");

	debug_config(argv[0]);

	resources = rmsummary_create(-1);

	int c;

	while((c = getopt_long(argc, argv, "B:C:F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) {
		switch (c) {
			case 'B':
				batch_submit_options = xxstrdup(optarg);
				break;
			case 'C':
				config_file = xxstrdup(optarg);
				break;
			case 'F':
				foremen_regex = xxstrdup(optarg);
				break;
			case 'N':
			case 'M':
				project_regex = xxstrdup(optarg);
				break;
			case 'T':
				batch_queue_type = batch_queue_type_from_string(optarg);
				if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr, "unknown batch queue type: %s\n", optarg);
					return EXIT_FAILURE;
				}
				break;
			case 't':
				worker_timeout = atoi(optarg);
				break;
			case 'w':
				workers_min = atoi(optarg);
				break;
			case 'W':
				workers_max = atoi(optarg);
				break;
			case LONG_OPT_WORKERS_PER_CYCLE:
				workers_per_cycle = atoi(optarg);
				break;
			case LONG_OPT_TASKS_PER_WORKER:
				tasks_per_worker = atof(optarg);
				break;
			case 'E':
				extra_worker_args = xxstrdup(optarg);
				break;
			case LONG_OPT_CORES:
				resources->cores = atoi(optarg);
				break;
			case LONG_OPT_AMAZON_CREDENTIALS:
				amazon_credentials = xxstrdup(optarg);
				break;
			case LONG_OPT_AMAZON_AMI:
				amazon_ami = xxstrdup(optarg);
				break;
			case LONG_OPT_MEMORY:
				resources->memory = atoi(optarg);
				break;
			case LONG_OPT_DISK:
				resources->disk = atoi(optarg);
				break;
			case LONG_OPT_GPUS:
				resources->gpus = atoi(optarg);
				break;
			case LONG_OPT_AUTOSIZE:
				autosize = 1;
				break;
			case LONG_OPT_FACTORY_TIMEOUT:
				factory_timeout = MAX(0, atoi(optarg));
				break;
			case LONG_OPT_CONDOR_REQUIREMENTS:
				if(condor_requirements) {
					char *tmp = condor_requirements;
					condor_requirements = string_format("(%s && (%s))", tmp, optarg);
					free(tmp);
				} else {
					condor_requirements = string_format("(%s)", optarg);
				}
				break;
			case LONG_OPT_WRAPPER:
				wrapper_command = optarg;
				break;
			case LONG_OPT_WRAPPER_INPUT:
				if(!wrapper_input) {
					wrapper_input = strdup(optarg);
				} else {
					wrapper_input = string_format("%s,%s", wrapper_input, optarg);
				}
				break;
			case 'P':
				password_file = optarg;
				break;
			case 'S':
				scratch_dir = optarg;
				break;
			case 'c':
				consider_capacity = 1;
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'O':
				debug_config_file_size(string_metric_parse(optarg));
				break;
			case 'v':
				cctools_version_print(stdout, argv[0]);
				exit(EXIT_SUCCESS);
			case 'h':
				show_help(argv[0]);
				exit(EXIT_SUCCESS);
			default:
				show_help(argv[0]);
				return EXIT_FAILURE;
		}
	}

	if(project_regex) {
		using_catalog = 1;
	} else if((argc - optind) == 2) {
		master_host = argv[optind];
		master_port = atoi(argv[optind+1]);
	} else {
		fprintf(stderr, "work_queue_factory: You must either give a project name with the -M option, give a master name in a configuration file, or give the master's host and port.\n");
		show_help(argv[0]);
		exit(1);
	}

	cctools_version_debug(D_DEBUG, argv[0]);

	if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
		fprintf(stderr, "work_queue_factory: You must specify a batch type with the -T option.\n");
		fprintf(stderr, "valid options:\n");
		fprintf(stderr, "%s\n", batch_queue_type_string());
		return 1;
	}

	if(config_file) {
		char abs_path_name[PATH_MAX];

		if(!realpath(config_file, abs_path_name)) {
			fprintf(stderr, "work_queue_factory: could not resolve configuration file path: '%s'.\n", config_file);
			exit(EXIT_FAILURE);
		}

		free(config_file);

		/* From now on, read config_file from its absolute path. */
		config_file = xxstrdup(abs_path_name);
		if(!read_config_file(config_file)) {
			fprintf(stderr, "work_queue_factory: There were errors in the configuration file: %s\n", config_file);
			return 1;
		}
	}

	if(workers_min > workers_max) {
		fprintf(stderr, "work_queue_factory: min workers (%d) is greater than max workers (%d)\n", workers_min, workers_max);
		return 1;
	}

	/*
	Careful here: most of the supported batch systems expect that jobs are
	submitted from a single shared filesystem.  Changing to /tmp only works
	in the case of Condor.
	*/

	if(!scratch_dir) {
		if(batch_queue_type == BATCH_QUEUE_TYPE_CONDOR) {
			scratch_dir = string_format("/tmp/wq-pool-%d", getuid());
		} else {
			scratch_dir = string_format("wq-pool-%d", getuid());
		}
	}

	if(!create_dir(scratch_dir, 0777)) {
		fprintf(stderr, "work_queue_factory: couldn't create %s: %s\n", scratch_dir, strerror(errno));
		return 1;
	}

	char cmd[1024];
	sprintf(cmd, "cp \"$(which work_queue_worker)\" '%s'", scratch_dir);
	if(system(cmd)) {
		fprintf(stderr, "work_queue_factory: please add work_queue_worker to your PATH.\n");
		exit(EXIT_FAILURE);
	}

	if(password_file) {
		sprintf(cmd, "cp %s %s/pwfile", password_file, scratch_dir);
		system(cmd);
	}

	if(chdir(scratch_dir) != 0) {
		fprintf(stderr, "work_queue_factory: couldn't chdir to %s: %s\n", scratch_dir, strerror(errno));
		return 1;
	}

	signal(SIGINT, handle_abort);
	signal(SIGQUIT, handle_abort);
	signal(SIGTERM, handle_abort);
	signal(SIGHUP, ignore_signal);

	queue = batch_queue_create(batch_queue_type);
	if(!queue) {
		fprintf(stderr, "work_queue_factory: couldn't establish queue type %s\n", batch_queue_type_to_string(batch_queue_type));
		return 1;
	}

	batch_queue_set_option(queue, "batch-options", batch_submit_options);
	batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
	set_worker_resources_options( queue );

	if(amazon_credentials != NULL) {
		batch_queue_set_option(queue, "amazon-credentials", amazon_credentials);
	}
	if(amazon_ami != NULL) {
		batch_queue_set_option(queue, "amazon-ami", amazon_ami);
	}

	if(condor_requirements != NULL && batch_queue_type != BATCH_QUEUE_TYPE_CONDOR) {
		debug(D_NOTICE, "condor_requirements will be ignored as workers will not be running in condor.");
	} else {
		batch_queue_set_option(queue, "condor-requirements", condor_requirements);
	}

	mainloop( queue );

	batch_queue_delete(queue);

	return 0;
}
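/* Main control loop: repeatedly query the catalog (or the master directly)
 * for matching masters and foremen, compute how many workers are needed,
 * submit new workers through the batch queue, and reap exited worker jobs,
 * until the abort flag is set. */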
static void mainloop( struct batch_queue *queue )
{
	int workers_submitted = 0;
	struct itable *job_table = itable_create(0);

	struct list *masters_list = NULL;
	struct list *foremen_list = NULL;

	int64_t factory_timeout_start = time(0);

	while(!abort_flag) {

		if(config_file && !read_config_file(config_file)) {
			debug(D_NOTICE, "Error re-reading '%s'. Using previous values.", config_file);
		} else {
			set_worker_resources_options( queue );
			batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
		}

		submission_regex = foremen_regex ? foremen_regex : project_regex;

		if(using_catalog) {
			masters_list = work_queue_catalog_query(catalog_host, catalog_port, project_regex);
		} else {
			masters_list = do_direct_query(master_host, master_port);
		}

		if(masters_list && list_size(masters_list) > 0) {
			factory_timeout_start = time(0);
		} else {
			// Check whether the factory timeout has been reached; it is 0 when the flag isn't set.
			if(factory_timeout > 0) {
				if(time(0) - factory_timeout_start > factory_timeout) {
					fprintf(stderr, "There have been no masters for longer than the factory timeout, exiting\n");
					abort_flag = 1;
					break;
				}
			}
		}

		debug(D_WQ, "evaluating master list...");
		int workers_needed    = count_workers_needed(masters_list, 0);
		int workers_connected = count_workers_connected(masters_list);
		debug(D_WQ, "%d total workers needed across %d masters", workers_needed, masters_list ? list_size(masters_list) : 0);

		if(foremen_regex) {
			debug(D_WQ, "evaluating foremen list...");
			foremen_list = work_queue_catalog_query(catalog_host, catalog_port, foremen_regex);

			/* Add workers on foremen.  Also, subtract foremen from workers
			 * connected, as they were not deployed by the pool. */
			workers_needed    += count_workers_needed(foremen_list, 1);
			workers_connected += MAX(count_workers_connected(foremen_list) - list_size(foremen_list), 0);

			debug(D_WQ, "%d total workers needed across %d foremen", workers_needed, list_size(foremen_list));
		}

		debug(D_WQ, "raw workers needed: %d", workers_needed);

		if(workers_needed > workers_max) {
			debug(D_WQ, "applying maximum of %d workers", workers_max);
			workers_needed = workers_max;
		}

		if(workers_needed < workers_min) {
			debug(D_WQ, "applying minimum of %d workers", workers_min);
			workers_needed = workers_min;
		}

		int new_workers_needed = workers_needed - workers_submitted;

		if(workers_per_cycle > 0 && new_workers_needed > workers_per_cycle) {
			debug(D_WQ, "applying maximum workers per cycle of %d", workers_per_cycle);
			new_workers_needed = workers_per_cycle;
		}

		if(workers_per_cycle > 0 && workers_submitted > new_workers_needed + workers_connected) {
			debug(D_WQ, "waiting for %d previously submitted workers to connect", workers_submitted - workers_connected);
			new_workers_needed = 0;
		}

		debug(D_WQ, "workers needed: %d",    workers_needed);
		debug(D_WQ, "workers submitted: %d", workers_submitted);
		debug(D_WQ, "workers requested: %d", new_workers_needed);

		print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed, workers_connected);

		update_blacklisted_workers(queue, masters_list);

		if(new_workers_needed > 0) {
			debug(D_WQ, "submitting %d new workers to reach target", new_workers_needed);
			workers_submitted += submit_workers(queue, job_table, new_workers_needed);
		} else if(new_workers_needed < 0) {
			debug(D_WQ, "too many workers, will wait for some to exit");
		} else {
			debug(D_WQ, "target number of workers is reached.");
		}

		debug(D_WQ, "checking for exited workers...");
		time_t stoptime = time(0) + 5;

		while(1) {
			struct batch_job_info info;
			batch_job_id_t jobid;
			jobid = batch_job_wait_timeout(queue, &info, stoptime);
			if(jobid > 0) {
				if(itable_lookup(job_table, jobid)) {
					itable_remove(job_table, jobid);
					debug(D_WQ, "worker job %" PRId64 " exited", jobid);
					workers_submitted--;
				} else {
					// it may have been a job from a previous run.
				}
			} else {
				break;
			}
		}

		delete_projects_list(masters_list);
		delete_projects_list(foremen_list);

		sleep(factory_period);
	}

	remove_all_workers(queue, job_table);
	itable_delete(job_table);
}
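/* Convenience wrapper: format an integer value as a string and set it as a
 * batch queue option. */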
void batch_queue_set_int_option(struct batch_queue *q, const char *what, int value)
{
	char *str_value = string_format("%d", value);
	batch_queue_set_option(q, what, str_value);
	free(str_value);
}