Example #1
0
static void set_worker_resources( struct batch_queue *queue )
{
	batch_queue_set_option(queue, "cores", num_cores_option);
	batch_queue_set_option(queue, "memory", num_memory_option);
	batch_queue_set_option(queue, "disk", num_disk_option);
	batch_queue_set_option(queue, "gpus", num_gpus_option);

	resource_args = string_format("%s%s%s%s%s%s%s%s\n",
			num_cores_option ? " --cores=" : "",
			num_cores_option ? num_cores_option : "",
			num_memory_option ? " --memory=" : "",
			num_memory_option ? num_memory_option : "",
			num_disk_option ? " --disk=" : "",
			num_disk_option ? num_disk_option : "",
			num_gpus_option ? " --gpus=" : "",
			num_gpus_option ? num_gpus_option : "");
}
Example #2
0
static int batch_queue_dryrun_create (struct batch_queue *q)
{
	char *cwd = path_getcwd();

	batch_queue_set_feature(q, "local_job_queue", NULL);
	batch_queue_set_feature(q, "batch_log_name", "%s.sh");
	batch_queue_set_option(q, "cwd", cwd);
	return 0;
}
Example #3
0
static void update_blacklisted_workers( struct batch_queue *queue, struct list *masters_list ) {

	if(!masters_list || list_size(masters_list) < 1)
		return;

	buffer_t b;
	struct jx *j;

	buffer_init(&b);

	const char *sep = "";
	list_first_item(masters_list);
	while((j=list_next_item(masters_list))) {
		struct jx *blacklisted = jx_lookup(j,"workers_blacklisted");

		if(!blacklisted) {
			continue;
		}

		if(jx_istype(blacklisted, JX_STRING)) {
			buffer_printf(&b, "%s%s", sep, blacklisted->u.string_value);
			sep = " ";
		}

		if(jx_istype(blacklisted, JX_ARRAY)) {
			struct jx *item;
			for (void *i = NULL; (item = jx_iterate_array(blacklisted, &i));) {
				if(jx_istype(item, JX_STRING)) {
					buffer_printf(&b, "%s%s", sep, item->u.string_value);
					sep = " ";
				}
			}
		}
	}

	if(buffer_pos(&b) > 0) {
		batch_queue_set_option(queue, "workers-blacklisted", buffer_tostring(&b));
	} else {
		batch_queue_set_option(queue, "workers-blacklisted", NULL);
	}

	buffer_free(&b);
}
Example #4
0
static int batch_fs_dryrun_chdir (struct batch_queue *q, const char *path) {
	FILE *log;

	if ((log = fopen(q->logfile, "a"))) {
		char *escaped_path = string_escape_shell(path);
		batch_queue_set_option(q, "cwd", xxstrdup(path));

		fprintf(log, "cd %s\n", escaped_path);

		fclose(log);
		free(escaped_path);
		return 0;
	} else {
		return -1;
	}
}
Example #5
0
int main(int argc, char *argv[])
{
	batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN;

	catalog_host = CATALOG_HOST;
	catalog_port = CATALOG_PORT;

	batch_submit_options = getenv("BATCH_OPTIONS");

	debug_config(argv[0]);

	resources = rmsummary_create(-1);

	int c;

	while((c = getopt_long(argc, argv, "B:C:F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) {
		switch (c) {
			case 'B':
				batch_submit_options = xxstrdup(optarg);
				break;
			case 'C':
				config_file = xxstrdup(optarg);
				break;
			case 'F':
				foremen_regex = xxstrdup(optarg);
				break;
			case 'N':
			case 'M':
				project_regex = xxstrdup(optarg);
				break;
			case 'T':
				batch_queue_type = batch_queue_type_from_string(optarg);
				if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr, "unknown batch queue type: %s\n", optarg);
					return EXIT_FAILURE;
				}
				break;
			case 't':
				worker_timeout = atoi(optarg);
				break;
			case 'w':
				workers_min = atoi(optarg);
				break;
			case 'W':
				workers_max = atoi(optarg);
				break;
			case LONG_OPT_WORKERS_PER_CYCLE:
				workers_per_cycle = atoi(optarg);
				break;
			case LONG_OPT_TASKS_PER_WORKER:
				tasks_per_worker = atof(optarg);
				break;
			case 'E':
				extra_worker_args = xxstrdup(optarg);
				break;
			case LONG_OPT_CORES:
				resources->cores = atoi(optarg);
				break;
			case LONG_OPT_AMAZON_CREDENTIALS:
				amazon_credentials = xxstrdup(optarg);
				break;
			case LONG_OPT_AMAZON_AMI:
				amazon_ami = xxstrdup(optarg);
				break;
			case LONG_OPT_MEMORY:
				resources->memory = atoi(optarg);
				break;
			case LONG_OPT_DISK:
				resources->disk = atoi(optarg);
				break;
			case LONG_OPT_GPUS:
				resources->gpus = atoi(optarg);
				break;
			case LONG_OPT_AUTOSIZE:
				autosize = 1;
				break;
			case LONG_OPT_FACTORY_TIMEOUT:
				factory_timeout = MAX(0, atoi(optarg));
				break;
			case LONG_OPT_CONDOR_REQUIREMENTS:
				if(condor_requirements) {
					char *tmp = condor_requirements;
					condor_requirements = string_format("(%s && (%s))", tmp, optarg);
					free(tmp);
				} else {
					condor_requirements = string_format("(%s)", optarg);
				}
				break;
			case LONG_OPT_WRAPPER:
				wrapper_command = optarg;
				break;
			case LONG_OPT_WRAPPER_INPUT:
				if(!wrapper_input) {
					wrapper_input = strdup(optarg);
				} else {
					wrapper_input = string_format("%s,%s",wrapper_input,optarg);
				}
				break;
			case 'P':
				password_file = optarg;
				break;
			case 'S':
				scratch_dir = optarg;
				break;
			case 'c':
				consider_capacity = 1;
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'O':
				debug_config_file_size(string_metric_parse(optarg));
				break;
			case 'v':
				cctools_version_print(stdout, argv[0]);
				exit(EXIT_SUCCESS);
			case 'h':
				show_help(argv[0]);
				exit(EXIT_SUCCESS);
			default:
				show_help(argv[0]);
				return EXIT_FAILURE;
		}
	}

	if(project_regex) {
		using_catalog = 1;
	}
	else if((argc - optind) == 2) {
		master_host = argv[optind];
		master_port = atoi(argv[optind+1]);
	}
	else {
		fprintf(stderr,"work_queue_factory: You must either give a project name with the -M option or master-name option with a configuration file, or give the master's host and port.\n");
		show_help(argv[0]);
		exit(1);
	}
	

	cctools_version_debug(D_DEBUG, argv[0]);

	if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
		fprintf(stderr,"work_queue_factory: You must specify a batch type with the -T option.\n");
		fprintf(stderr, "valid options:\n");
		fprintf(stderr, "%s\n", batch_queue_type_string());
		return 1;
	}

	if(config_file) {
		char abs_path_name[PATH_MAX];

		if(!realpath(config_file, abs_path_name)) {
			fprintf(stderr, "work_queue_factory: could not resolve configuration file path: '%s'.\n", config_file);
			exit(EXIT_FAILURE);
		}

		free(config_file);

		/* From now on, read config_file from absolute path */
		config_file = xxstrdup(abs_path_name);

		if(!read_config_file(config_file)) {
			fprintf(stderr,"work_queue_factory: There were errors in the configuration file: %s\n", config_file);
			return 1;
		}
	}	

	if(workers_min>workers_max) {
		fprintf(stderr,"work_queue_factory: min workers (%d) is greater than max workers (%d)\n",workers_min, workers_max);
		return 1;
	}

	/*
	Careful here: most of the supported batch systems expect
	that jobs are submitting from a single shared filesystem.
	Changing to /tmp only works in the case of Condor.
	*/

	if(!scratch_dir) {
		if(batch_queue_type==BATCH_QUEUE_TYPE_CONDOR) {
			scratch_dir = string_format("/tmp/wq-pool-%d",getuid());
		} else {
			scratch_dir = string_format("wq-pool-%d",getuid());
		}
	}

	if(!create_dir(scratch_dir,0777)) {
		fprintf(stderr,"work_queue_factory: couldn't create %s: %s",scratch_dir,strerror(errno));
		return 1;
	}

	char cmd[1024];
	sprintf(cmd,"cp \"$(which work_queue_worker)\" '%s'",scratch_dir);
	if (system(cmd)) {
		fprintf(stderr, "work_queue_factory: please add work_queue_worker to your PATH.\n");
		exit(EXIT_FAILURE);
	}

	if(password_file) {
		sprintf(cmd,"cp %s %s/pwfile",password_file,scratch_dir);
		system(cmd);
	}

	if(chdir(scratch_dir)!=0) {
		fprintf(stderr,"work_queue_factory: couldn't chdir to %s: %s",scratch_dir,strerror(errno));
		return 1;
	}

	signal(SIGINT, handle_abort);
	signal(SIGQUIT, handle_abort);
	signal(SIGTERM, handle_abort);
	signal(SIGHUP, ignore_signal);

	queue = batch_queue_create(batch_queue_type);
	if(!queue) {
		fprintf(stderr,"work_queue_factory: couldn't establish queue type %s",batch_queue_type_to_string(batch_queue_type));
		return 1;
	}

	batch_queue_set_option(queue, "batch-options", batch_submit_options);
	batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
	set_worker_resources_options( queue );

	if (amazon_credentials != NULL) {
		batch_queue_set_option(queue, "amazon-credentials", amazon_credentials);
	}
	if (amazon_ami != NULL) {
		batch_queue_set_option(queue, "amazon-ami", amazon_ami);
	}

	if(condor_requirements != NULL && batch_queue_type != BATCH_QUEUE_TYPE_CONDOR) {
		debug(D_NOTICE, "condor_requirements will be ignored as workers will not be running in condor.");
	} else {
		batch_queue_set_option(queue, "condor-requirements", condor_requirements);
	}

	mainloop( queue );

	batch_queue_delete(queue);

	return 0;
}
Example #6
0
static void mainloop( struct batch_queue *queue )
{
	int workers_submitted = 0;
	struct itable *job_table = itable_create(0);

	struct list *masters_list = NULL;
	struct list *foremen_list = NULL;

	int64_t factory_timeout_start = time(0);

	while(!abort_flag) {

		if(config_file && !read_config_file(config_file)) {
			debug(D_NOTICE, "Error re-reading '%s'. Using previous values.", config_file);
		} else {
			set_worker_resources_options( queue );
			batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
		}

		submission_regex = foremen_regex ? foremen_regex : project_regex;

		if(using_catalog) {
			masters_list = work_queue_catalog_query(catalog_host,catalog_port,project_regex);
		}
		else {
			masters_list = do_direct_query(master_host,master_port);
		}

		if(masters_list && list_size(masters_list) > 0)
		{
			factory_timeout_start = time(0);
		} else {
			// check to see if factory timeout is triggered, factory timeout will be 0 if flag isn't set
			if(factory_timeout > 0)
			{
				if(time(0) - factory_timeout_start > factory_timeout) {
					fprintf(stderr, "There have been no masters for longer then the factory timeout, exiting\n");
					abort_flag=1;
					break;
				}
			}
		}
	
		debug(D_WQ,"evaluating master list...");
		int workers_needed    = count_workers_needed(masters_list, 0);
		int workers_connected = count_workers_connected(masters_list);

		debug(D_WQ,"%d total workers needed across %d masters",
				workers_needed,
				masters_list ? list_size(masters_list) : 0);

		if(foremen_regex)
		{
			debug(D_WQ,"evaluating foremen list...");
			foremen_list    = work_queue_catalog_query(catalog_host,catalog_port,foremen_regex);

			/* add workers on foremen. Also, subtract foremen from workers
			 * connected, as they were not deployed by the pool. */

			workers_needed    += count_workers_needed(foremen_list, 1);
			workers_connected += MAX(count_workers_connected(foremen_list) - list_size(foremen_list), 0);

			debug(D_WQ,"%d total workers needed across %d foremen",workers_needed,list_size(foremen_list));
		}

		debug(D_WQ,"raw workers needed: %d", workers_needed);

		if(workers_needed > workers_max) {
			debug(D_WQ,"applying maximum of %d workers",workers_max);
			workers_needed = workers_max;
		}

		if(workers_needed < workers_min) {
			debug(D_WQ,"applying minimum of %d workers",workers_min);
			workers_needed = workers_min;
		}

		int new_workers_needed = workers_needed - workers_submitted;

		if(workers_per_cycle > 0 && new_workers_needed > workers_per_cycle) {
			debug(D_WQ,"applying maximum workers per cycle of %d",workers_per_cycle);
			new_workers_needed = workers_per_cycle;
		}

		if(workers_per_cycle > 0 && workers_submitted > new_workers_needed + workers_connected) {
			debug(D_WQ,"waiting for %d previously submitted workers to connect", workers_submitted - workers_connected);
			new_workers_needed = 0;
		}

		debug(D_WQ,"workers needed: %d",    workers_needed);
		debug(D_WQ,"workers submitted: %d", workers_submitted);
		debug(D_WQ,"workers requested: %d", new_workers_needed);

		print_stats(masters_list, foremen_list, workers_submitted, workers_needed, new_workers_needed, workers_connected);

		update_blacklisted_workers(queue, masters_list);

		if(new_workers_needed>0) {
			debug(D_WQ,"submitting %d new workers to reach target",new_workers_needed);
			workers_submitted += submit_workers(queue,job_table,new_workers_needed);
		} else if(new_workers_needed<0) {
			debug(D_WQ,"too many workers, will wait for some to exit");
		} else {
			debug(D_WQ,"target number of workers is reached.");
		}

		debug(D_WQ,"checking for exited workers...");
		time_t stoptime = time(0)+5;

		while(1) {
			struct batch_job_info info;
			batch_job_id_t jobid;
			jobid = batch_job_wait_timeout(queue,&info,stoptime);
			if(jobid>0) {
				if(itable_lookup(job_table,jobid)) {
					itable_remove(job_table,jobid);
					debug(D_WQ,"worker job %"PRId64" exited",jobid);
					workers_submitted--;
				} else {
					// it may have been a job from a previous run.
				}
			} else {
				break;
			}
		}

		delete_projects_list(masters_list);
		delete_projects_list(foremen_list);

		sleep(factory_period);
	}

	remove_all_workers(queue,job_table);
	itable_delete(job_table);
}
Example #7
0
void batch_queue_set_int_option(struct batch_queue *q, const char *what, int value) {
    char *str_value = string_format("%d", value);
    batch_queue_set_option(q, what, str_value);

    free(str_value);
}