Exemple #1
0
/*
 * Entry point for work_queue_pool.
 *
 * Parses the command line into the file-level configuration variables,
 * validates the required settings (batch type, project name, worker limits),
 * prepares the scratch directory (copying work_queue_worker and, if given,
 * the password file into it), installs signal handlers, creates the batch
 * queue and hands control to mainloop().
 *
 * Returns 0 once mainloop() returns; returns or exits with a non-zero status
 * on any configuration or setup failure.
 */
int main(int argc, char *argv[])
{
	batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN;

	catalog_host = CATALOG_HOST;
	catalog_port = CATALOG_PORT;

	debug_config(argv[0]);

	int c;

	while((c = getopt_long(argc, argv, "F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) {
		switch (c) {
			case 'F':
				foremen_regex = optarg;
				break;
			case 'N':
				/* fallthrough: -N and -M both set the project regex */
			case 'M':
				project_regex = optarg;
				break;
			case 'T':
				batch_queue_type = batch_queue_type_from_string(optarg);
				if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr, "unknown batch queue type: %s\n", optarg);
					return EXIT_FAILURE;
				}
				break;
			case 't':
				worker_timeout = atoi(optarg);
				break;
			case 'w':
				workers_min = atoi(optarg);
				break;
			case 'W':
				workers_max = atoi(optarg);
				break;
			case LONG_OPT_TASKS_PER_WORKER:
				tasks_per_worker = atof(optarg);
				break;
			case 'E':
				extra_worker_args = optarg;
				break;
			case LONG_OPT_CORES:
				num_cores_option = xxstrdup(optarg);
				break;
			case LONG_OPT_MEMORY:
				num_memory_option = xxstrdup(optarg);
				break;
			case LONG_OPT_DISK:
				num_disk_option = xxstrdup(optarg);
				break;
			case LONG_OPT_GPUS:
				num_gpus_option = xxstrdup(optarg);
				break;
			case 'P':
				password_file = optarg;
				break;
			case 'S':
				scratch_dir = optarg;
				break;
			case 'c':
				consider_capacity = 1;
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'O':
				debug_config_file_size(string_metric_parse(optarg));
				break;
			case 'v':
				cctools_version_print(stdout, argv[0]);
				exit(EXIT_SUCCESS);
			case 'h':
				show_help(argv[0]);
				exit(EXIT_SUCCESS);
			default:
				show_help(argv[0]);
				return EXIT_FAILURE;
		}
	}

	cctools_version_debug(D_DEBUG, argv[0]);

	if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
		fprintf(stderr,"work_queue_pool: You must specify a batch type with the -T option.\n");
		fprintf(stderr, "valid options:\n");
		fprintf(stderr, "%s\n", batch_queue_type_string());
		return 1;
	}

	if(!project_regex) {
		fprintf(stderr,"work_queue_pool: You must give a project name with the -M option.\n");
		return 1;
	}

	if(workers_min>workers_max) {
		fprintf(stderr,"work_queue_pool: --min-workers (%d) is greater than --max-workers (%d)\n",workers_min,workers_max);
		return 1;
	}

	/* When no usable tasks-per-worker value was given, fall back to the
	   requested core count, or to 1 if no core count was given either. */
	if(tasks_per_worker < 1)
	{
		tasks_per_worker = num_cores_option ? atof(num_cores_option) : 1;
	}

	if(!scratch_dir) {
		scratch_dir = string_format("/tmp/wq-pool-%d",getuid());
	}

	if(!create_dir(scratch_dir,0777)) {
		fprintf(stderr,"work_queue_pool: couldn't create %s: %s\n",scratch_dir,strerror(errno));
		return 1;
	}

	/* Shell commands are built with snprintf and checked for truncation,
	   because scratch_dir and password_file come from the command line and
	   may be arbitrarily long. */
	char cmd[1024];
	int cmd_len;

	cmd_len = snprintf(cmd, sizeof(cmd), "cp \"$(which work_queue_worker)\" '%s'", scratch_dir);
	if(cmd_len < 0 || (size_t) cmd_len >= sizeof(cmd)) {
		fprintf(stderr, "work_queue_pool: scratch directory path is too long: %s\n", scratch_dir);
		return 1;
	}
	if (system(cmd)) {
		fprintf(stderr, "work_queue_pool: please add work_queue_worker to your PATH.\n");
		exit(EXIT_FAILURE);
	}

	if(password_file) {
		/* Paths are single-quoted so whitespace and shell metacharacters
		   are not interpreted; a path containing a single quote is still
		   not supported. */
		cmd_len = snprintf(cmd, sizeof(cmd), "cp '%s' '%s/pwfile'", password_file, scratch_dir);
		if(cmd_len < 0 || (size_t) cmd_len >= sizeof(cmd)) {
			fprintf(stderr, "work_queue_pool: password file or scratch directory path is too long.\n");
			return 1;
		}
		if(system(cmd)) {
			fprintf(stderr, "work_queue_pool: couldn't copy password file %s to %s/pwfile\n", password_file, scratch_dir);
			return 1;
		}
	}

	if(chdir(scratch_dir)!=0) {
		fprintf(stderr,"work_queue_pool: couldn't chdir to %s: %s\n",scratch_dir,strerror(errno));
		return 1;
	}

	signal(SIGINT, handle_abort);
	signal(SIGQUIT, handle_abort);
	signal(SIGTERM, handle_abort);
	signal(SIGHUP, ignore_signal);

	struct batch_queue * queue = batch_queue_create(batch_queue_type);
	if(!queue) {
		fprintf(stderr,"work_queue_pool: couldn't establish queue type %s\n",batch_queue_type_to_string(batch_queue_type));
		return 1;
	}

	set_worker_resources( queue );

	mainloop( queue, project_regex, foremen_regex );

	batch_queue_delete(queue);

	return 0;
}
Exemple #2
0
int main(int argc, char *argv[])
{
	batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN;

	catalog_host = CATALOG_HOST;
	catalog_port = CATALOG_PORT;

	batch_submit_options = getenv("BATCH_OPTIONS");

	debug_config(argv[0]);

	resources = rmsummary_create(-1);

	int c;

	while((c = getopt_long(argc, argv, "B:C:F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) {
		switch (c) {
			case 'B':
				batch_submit_options = xxstrdup(optarg);
				break;
			case 'C':
				config_file = xxstrdup(optarg);
				break;
			case 'F':
				foremen_regex = xxstrdup(optarg);
				break;
			case 'N':
			case 'M':
				project_regex = xxstrdup(optarg);
				break;
			case 'T':
				batch_queue_type = batch_queue_type_from_string(optarg);
				if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr, "unknown batch queue type: %s\n", optarg);
					return EXIT_FAILURE;
				}
				break;
			case 't':
				worker_timeout = atoi(optarg);
				break;
			case 'w':
				workers_min = atoi(optarg);
				break;
			case 'W':
				workers_max = atoi(optarg);
				break;
			case LONG_OPT_WORKERS_PER_CYCLE:
				workers_per_cycle = atoi(optarg);
				break;
			case LONG_OPT_TASKS_PER_WORKER:
				tasks_per_worker = atof(optarg);
				break;
			case 'E':
				extra_worker_args = xxstrdup(optarg);
				break;
			case LONG_OPT_CORES:
				resources->cores = atoi(optarg);
				break;
			case LONG_OPT_AMAZON_CREDENTIALS:
				amazon_credentials = xxstrdup(optarg);
				break;
			case LONG_OPT_AMAZON_AMI:
				amazon_ami = xxstrdup(optarg);
				break;
			case LONG_OPT_MEMORY:
				resources->memory = atoi(optarg);
				break;
			case LONG_OPT_DISK:
				resources->disk = atoi(optarg);
				break;
			case LONG_OPT_GPUS:
				resources->gpus = atoi(optarg);
				break;
			case LONG_OPT_AUTOSIZE:
				autosize = 1;
				break;
			case LONG_OPT_FACTORY_TIMEOUT:
				factory_timeout = MAX(0, atoi(optarg));
				break;
			case LONG_OPT_CONDOR_REQUIREMENTS:
				if(condor_requirements) {
					char *tmp = condor_requirements;
					condor_requirements = string_format("(%s && (%s))", tmp, optarg);
					free(tmp);
				} else {
					condor_requirements = string_format("(%s)", optarg);
				}
				break;
			case LONG_OPT_WRAPPER:
				wrapper_command = optarg;
				break;
			case LONG_OPT_WRAPPER_INPUT:
				if(!wrapper_input) {
					wrapper_input = strdup(optarg);
				} else {
					wrapper_input = string_format("%s,%s",wrapper_input,optarg);
				}
				break;
			case 'P':
				password_file = optarg;
				break;
			case 'S':
				scratch_dir = optarg;
				break;
			case 'c':
				consider_capacity = 1;
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'O':
				debug_config_file_size(string_metric_parse(optarg));
				break;
			case 'v':
				cctools_version_print(stdout, argv[0]);
				exit(EXIT_SUCCESS);
			case 'h':
				show_help(argv[0]);
				exit(EXIT_SUCCESS);
			default:
				show_help(argv[0]);
				return EXIT_FAILURE;
		}
	}

	if(project_regex) {
		using_catalog = 1;
	}
	else if((argc - optind) == 2) {
		master_host = argv[optind];
		master_port = atoi(argv[optind+1]);
	}
	else {
		fprintf(stderr,"work_queue_factory: You must either give a project name with the -M option or master-name option with a configuration file, or give the master's host and port.\n");
		show_help(argv[0]);
		exit(1);
	}
	

	cctools_version_debug(D_DEBUG, argv[0]);

	if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) {
		fprintf(stderr,"work_queue_factory: You must specify a batch type with the -T option.\n");
		fprintf(stderr, "valid options:\n");
		fprintf(stderr, "%s\n", batch_queue_type_string());
		return 1;
	}

	if(config_file) {
		char abs_path_name[PATH_MAX];

		if(!realpath(config_file, abs_path_name)) {
			fprintf(stderr, "work_queue_factory: could not resolve configuration file path: '%s'.\n", config_file);
			exit(EXIT_FAILURE);
		}

		free(config_file);

		/* From now on, read config_file from absolute path */
		config_file = xxstrdup(abs_path_name);

		if(!read_config_file(config_file)) {
			fprintf(stderr,"work_queue_factory: There were errors in the configuration file: %s\n", config_file);
			return 1;
		}
	}	

	if(workers_min>workers_max) {
		fprintf(stderr,"work_queue_factory: min workers (%d) is greater than max workers (%d)\n",workers_min, workers_max);
		return 1;
	}

	/*
	Careful here: most of the supported batch systems expect
	that jobs are submitting from a single shared filesystem.
	Changing to /tmp only works in the case of Condor.
	*/

	if(!scratch_dir) {
		if(batch_queue_type==BATCH_QUEUE_TYPE_CONDOR) {
			scratch_dir = string_format("/tmp/wq-pool-%d",getuid());
		} else {
			scratch_dir = string_format("wq-pool-%d",getuid());
		}
	}

	if(!create_dir(scratch_dir,0777)) {
		fprintf(stderr,"work_queue_factory: couldn't create %s: %s",scratch_dir,strerror(errno));
		return 1;
	}

	char cmd[1024];
	sprintf(cmd,"cp \"$(which work_queue_worker)\" '%s'",scratch_dir);
	if (system(cmd)) {
		fprintf(stderr, "work_queue_factory: please add work_queue_worker to your PATH.\n");
		exit(EXIT_FAILURE);
	}

	if(password_file) {
		sprintf(cmd,"cp %s %s/pwfile",password_file,scratch_dir);
		system(cmd);
	}

	if(chdir(scratch_dir)!=0) {
		fprintf(stderr,"work_queue_factory: couldn't chdir to %s: %s",scratch_dir,strerror(errno));
		return 1;
	}

	signal(SIGINT, handle_abort);
	signal(SIGQUIT, handle_abort);
	signal(SIGTERM, handle_abort);
	signal(SIGHUP, ignore_signal);

	queue = batch_queue_create(batch_queue_type);
	if(!queue) {
		fprintf(stderr,"work_queue_factory: couldn't establish queue type %s",batch_queue_type_to_string(batch_queue_type));
		return 1;
	}

	batch_queue_set_option(queue, "batch-options", batch_submit_options);
	batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL);
	set_worker_resources_options( queue );

	if (amazon_credentials != NULL) {
		batch_queue_set_option(queue, "amazon-credentials", amazon_credentials);
	}
	if (amazon_ami != NULL) {
		batch_queue_set_option(queue, "amazon-ami", amazon_ami);
	}

	if(condor_requirements != NULL && batch_queue_type != BATCH_QUEUE_TYPE_CONDOR) {
		debug(D_NOTICE, "condor_requirements will be ignored as workers will not be running in condor.");
	} else {
		batch_queue_set_option(queue, "condor-requirements", condor_requirements);
	}

	mainloop( queue );

	batch_queue_delete(queue);

	return 0;
}
Exemple #3
0
int main( int argc, char *argv[] )
{
	signed char c;

	const char *progname = "wavefront";

	debug_config(progname);

	progress_log_file = stdout;

	struct option long_options[] = {
		{"help",  no_argument, 0, 'h'},
		{"version", no_argument, 0, 'v'},
		{"debug", required_argument, 0, 'd'},
		{"jobs", required_argument, 0, 'n'},
		{"block-size", required_argument, 0, 'b'},
		{"debug-file", required_argument, 0, 'o'},
		{"log-file", required_argument, 0, 'l'},
		{"bitmap", required_argument, 0, 'B'},
		{"bitmap-interval", required_argument, 0, 'i'},
		{"auto", no_argument, 0, 'A'},
		{"local", no_argument, 0, 'L'},
		{"batch-type", required_argument, 0, 'T'},
		{"verify", no_argument, 0, 'V'},
        {0,0,0,0}
	};

	while((c=getopt_long(argc,argv,"n:b:d:o:l:B:i:qALDT:VX:Y:vh", long_options, NULL)) > -1) {
		switch(c) {
			case 'n':
				manual_max_jobs_running = atoi(optarg);
				break;
			case 'b':
				manual_block_size = atoi(optarg);
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'B':
				progress_bitmap_file = optarg;
				break;
			case 'i':
				progress_bitmap_interval = atoi(optarg);
				break;
			case 'l':
				progress_log_file = fopen(optarg,"w");
				if(!progress_log_file) {
					fprintf(stderr,"couldn't open %s: %s\n",optarg,strerror(errno));
					return 1;
				}
				break;
			case 'A':
				wavefront_mode = WAVEFRONT_MODE_AUTO;
				break;
			case 'L':
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
				break;
			case 'T':
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
				batch_system_type = batch_queue_type_from_string(optarg);
				if(batch_system_type==BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr,"unknown batch system type: %s\n",optarg);
					exit(1);
				}
				break;
			case 'V':
				verify_mode = 1;
				break;
			case 'X':
				xstart = atoi(optarg);
				break;
			case 'Y':
				ystart = atoi(optarg);
				break;
			case 'v':
				cctools_version_print(stdout, progname);
				exit(0);
				break;
			case 'h':
				show_help(progname);
				exit(0);
				break;
		}
	}

	cctools_version_debug(D_DEBUG, argv[0]);

	if( (argc-optind<3) ) {
		show_help(progname);
		exit(1);
	}

	function = argv[optind];
	xsize=atoi(argv[optind+1]);
	ysize=atoi(argv[optind+2]);
	total_cells = xsize*ysize;

	if(!verify_mode && !check_configuration(function,xsize,ysize)) exit(1);

	int ncpus = load_average_get_cpus();

	if(wavefront_mode!=WAVEFRONT_MODE_MULTICORE) {
		double task_time = measure_task_time();
		printf("Each function takes %.02lfs to run.\n",task_time);

		block_size = find_best_block_size(xsize,1000,2,task_time,average_dispatch_time);
		double distributed_time = wavefront_distributed_model(xsize,1000,2,task_time,block_size,average_dispatch_time);
		double multicore_time = wavefront_multicore_model(xsize,ncpus,task_time);
		double ideal_multicore_time = wavefront_multicore_model(xsize,xsize,task_time);
		double sequential_time = wavefront_multicore_model(xsize,1,task_time);

		printf("---------------------------------\n");
		printf("This workload would take:\n");
		printf("%.02lfs sequentially\n",sequential_time);
		printf("%.02lfs on this %d-core machine\n",multicore_time,ncpus);
		printf("%.02lfs on a %d-core machine\n",ideal_multicore_time,xsize);
		printf("%.02lfs on a 1000-node distributed system with block size %d\n",distributed_time,block_size);
		printf("---------------------------------\n");

		if(wavefront_mode==WAVEFRONT_MODE_AUTO) {
			if(multicore_time < distributed_time*2) {
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
			} else {
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
			}
		}
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		batch_system_type = BATCH_QUEUE_TYPE_LOCAL;
		max_jobs_running = ncpus;
	} else {
		max_jobs_running = 1000;
	}

	if(manual_block_size!=0) {
		block_size = manual_block_size;
	}

	if(manual_max_jobs_running!=0) {
		max_jobs_running = manual_max_jobs_running;
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		printf("Running in multicore mode with %d CPUs.\n",max_jobs_running);
	} else {
		printf("Running in distributed mode with block size %d on up to %d CPUs\n",block_size,max_jobs_running);
	}

	batch_q = batch_queue_create(batch_system_type);

	if(verify_mode) exit(0);

	struct bitmap * b = bitmap_create(xsize+1,ysize+1);
	struct list *ready_list = list_create();
	struct itable *running_table = itable_create(0);

	struct batch_job_info info;
	UINT64_T jobid;
	struct wavefront_task *task;

	wavefront_task_initialize(b,ready_list);

	printf("Starting workload...\n");

	fprintf(progress_log_file,"# elapsed time : waiting jobs / running jobs / cells complete (percent complete)\n");

	while(1) {

		if(abort_mode) {
			while((task=list_pop_tail(ready_list))) {
				wavefront_task_delete(task);
			}

			itable_firstkey(running_table);
			while(itable_nextkey(running_table,&jobid,(void**)&task)) {
				batch_job_remove(batch_q,jobid);
			}
		}

		if(list_size(ready_list)==0 && itable_size(running_table)==0) break;

		while(1) {
			if(itable_size(running_table)>=max_jobs_running) break;

			task = list_pop_tail(ready_list);
			if(!task) break;
			
			jobid = wavefront_task_submit(task);
			if(jobid>0) {
				itable_insert(running_table,jobid,task);
				wavefront_task_mark_range(task,b,WAVEFRONT_TASK_STATE_RUNNING);
			} else {
				abort();
				sleep(1);
				list_push_head(ready_list,task);
			}
		}


		save_status(b,ready_list,running_table);

		jobid = batch_job_wait(batch_q,&info);
		if(jobid>0) {
			task = itable_remove(running_table,jobid);
			if(task) {
				if(info.exited_normally && info.exit_code==0) {
					total_dispatch_time += info.started-info.submitted;
					total_execute_time += MAX(info.finished-info.started,1);
					total_cells_complete+=task->width*task->height;
					total_jobs_complete++;

					average_dispatch_time = 1.0*total_dispatch_time / total_jobs_complete;
					average_task_time = 1.0*total_execute_time / total_cells_complete;

					wavefront_task_complete(b,ready_list,task);
				} else {
					printf("job %" PRIu64 " failed, aborting this workload\n",jobid);
					abort_mode = 1;
				}
			}
		}
	}

	save_status(b,ready_list,running_table);

	if(abort_mode) {
		printf("Workload was aborted.\n");
	} else {
		printf("Workload complete.\n");
	}

	return 0;
}