int main(int argc, char *argv[]) { batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN; catalog_host = CATALOG_HOST; catalog_port = CATALOG_PORT; debug_config(argv[0]); int c; while((c = getopt_long(argc, argv, "F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) { switch (c) { case 'F': foremen_regex = optarg; break; case 'N': case 'M': project_regex = optarg; break; case 'T': batch_queue_type = batch_queue_type_from_string(optarg); if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) { fprintf(stderr, "unknown batch queue type: %s\n", optarg); return EXIT_FAILURE; } break; case 't': worker_timeout = atoi(optarg); break; case 'w': workers_min = atoi(optarg); break; case 'W': workers_max = atoi(optarg); break; case LONG_OPT_TASKS_PER_WORKER: tasks_per_worker = atof(optarg); break; case 'E': extra_worker_args = optarg; break; case LONG_OPT_CORES: num_cores_option = xxstrdup(optarg); break; case LONG_OPT_MEMORY: num_memory_option = xxstrdup(optarg); break; case LONG_OPT_DISK: num_disk_option = xxstrdup(optarg); break; case LONG_OPT_GPUS: num_gpus_option = xxstrdup(optarg); break; case 'P': password_file = optarg; break; case 'S': scratch_dir = optarg; break; case 'c': consider_capacity = 1; break; case 'd': debug_flags_set(optarg); break; case 'o': debug_config_file(optarg); break; case 'O': debug_config_file_size(string_metric_parse(optarg)); break; case 'v': cctools_version_print(stdout, argv[0]); exit(EXIT_SUCCESS); case 'h': show_help(argv[0]); exit(EXIT_SUCCESS); default: show_help(argv[0]); return EXIT_FAILURE; } } cctools_version_debug(D_DEBUG, argv[0]); if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) { fprintf(stderr,"work_queue_pool: You must specify a batch type with the -T option.\n"); fprintf(stderr, "valid options:\n"); fprintf(stderr, "%s\n", batch_queue_type_string()); return 1; } if(!project_regex) { fprintf(stderr,"work_queue_pool: You must give a project name with the -M option.\n"); return 1; } if(workers_min>workers_max) { fprintf(stderr,"work_queue_pool: --min-workers (%d) is greater than --max-workers (%d)\n",workers_min,workers_max); return 1; } if(tasks_per_worker < 1) { tasks_per_worker = num_cores_option ? atof(num_cores_option) : 1; } if(!scratch_dir) { scratch_dir = string_format("/tmp/wq-pool-%d",getuid()); } if(!create_dir(scratch_dir,0777)) { fprintf(stderr,"work_queue_pool: couldn't create %s: %s",scratch_dir,strerror(errno)); return 1; } char cmd[1024]; sprintf(cmd,"cp \"$(which work_queue_worker)\" '%s'",scratch_dir); if (system(cmd)) { fprintf(stderr, "work_queue_pool: please add work_queue_worker to your PATH.\n"); exit(EXIT_FAILURE); } if(password_file) { sprintf(cmd,"cp %s %s/pwfile",password_file,scratch_dir); system(cmd); } if(chdir(scratch_dir)!=0) { fprintf(stderr,"work_queue_pool: couldn't chdir to %s: %s",scratch_dir,strerror(errno)); return 1; } signal(SIGINT, handle_abort); signal(SIGQUIT, handle_abort); signal(SIGTERM, handle_abort); signal(SIGHUP, ignore_signal); struct batch_queue * queue = batch_queue_create(batch_queue_type); if(!queue) { fprintf(stderr,"work_queue_pool: couldn't establish queue type %s",batch_queue_type_to_string(batch_queue_type)); return 1; } set_worker_resources( queue ); mainloop( queue, project_regex, foremen_regex ); batch_queue_delete(queue); return 0; }
int main(int argc, char *argv[]) { batch_queue_type_t batch_queue_type = BATCH_QUEUE_TYPE_UNKNOWN; catalog_host = CATALOG_HOST; catalog_port = CATALOG_PORT; batch_submit_options = getenv("BATCH_OPTIONS"); debug_config(argv[0]); resources = rmsummary_create(-1); int c; while((c = getopt_long(argc, argv, "B:C:F:N:M:T:t:w:W:E:P:S:cd:o:O:vh", long_options, NULL)) > -1) { switch (c) { case 'B': batch_submit_options = xxstrdup(optarg); break; case 'C': config_file = xxstrdup(optarg); break; case 'F': foremen_regex = xxstrdup(optarg); break; case 'N': case 'M': project_regex = xxstrdup(optarg); break; case 'T': batch_queue_type = batch_queue_type_from_string(optarg); if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) { fprintf(stderr, "unknown batch queue type: %s\n", optarg); return EXIT_FAILURE; } break; case 't': worker_timeout = atoi(optarg); break; case 'w': workers_min = atoi(optarg); break; case 'W': workers_max = atoi(optarg); break; case LONG_OPT_WORKERS_PER_CYCLE: workers_per_cycle = atoi(optarg); break; case LONG_OPT_TASKS_PER_WORKER: tasks_per_worker = atof(optarg); break; case 'E': extra_worker_args = xxstrdup(optarg); break; case LONG_OPT_CORES: resources->cores = atoi(optarg); break; case LONG_OPT_AMAZON_CREDENTIALS: amazon_credentials = xxstrdup(optarg); break; case LONG_OPT_AMAZON_AMI: amazon_ami = xxstrdup(optarg); break; case LONG_OPT_MEMORY: resources->memory = atoi(optarg); break; case LONG_OPT_DISK: resources->disk = atoi(optarg); break; case LONG_OPT_GPUS: resources->gpus = atoi(optarg); break; case LONG_OPT_AUTOSIZE: autosize = 1; break; case LONG_OPT_FACTORY_TIMEOUT: factory_timeout = MAX(0, atoi(optarg)); break; case LONG_OPT_CONDOR_REQUIREMENTS: if(condor_requirements) { char *tmp = condor_requirements; condor_requirements = string_format("(%s && (%s))", tmp, optarg); free(tmp); } else { condor_requirements = string_format("(%s)", optarg); } break; case LONG_OPT_WRAPPER: wrapper_command = optarg; break; case LONG_OPT_WRAPPER_INPUT: if(!wrapper_input) { wrapper_input = strdup(optarg); } else { wrapper_input = string_format("%s,%s",wrapper_input,optarg); } break; case 'P': password_file = optarg; break; case 'S': scratch_dir = optarg; break; case 'c': consider_capacity = 1; break; case 'd': debug_flags_set(optarg); break; case 'o': debug_config_file(optarg); break; case 'O': debug_config_file_size(string_metric_parse(optarg)); break; case 'v': cctools_version_print(stdout, argv[0]); exit(EXIT_SUCCESS); case 'h': show_help(argv[0]); exit(EXIT_SUCCESS); default: show_help(argv[0]); return EXIT_FAILURE; } } if(project_regex) { using_catalog = 1; } else if((argc - optind) == 2) { master_host = argv[optind]; master_port = atoi(argv[optind+1]); } else { fprintf(stderr,"work_queue_factory: You must either give a project name with the -M option or master-name option with a configuration file, or give the master's host and port.\n"); show_help(argv[0]); exit(1); } cctools_version_debug(D_DEBUG, argv[0]); if(batch_queue_type == BATCH_QUEUE_TYPE_UNKNOWN) { fprintf(stderr,"work_queue_factory: You must specify a batch type with the -T option.\n"); fprintf(stderr, "valid options:\n"); fprintf(stderr, "%s\n", batch_queue_type_string()); return 1; } if(config_file) { char abs_path_name[PATH_MAX]; if(!realpath(config_file, abs_path_name)) { fprintf(stderr, "work_queue_factory: could not resolve configuration file path: '%s'.\n", config_file); exit(EXIT_FAILURE); } free(config_file); /* From now on, read config_file from absolute path */ config_file = xxstrdup(abs_path_name); if(!read_config_file(config_file)) { fprintf(stderr,"work_queue_factory: There were errors in the configuration file: %s\n", config_file); return 1; } } if(workers_min>workers_max) { fprintf(stderr,"work_queue_factory: min workers (%d) is greater than max workers (%d)\n",workers_min, workers_max); return 1; } /* Careful here: most of the supported batch systems expect that jobs are submitting from a single shared filesystem. Changing to /tmp only works in the case of Condor. */ if(!scratch_dir) { if(batch_queue_type==BATCH_QUEUE_TYPE_CONDOR) { scratch_dir = string_format("/tmp/wq-pool-%d",getuid()); } else { scratch_dir = string_format("wq-pool-%d",getuid()); } } if(!create_dir(scratch_dir,0777)) { fprintf(stderr,"work_queue_factory: couldn't create %s: %s",scratch_dir,strerror(errno)); return 1; } char cmd[1024]; sprintf(cmd,"cp \"$(which work_queue_worker)\" '%s'",scratch_dir); if (system(cmd)) { fprintf(stderr, "work_queue_factory: please add work_queue_worker to your PATH.\n"); exit(EXIT_FAILURE); } if(password_file) { sprintf(cmd,"cp %s %s/pwfile",password_file,scratch_dir); system(cmd); } if(chdir(scratch_dir)!=0) { fprintf(stderr,"work_queue_factory: couldn't chdir to %s: %s",scratch_dir,strerror(errno)); return 1; } signal(SIGINT, handle_abort); signal(SIGQUIT, handle_abort); signal(SIGTERM, handle_abort); signal(SIGHUP, ignore_signal); queue = batch_queue_create(batch_queue_type); if(!queue) { fprintf(stderr,"work_queue_factory: couldn't establish queue type %s",batch_queue_type_to_string(batch_queue_type)); return 1; } batch_queue_set_option(queue, "batch-options", batch_submit_options); batch_queue_set_option(queue, "autosize", autosize ? "yes" : NULL); set_worker_resources_options( queue ); if (amazon_credentials != NULL) { batch_queue_set_option(queue, "amazon-credentials", amazon_credentials); } if (amazon_ami != NULL) { batch_queue_set_option(queue, "amazon-ami", amazon_ami); } if(condor_requirements != NULL && batch_queue_type != BATCH_QUEUE_TYPE_CONDOR) { debug(D_NOTICE, "condor_requirements will be ignored as workers will not be running in condor."); } else { batch_queue_set_option(queue, "condor-requirements", condor_requirements); } mainloop( queue ); batch_queue_delete(queue); return 0; }
int main( int argc, char *argv[] ) { signed char c; const char *progname = "wavefront"; debug_config(progname); progress_log_file = stdout; struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"version", no_argument, 0, 'v'}, {"debug", required_argument, 0, 'd'}, {"jobs", required_argument, 0, 'n'}, {"block-size", required_argument, 0, 'b'}, {"debug-file", required_argument, 0, 'o'}, {"log-file", required_argument, 0, 'l'}, {"bitmap", required_argument, 0, 'B'}, {"bitmap-interval", required_argument, 0, 'i'}, {"auto", no_argument, 0, 'A'}, {"local", no_argument, 0, 'L'}, {"batch-type", required_argument, 0, 'T'}, {"verify", no_argument, 0, 'V'}, {0,0,0,0} }; while((c=getopt_long(argc,argv,"n:b:d:o:l:B:i:qALDT:VX:Y:vh", long_options, NULL)) > -1) { switch(c) { case 'n': manual_max_jobs_running = atoi(optarg); break; case 'b': manual_block_size = atoi(optarg); break; case 'd': debug_flags_set(optarg); break; case 'o': debug_config_file(optarg); break; case 'B': progress_bitmap_file = optarg; break; case 'i': progress_bitmap_interval = atoi(optarg); break; case 'l': progress_log_file = fopen(optarg,"w"); if(!progress_log_file) { fprintf(stderr,"couldn't open %s: %s\n",optarg,strerror(errno)); return 1; } break; case 'A': wavefront_mode = WAVEFRONT_MODE_AUTO; break; case 'L': wavefront_mode = WAVEFRONT_MODE_MULTICORE; break; case 'T': wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED; batch_system_type = batch_queue_type_from_string(optarg); if(batch_system_type==BATCH_QUEUE_TYPE_UNKNOWN) { fprintf(stderr,"unknown batch system type: %s\n",optarg); exit(1); } break; case 'V': verify_mode = 1; break; case 'X': xstart = atoi(optarg); break; case 'Y': ystart = atoi(optarg); break; case 'v': cctools_version_print(stdout, progname); exit(0); break; case 'h': show_help(progname); exit(0); break; } } cctools_version_debug(D_DEBUG, argv[0]); if( (argc-optind<3) ) { show_help(progname); exit(1); } function = argv[optind]; xsize=atoi(argv[optind+1]); ysize=atoi(argv[optind+2]); total_cells = xsize*ysize; if(!verify_mode && !check_configuration(function,xsize,ysize)) exit(1); int ncpus = load_average_get_cpus(); if(wavefront_mode!=WAVEFRONT_MODE_MULTICORE) { double task_time = measure_task_time(); printf("Each function takes %.02lfs to run.\n",task_time); block_size = find_best_block_size(xsize,1000,2,task_time,average_dispatch_time); double distributed_time = wavefront_distributed_model(xsize,1000,2,task_time,block_size,average_dispatch_time); double multicore_time = wavefront_multicore_model(xsize,ncpus,task_time); double ideal_multicore_time = wavefront_multicore_model(xsize,xsize,task_time); double sequential_time = wavefront_multicore_model(xsize,1,task_time); printf("---------------------------------\n"); printf("This workload would take:\n"); printf("%.02lfs sequentially\n",sequential_time); printf("%.02lfs on this %d-core machine\n",multicore_time,ncpus); printf("%.02lfs on a %d-core machine\n",ideal_multicore_time,xsize); printf("%.02lfs on a 1000-node distributed system with block size %d\n",distributed_time,block_size); printf("---------------------------------\n"); if(wavefront_mode==WAVEFRONT_MODE_AUTO) { if(multicore_time < distributed_time*2) { wavefront_mode = WAVEFRONT_MODE_MULTICORE; } else { wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED; } } } if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) { batch_system_type = BATCH_QUEUE_TYPE_LOCAL; max_jobs_running = ncpus; } else { max_jobs_running = 1000; } if(manual_block_size!=0) { block_size = manual_block_size; } if(manual_max_jobs_running!=0) { max_jobs_running = manual_max_jobs_running; } if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) { printf("Running in multicore mode with %d CPUs.\n",max_jobs_running); } else { printf("Running in distributed mode with block size %d on up to %d CPUs\n",block_size,max_jobs_running); } batch_q = batch_queue_create(batch_system_type); if(verify_mode) exit(0); struct bitmap * b = bitmap_create(xsize+1,ysize+1); struct list *ready_list = list_create(); struct itable *running_table = itable_create(0); struct batch_job_info info; UINT64_T jobid; struct wavefront_task *task; wavefront_task_initialize(b,ready_list); printf("Starting workload...\n"); fprintf(progress_log_file,"# elapsed time : waiting jobs / running jobs / cells complete (percent complete)\n"); while(1) { if(abort_mode) { while((task=list_pop_tail(ready_list))) { wavefront_task_delete(task); } itable_firstkey(running_table); while(itable_nextkey(running_table,&jobid,(void**)&task)) { batch_job_remove(batch_q,jobid); } } if(list_size(ready_list)==0 && itable_size(running_table)==0) break; while(1) { if(itable_size(running_table)>=max_jobs_running) break; task = list_pop_tail(ready_list); if(!task) break; jobid = wavefront_task_submit(task); if(jobid>0) { itable_insert(running_table,jobid,task); wavefront_task_mark_range(task,b,WAVEFRONT_TASK_STATE_RUNNING); } else { abort(); sleep(1); list_push_head(ready_list,task); } } save_status(b,ready_list,running_table); jobid = batch_job_wait(batch_q,&info); if(jobid>0) { task = itable_remove(running_table,jobid); if(task) { if(info.exited_normally && info.exit_code==0) { total_dispatch_time += info.started-info.submitted; total_execute_time += MAX(info.finished-info.started,1); total_cells_complete+=task->width*task->height; total_jobs_complete++; average_dispatch_time = 1.0*total_dispatch_time / total_jobs_complete; average_task_time = 1.0*total_execute_time / total_cells_complete; wavefront_task_complete(b,ready_list,task); } else { printf("job %" PRIu64 " failed, aborting this workload\n",jobid); abort_mode = 1; } } } } save_status(b,ready_list,running_table); if(abort_mode) { printf("Workload was aborted.\n"); } else { printf("Workload complete.\n"); } return 0; }