예제 #1
0
void save_status( struct bitmap *b, struct list *ready_list, struct itable *running_table )
{
	static time_t last_saved = 0;
	static time_t start_time = 0;

	time_t current = time(0);
	if(!start_time) start_time = current;

	if(progress_bitmap_file) {
		if((current-last_saved) >= progress_bitmap_interval) {
			bitmap_save_bmp(b,progress_bitmap_file);
		}
	}

	fprintf(progress_log_file,
		"%.2lf %% %d s %d %d %d %.02lf %.02lf\n",
		100.0*total_cells_complete/total_cells,
		(int)(current-start_time),
		list_size(ready_list),
		itable_size(running_table),
		total_cells_complete,
		average_dispatch_time,
		average_task_time);
	fflush(0);
}
예제 #2
0
int makeflow_catalog_summary(struct dag* d, char* name, batch_queue_type_t type, timestamp_t start){
    struct dag_node *n;
    dag_node_state_t state;
    
    int tasks_completed = 0;
    int tasks_aborted   = 0;
    int tasks_waiting   = 0;
    int tasks_running   = 0;
    int tasks_failed    = 0;

    for (n = d->nodes; n; n = n->next) {
        state = n->state;
        if (state == DAG_NODE_STATE_FAILED)
            tasks_failed++;
        else if (state == DAG_NODE_STATE_ABORTED)
            tasks_aborted++;
        else if (state == DAG_NODE_STATE_COMPLETE) 
            tasks_completed++;
        else if(state == DAG_NODE_STATE_RUNNING)
            tasks_running++;
        else if(state == DAG_NODE_STATE_WAITING)
            tasks_waiting++;
    }
    
    //transmit report here
    char* host = CATALOG_HOST;
    
    char username[USERNAME_MAX];
    username_get(username);
    
    const char* batch_type = batch_queue_type_to_string(type);
    
    struct jx *j = jx_object(0);
    
    jx_insert_string(j,"type","makeflow");
    jx_insert_integer(j,"total",itable_size(d->node_table));
    jx_insert_integer(j,"running",tasks_running);
    jx_insert_integer(j,"waiting",tasks_waiting);
    jx_insert_integer(j,"aborted",tasks_aborted);
    jx_insert_integer(j,"completed",tasks_completed);
    jx_insert_integer(j,"failed",tasks_failed);
    jx_insert_string(j,"project",name);
    jx_insert_string(j,"owner",username);
    char* timestring = string_format("%" PRIu64 "", start);
    jx_insert_string(j,"time_started",timestring);
    jx_insert_string(j,"batch_type",batch_type);
    
    
    
    //creates memory
    char* text = jx_print_string(j);
    
    int resp = catalog_query_send_update(host, text);
    
    free(text);
    free(timestring);
    jx_delete(j);
    
    return resp;//all good
}
예제 #3
0
static batch_job_id_t batch_job_cluster_wait (struct batch_queue * q, struct batch_job_info * info_out, time_t stoptime)
{
	struct batch_job_info *info;
	batch_job_id_t jobid;
	int t, c;

	while(1) {
		UINT64_T ujobid;
		itable_firstkey(q->job_table);
		while(itable_nextkey(q->job_table, &ujobid, (void **) &info)) {
			jobid = ujobid;
			char *statusfile = string_format("%s.status.%" PRIbjid, cluster_name, jobid);
			FILE *file = fopen(statusfile, "r");
			if(file) {
				char line[BATCH_JOB_LINE_MAX];
				while(fgets(line, sizeof(line), file)) {
					if(sscanf(line, "start %d", &t)) {
						info->started = t;
					} else if(sscanf(line, "stop %d %d", &c, &t) == 2) {
						debug(D_BATCH, "job %" PRIbjid " complete", jobid);
						if(!info->started)
							info->started = t;
						info->finished = t;
						info->exited_normally = 1;
						info->exit_code = c;
					}
				}
				fclose(file);

				if(info->finished != 0) {
					unlink(statusfile);
					info = itable_remove(q->job_table, jobid);
					*info_out = *info;
					free(info);
					free(statusfile);
					return jobid;
				}
			} else {
				debug(D_BATCH, "could not open status file \"%s\"", statusfile);
			}

			free(statusfile);
		}

		if(itable_size(q->job_table) <= 0)
			return 0;

		if(stoptime != 0 && time(0) >= stoptime)
			return -1;

		if(process_pending())
			return -1;

		sleep(1);
	}

	return -1;
}
예제 #4
0
jint ClassInfo::itable_size() {
  int nof_methods = 0;
  for (int index = 0; index < itable_length(); index++) {
    InstanceClass::Raw ic = itable_interface_at(index);
    ObjArray::Raw methods = ic().methods();
    nof_methods += methods().length();
  }
  return itable_size(itable_length(), nof_methods);
}
예제 #5
0
파일: histogram.c 프로젝트: dthain/cctools
int histogram_size(struct histogram *h) {
	int count = 0;

	if(h->buckets) {
		count += itable_size(h->buckets);
	}

	return count;
}
예제 #6
0
void remove_all_workers( struct batch_queue *queue, struct itable *job_table )
{
	uint64_t jobid;
	void *value;

	debug(D_WQ,"removing all remaining worker jobs...");
	int count = itable_size(job_table);
	itable_firstkey(job_table);
	while(itable_nextkey(job_table,&jobid,&value)) {
		debug(D_WQ,"removing job %"PRId64,jobid);
		batch_job_remove(queue,jobid);
	}
	debug(D_WQ,"%d workers removed.",count);

}
예제 #7
0
struct cluster *nearest_neighbor_clustering(struct list *initial_clusters, double (*cmp)(struct cluster *, struct cluster *))
{
	struct cluster *top, *closest, *subtop;
	struct list   *stack;
	struct itable *active_clusters;
	double dclosest, dsubtop;

	int merge = 0;

	list_first_item(initial_clusters);
	top = list_next_item(initial_clusters);

	/* Return immediately if top is NULL, or there is a unique
	 * initial cluster */
	if(list_size(initial_clusters) < 2)
		return top;

	stack = list_create(0);
	list_push_head(stack, top);

	/* Add all of the initial clusters as active clusters. */
	active_clusters = itable_create(0);
	while( (top = list_next_item(initial_clusters)) ) 
		itable_insert(active_clusters, (uintptr_t) top, (void *) top);

	do
	{
		/* closest might be NULL if all of the clusters are in
		 * the stack now. subtop might be NULL if top was the
		 * only cluster in the stack */
		top     = list_pop_head( stack );
		closest = cluster_nearest_neighbor(active_clusters, top, cmp);
		subtop  = list_peek_head( stack );

		dclosest = -1;
		dsubtop  = -1;

		if(closest)
			dclosest = cluster_ward_distance(top, closest);

		if(subtop)
			dsubtop = cluster_ward_distance(top, subtop);

		/* The nearest neighbor of top is either one of the
		 * remaining active clusters, or the second topmost
		 * cluster in the stack */
		if( closest && subtop )
		{
			/* Use pointer address to systematically break ties. */
			if(dclosest < dsubtop || ((dclosest == dsubtop) && (uintptr_t)closest < (uintptr_t)subtop)) 
				merge = 0;
			else 
				merge = 1;
		}
		else if( subtop )
			merge = 1;
		else if( closest )
			merge = 0;
		else
			fatal("Zero clusters?\n"); //We should never reach here.

		if(merge)
		{
			/* If the two topmost clusters in the stack are
			 * mutual nearest neighbors, merge them into a single
			 * cluster */
			subtop = list_pop_head( stack );
			list_push_head(stack, cluster_merge(top, subtop));
		}
		else
		{
			/* Otherwise, push the nearest neighbor of top to the
			 * stack */
			itable_remove(active_clusters, (uintptr_t) closest);
			list_push_head(stack, top);
			list_push_head(stack, closest);
		}

		debug(D_DEBUG, "stack: %d  active: %d  closest: %lf subtop: %lf\n", 
				list_size(stack), itable_size(active_clusters), dclosest, dsubtop);

		/* If there are no more active_clusters, but there is not
		 * a single cluster in the stack, we try again,
		 * converting the clusters in the stack into new active
		 * clusters. */
		if(itable_size(active_clusters) == 0 && list_size(stack) > 3)
		{
			itable_delete(active_clusters);
			return nearest_neighbor_clustering(stack, cmp);
		}

	}while( !(itable_size(active_clusters) == 0 && list_size(stack) == 1) );

	/* top is now the root of a cluster hierarchy, of
	 * cluster->right, cluster->left. */
	top = list_pop_head(stack);

	list_delete(stack);
	itable_delete(active_clusters);

	return top;
}
예제 #8
0
static batch_job_id_t batch_job_condor_wait (struct batch_queue * q, struct batch_job_info * info_out, time_t stoptime)
{
	static FILE *logfile = 0;

	if(!logfile) {
		logfile = fopen(q->logfile, "r");
		if(!logfile) {
			debug(D_NOTICE, "couldn't open logfile %s: %s\n", q->logfile, strerror(errno));
			return -1;
		}
	}

	while(1) {
		/*
		   Note: clearerr is necessary to clear any cached end-of-file condition,
		   otherwise some implementations of fgets (i.e. darwin) will read to end
		   of file once and then never look for any more data.
		 */

		clearerr(logfile);

		char line[BATCH_JOB_LINE_MAX];
		while(fgets(line, sizeof(line), logfile)) {
			int type, proc, subproc;
			batch_job_id_t jobid;
			time_t current;
			struct tm tm;

			struct batch_job_info *info;
			int logcode, exitcode;

			if(sscanf(line, "%d (%" SCNbjid ".%d.%d) %d/%d %d:%d:%d", &type, &jobid, &proc, &subproc, &tm.tm_mon, &tm.tm_mday, &tm.tm_hour, &tm.tm_min, &tm.tm_sec) == 9) {
				tm.tm_year = 2008 - 1900;
				tm.tm_isdst = 0;

				current = mktime(&tm);

				info = itable_lookup(q->job_table, jobid);
				if(!info) {
					info = malloc(sizeof(*info));
					memset(info, 0, sizeof(*info));
					itable_insert(q->job_table, jobid, info);
				}

				debug(D_BATCH, "line: %s", line);

				if(type == 0) {
					info->submitted = current;
				} else if(type == 1) {
					info->started = current;
					debug(D_BATCH, "job %" PRIbjid " running now", jobid);
				} else if(type == 9) {
					itable_remove(q->job_table, jobid);

					info->finished = current;
					info->exited_normally = 0;
					info->exit_signal = SIGKILL;

					debug(D_BATCH, "job %" PRIbjid " was removed", jobid);

					memcpy(info_out, info, sizeof(*info));
					free(info);
					return jobid;
				} else if(type == 5) {
					itable_remove(q->job_table, jobid);

					info->finished = current;

					fgets(line, sizeof(line), logfile);
					if(sscanf(line, " (%d) Normal termination (return value %d)", &logcode, &exitcode) == 2) {
						debug(D_BATCH, "job %" PRIbjid " completed normally with status %d.", jobid, exitcode);
						info->exited_normally = 1;
						info->exit_code = exitcode;
					} else if(sscanf(line, " (%d) Abnormal termination (signal %d)", &logcode, &exitcode) == 2) {
						debug(D_BATCH, "job %" PRIbjid " completed abnormally with signal %d.", jobid, exitcode);
						info->exited_normally = 0;
						info->exit_signal = exitcode;
					} else {
						debug(D_BATCH, "job %" PRIbjid " completed with unknown status.", jobid);
						info->exited_normally = 0;
						info->exit_signal = 0;
					}

					memcpy(info_out, info, sizeof(*info));
					free(info);
					return jobid;
				}
			}
		}


		if(itable_size(q->job_table) <= 0)
			return 0;

		if(stoptime != 0 && time(0) >= stoptime)
			return -1;

		if(process_pending())
			return -1;

		sleep(1);
	}

	return -1;
}
예제 #9
0
int main( int argc, char *argv[] )
{
	signed char c;

	const char *progname = "wavefront";

	debug_config(progname);

	progress_log_file = stdout;

	struct option long_options[] = {
		{"help",  no_argument, 0, 'h'},
		{"version", no_argument, 0, 'v'},
		{"debug", required_argument, 0, 'd'},
		{"jobs", required_argument, 0, 'n'},
		{"block-size", required_argument, 0, 'b'},
		{"debug-file", required_argument, 0, 'o'},
		{"log-file", required_argument, 0, 'l'},
		{"bitmap", required_argument, 0, 'B'},
		{"bitmap-interval", required_argument, 0, 'i'},
		{"auto", no_argument, 0, 'A'},
		{"local", no_argument, 0, 'L'},
		{"batch-type", required_argument, 0, 'T'},
		{"verify", no_argument, 0, 'V'},
        {0,0,0,0}
	};

	while((c=getopt_long(argc,argv,"n:b:d:o:l:B:i:qALDT:VX:Y:vh", long_options, NULL)) > -1) {
		switch(c) {
			case 'n':
				manual_max_jobs_running = atoi(optarg);
				break;
			case 'b':
				manual_block_size = atoi(optarg);
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'B':
				progress_bitmap_file = optarg;
				break;
			case 'i':
				progress_bitmap_interval = atoi(optarg);
				break;
			case 'l':
				progress_log_file = fopen(optarg,"w");
				if(!progress_log_file) {
					fprintf(stderr,"couldn't open %s: %s\n",optarg,strerror(errno));
					return 1;
				}
				break;
			case 'A':
				wavefront_mode = WAVEFRONT_MODE_AUTO;
				break;
			case 'L':
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
				break;
			case 'T':
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
				batch_system_type = batch_queue_type_from_string(optarg);
				if(batch_system_type==BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr,"unknown batch system type: %s\n",optarg);
					exit(1);
				}
				break;
			case 'V':
				verify_mode = 1;
				break;
			case 'X':
				xstart = atoi(optarg);
				break;
			case 'Y':
				ystart = atoi(optarg);
				break;
			case 'v':
				cctools_version_print(stdout, progname);
				exit(0);
				break;
			case 'h':
				show_help(progname);
				exit(0);
				break;
		}
	}

	cctools_version_debug(D_DEBUG, argv[0]);

	if( (argc-optind<3) ) {
		show_help(progname);
		exit(1);
	}

	function = argv[optind];
	xsize=atoi(argv[optind+1]);
	ysize=atoi(argv[optind+2]);
	total_cells = xsize*ysize;

	if(!verify_mode && !check_configuration(function,xsize,ysize)) exit(1);

	int ncpus = load_average_get_cpus();

	if(wavefront_mode!=WAVEFRONT_MODE_MULTICORE) {
		double task_time = measure_task_time();
		printf("Each function takes %.02lfs to run.\n",task_time);

		block_size = find_best_block_size(xsize,1000,2,task_time,average_dispatch_time);
		double distributed_time = wavefront_distributed_model(xsize,1000,2,task_time,block_size,average_dispatch_time);
		double multicore_time = wavefront_multicore_model(xsize,ncpus,task_time);
		double ideal_multicore_time = wavefront_multicore_model(xsize,xsize,task_time);
		double sequential_time = wavefront_multicore_model(xsize,1,task_time);

		printf("---------------------------------\n");
		printf("This workload would take:\n");
		printf("%.02lfs sequentially\n",sequential_time);
		printf("%.02lfs on this %d-core machine\n",multicore_time,ncpus);
		printf("%.02lfs on a %d-core machine\n",ideal_multicore_time,xsize);
		printf("%.02lfs on a 1000-node distributed system with block size %d\n",distributed_time,block_size);
		printf("---------------------------------\n");

		if(wavefront_mode==WAVEFRONT_MODE_AUTO) {
			if(multicore_time < distributed_time*2) {
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
			} else {
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
			}
		}
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		batch_system_type = BATCH_QUEUE_TYPE_LOCAL;
		max_jobs_running = ncpus;
	} else {
		max_jobs_running = 1000;
	}

	if(manual_block_size!=0) {
		block_size = manual_block_size;
	}

	if(manual_max_jobs_running!=0) {
		max_jobs_running = manual_max_jobs_running;
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		printf("Running in multicore mode with %d CPUs.\n",max_jobs_running);
	} else {
		printf("Running in distributed mode with block size %d on up to %d CPUs\n",block_size,max_jobs_running);
	}

	batch_q = batch_queue_create(batch_system_type);

	if(verify_mode) exit(0);

	struct bitmap * b = bitmap_create(xsize+1,ysize+1);
	struct list *ready_list = list_create();
	struct itable *running_table = itable_create(0);

	struct batch_job_info info;
	UINT64_T jobid;
	struct wavefront_task *task;

	wavefront_task_initialize(b,ready_list);

	printf("Starting workload...\n");

	fprintf(progress_log_file,"# elapsed time : waiting jobs / running jobs / cells complete (percent complete)\n");

	while(1) {

		if(abort_mode) {
			while((task=list_pop_tail(ready_list))) {
				wavefront_task_delete(task);
			}

			itable_firstkey(running_table);
			while(itable_nextkey(running_table,&jobid,(void**)&task)) {
				batch_job_remove(batch_q,jobid);
			}
		}

		if(list_size(ready_list)==0 && itable_size(running_table)==0) break;

		while(1) {
			if(itable_size(running_table)>=max_jobs_running) break;

			task = list_pop_tail(ready_list);
			if(!task) break;
			
			jobid = wavefront_task_submit(task);
			if(jobid>0) {
				itable_insert(running_table,jobid,task);
				wavefront_task_mark_range(task,b,WAVEFRONT_TASK_STATE_RUNNING);
			} else {
				abort();
				sleep(1);
				list_push_head(ready_list,task);
			}
		}


		save_status(b,ready_list,running_table);

		jobid = batch_job_wait(batch_q,&info);
		if(jobid>0) {
			task = itable_remove(running_table,jobid);
			if(task) {
				if(info.exited_normally && info.exit_code==0) {
					total_dispatch_time += info.started-info.submitted;
					total_execute_time += MAX(info.finished-info.started,1);
					total_cells_complete+=task->width*task->height;
					total_jobs_complete++;

					average_dispatch_time = 1.0*total_dispatch_time / total_jobs_complete;
					average_task_time = 1.0*total_execute_time / total_cells_complete;

					wavefront_task_complete(b,ready_list,task);
				} else {
					printf("job %" PRIu64 " failed, aborting this workload\n",jobid);
					abort_mode = 1;
				}
			}
		}
	}

	save_status(b,ready_list,running_table);

	if(abort_mode) {
		printf("Workload was aborted.\n");
	} else {
		printf("Workload complete.\n");
	}

	return 0;
}
예제 #10
0
파일: dag.c 프로젝트: Macainian/cctools
int dag_local_jobs_running( struct dag *d )
{
	return itable_size(d->local_job_table);
}
예제 #11
0
파일: dag.c 프로젝트: Macainian/cctools
int dag_remote_jobs_running( struct dag *d )
{
	return itable_size(d->remote_job_table);
}
예제 #12
0
struct mpi_queue_task *mpi_queue_wait(struct mpi_queue *q, int timeout)
{
	struct mpi_queue_task *t;
	time_t stoptime;
	int result;

	if(timeout == MPI_QUEUE_WAITFORTASK) {
		stoptime = 0;
	} else {
		stoptime = time(0) + timeout;
	}


	while(1) {
		// If a task is already complete, return it
		t = list_pop_head(q->complete_list);
		if(t)
			return t;

		if(list_size(q->ready_list) == 0 && itable_size(q->active_list) == 0)
			break;

		// Wait no longer than the caller's patience.
		int msec;
		int sec;
		if(stoptime) {
			sec = MAX(0, stoptime - time(0));
			msec = sec * 1000;
		} else {
			sec = 5;
			msec = 5000;
		}

		if(!q->mpi_link) {
			q->mpi_link = link_accept(q->master_link, stoptime);
			if(q->mpi_link) {
				char working_dir[MPI_QUEUE_LINE_MAX];
				link_tune(q->mpi_link, LINK_TUNE_INTERACTIVE);
				link_usleep(q->mpi_link, msec, 0, 1);
				getcwd(working_dir, MPI_QUEUE_LINE_MAX);
				link_putfstring(q->mpi_link, "workdir %s\n", stoptime, working_dir);
				result = link_usleep(q->mpi_link, msec, 1, 1);
			} else {
				result = 0;
			}
		} else {
			debug(D_MPI, "Waiting for link to be ready\n");
			result = link_usleep(q->mpi_link, msec, 1, 1);
		}

		// If nothing was awake, restart the loop or return without a task.
		if(result <= 0) {
			if(stoptime && time(0) >= stoptime) {
				return 0;
			} else {
				continue;
			}
		}

		debug(D_MPI, "sending %d tasks to the MPI master process\n", list_size(q->ready_list));
		// Send all ready tasks to the MPI master process
		while(list_size(q->ready_list)) {
			struct mpi_queue_task *t = list_pop_head(q->ready_list);
			result = dispatch_task(q->mpi_link, t, msec/1000);
			if(result <= 0)
				return 0;
			itable_insert(q->active_list, t->taskid, t);
		}

		// Receive any results back
		result = get_results(q->mpi_link, q->active_list, q->complete_list, msec/1000);
		if(result < 0) {
			return 0;
		}
	}

	return 0;
}
예제 #13
0
int mpi_queue_empty(struct mpi_queue *q)
{
	return ((list_size(q->ready_list) + itable_size(q->active_list) + list_size(q->complete_list)) == 0);
}
예제 #14
0
void makeflow_summary_create(struct dag *d, const char *filename, const char *email_summary_to, timestamp_t runtime, timestamp_t time_completed, int argc, char *argv[], const char *dagfile, struct batch_queue *remote_queue, int abort_flag, int failed_flag )
{
	char buffer[50];

	FILE *summary_file = NULL;
	FILE *summary_email = NULL;

	if(filename)
		summary_file = fopen(filename, "w");

	if(email_summary_to) {
		summary_email = popen("sendmail -t", "w");
		fprintf(summary_email, "To: %s\n", email_summary_to);
		timestamp_fmt(buffer, 50, "%c", time_completed);
		fprintf(summary_email, "Subject: Makeflow Run Summary - %s \n", buffer);
	}

	int i;

	for(i = 0; i < argc; i++)
		summarize(summary_file, summary_email, "%s ", argv[i]);

	summarize(summary_file, summary_email, "\n");

	if(abort_flag)
		summarize(summary_file, summary_email, "Workflow aborted:\t ");
	else if(failed_flag)
		summarize(summary_file, summary_email, "Workflow failed:\t ");
	else
		summarize(summary_file, summary_email, "Workflow completed:\t ");
	timestamp_fmt(buffer, 50, "%c\n", time_completed);
	summarize(summary_file, summary_email, "%s", buffer);

	int seconds = runtime / 1000000;
	int hours = seconds / 3600;
	int minutes = (seconds - hours * 3600) / 60;
	seconds = seconds - hours * 3600 - minutes * 60;
	summarize(summary_file, summary_email, "Total runtime:\t\t %d:%02d:%02d\n", hours, minutes, seconds);

	summarize(summary_file, summary_email, "Workflow file:\t\t %s\n", dagfile);

	struct dag_node *n;
	struct dag_file *f;
	const char *fn;
	dag_node_state_t state;
	struct list *output_files;
	output_files = list_create();
	struct list *failed_tasks;
	failed_tasks = list_create();
	int total_tasks = itable_size(d->node_table);
	int tasks_completed = 0;
	int tasks_aborted = 0;
	int tasks_unrun = 0;

	for(n = d->nodes; n; n = n->next) {
		state = n->state;
		if(state == DAG_NODE_STATE_FAILED && !list_find(failed_tasks, (int (*)(void *, const void *)) string_equal, (void *) fn))
			list_push_tail(failed_tasks, (void *) n->command);
		else if(state == DAG_NODE_STATE_ABORTED)
			tasks_aborted++;
		else if(state == DAG_NODE_STATE_COMPLETE) {
			tasks_completed++;
			list_first_item(n->source_files);
			while((f = list_next_item(n->source_files))) {
				fn = f->filename;
				if(!list_find(output_files, (int (*)(void *, const void *)) string_equal, (void *) fn))
					list_push_tail(output_files, (void *) fn);
			}
		} else
			tasks_unrun++;
	}

	summarize(summary_file, summary_email, "Number of tasks:\t %d\n", total_tasks);
	summarize(summary_file, summary_email, "Completed tasks:\t %d/%d\n", tasks_completed, total_tasks);
	if(tasks_aborted != 0)
		summarize(summary_file, summary_email, "Aborted tasks:\t %d/%d\n", tasks_aborted, total_tasks);
	if(tasks_unrun != 0)
		summarize(summary_file, summary_email, "Tasks not run:\t\t %d/%d\n", tasks_unrun, total_tasks);
	if(list_size(failed_tasks) > 0)
		summarize(summary_file, summary_email, "Failed tasks:\t\t %d/%d\n", list_size(failed_tasks), total_tasks);
	for(list_first_item(failed_tasks); (fn = list_next_item(failed_tasks)) != NULL;)
		summarize(summary_file, summary_email, "\t%s\n", fn);

	if(list_size(output_files) > 0) {
		summarize(summary_file, summary_email, "Output files:\n");
		for(list_first_item(output_files); (fn = list_next_item(output_files)) != NULL;) {
			const char *size;
			struct stat buf;
			batch_fs_stat(remote_queue, fn, &buf);
			size = string_metric(buf.st_size, -1, NULL);
			summarize(summary_file, summary_email, "\t%s\t%s\n", fn, size);
		}
	}

	list_free(output_files);
	list_delete(output_files);
	list_free(failed_tasks);
	list_delete(failed_tasks);

	if(filename) {
		fprintf(stderr, "writing summary to %s.\n", filename);
		fclose(summary_file);
	}

	if(email_summary_to) {
		fprintf(stderr, "emailing summary to %s.\n", email_summary_to);
		fclose(summary_email);
	}
}
예제 #15
0
int master_main(const char *host, int port, const char *addr) {
	time_t idle_stoptime;
	struct link *master = NULL;
	int num_workers, i;
	struct mpi_queue_job **workers;

	struct itable *active_jobs = itable_create(0);
	struct itable *waiting_jobs = itable_create(0);
	struct list   *complete_jobs = list_create();

	MPI_Comm_size(MPI_COMM_WORLD, &num_workers);

	workers = malloc(num_workers * sizeof(*workers));
	memset(workers, 0, num_workers * sizeof(*workers));	
	
	idle_stoptime = time(0) + idle_timeout;

	while(!abort_flag) {
		char line[MPI_QUEUE_LINE_MAX];

		if(time(0) > idle_stoptime) {
			if(master) {
				printf("mpi master: gave up after waiting %ds to receive a task.\n", idle_timeout);
			} else {
				printf("mpi master: gave up after waiting %ds to connect to %s port %d.\n", idle_timeout, host, port);
			}
			break;
		}


		if(!master) {
			char working_dir[MPI_QUEUE_LINE_MAX];
			master = link_connect(addr, port, idle_stoptime);
			if(!master) {
				sleep(5);
				continue;
			}

			link_tune(master, LINK_TUNE_INTERACTIVE);
			
			link_readline(master, line, sizeof(line), time(0) + active_timeout);

			memset(working_dir, 0, MPI_QUEUE_LINE_MAX);
			if(sscanf(line, "workdir %s", working_dir) == 1) {
				MPI_Bcast(working_dir, MPI_QUEUE_LINE_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
			} else {
				link_close(master);
				master = NULL;
				continue;
			}
		}
		
		if(link_readline(master, line, sizeof(line), time(0) + short_timeout)) {
			struct mpi_queue_operation *op;
			int jobid, mode;
			INT64_T length;
			char path[MPI_QUEUE_LINE_MAX];
			op = NULL;
			
			debug(D_MPI, "received: %s\n", line);

			if(!strcmp(line, "get results")) {
				struct mpi_queue_job *job;
				debug(D_MPI, "results requested: %d available\n", list_size(complete_jobs));
				link_putfstring(master, "num results %d\n", time(0) + active_timeout, list_size(complete_jobs));
				while(list_size(complete_jobs)) {
					job = list_pop_head(complete_jobs);
					link_putfstring(master, "result %d %d %d %lld\n", time(0) + active_timeout, job->jobid, job->status, job->result, job->output_length);
					if(job->output_length) {
						link_write(master, job->output, job->output_length, time(0)+active_timeout);
					}
					mpi_queue_job_delete(job);
				}

			} else if(sscanf(line, "work %d %lld", &jobid, &length)) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_WORK;
				op->buffer_length = length+1;
				op->buffer = malloc(length+1);
				op->buffer[op->buffer_length] = 0;
				link_read(master, op->buffer, length, time(0) + active_timeout);
				op->result = -1;
				
			} else if(sscanf(line, "stat %d %s", &jobid, path) == 2) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_STAT;
				sprintf(op->args, "%s", path);
				op->result = -1;
				
			} else if(sscanf(line, "unlink %d %s", &jobid, path) == 2) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_UNLINK;
				sprintf(op->args, "%s", path);
				op->result = -1;
				
			} else if(sscanf(line, "mkdir %d %s %o", &jobid, path, &mode) == 3) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_MKDIR;
				sprintf(op->args, "%s %o", path, mode);
				op->result = -1;
				
			} else if(sscanf(line, "close %d", &jobid) == 1) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_CLOSE;
				op->result = -1;
				
//			} else if(sscanf(line, "symlink %d %s %s", &jobid, path, filename) == 3) {
//			} else if(sscanf(line, "put %d %s %lld %o", &jobid, filename, &length, &mode) == 4) {
//			} else if(sscanf(line, "rget %d %s", &jobid, filename) == 2) {
//			} else if(sscanf(line, "get %d %s", &jobid, filename) == 2) {
//			} else if(sscanf(line, "thirdget %d %d %s %[^\n]", &jobid, &mode, filename, path) == 4) {
//			} else if(sscanf(line, "thirdput %d %d %s %[^\n]", &jobid, &mode, filename, path) == 4) {
			} else if(!strcmp(line, "exit")) {
				break;
			} else {
				abort_flag = 1;
				continue;
			}
			if(op) {
				struct mpi_queue_job *job;
					job = itable_lookup(active_jobs, jobid);
				if(!job) {
					job = itable_lookup(waiting_jobs, jobid);
				}
				if(!job) {
					job = malloc(sizeof(*job));
					memset(job, 0, sizeof(*job));
					job->jobid = jobid;
					job->operations = list_create();
					job->status = MPI_QUEUE_JOB_WAITING;
					job->worker_rank = -1;
					itable_insert(waiting_jobs, jobid, job);
				}
				list_push_tail(job->operations, op);
			}
			idle_stoptime = time(0) + idle_timeout;
		} else {
			link_close(master);
			master = 0;
			sleep(5);
		}
		
		int num_waiting_jobs = itable_size(waiting_jobs);
		int num_unvisited_jobs = itable_size(active_jobs);
		for(i = 1; i < num_workers && (num_unvisited_jobs > 0 || num_waiting_jobs > 0); i++) {
			struct mpi_queue_job *job;
			struct mpi_queue_operation *op;
			int flag = 0;
			UINT64_T jobid;

			if(!workers[i]) {
				if(num_waiting_jobs) {
					itable_firstkey(waiting_jobs);
					itable_nextkey(waiting_jobs, &jobid, (void **)&job);
					itable_remove(waiting_jobs, jobid);
					itable_insert(active_jobs, jobid, job);
					workers[i] = job;
					num_waiting_jobs--;
					job->worker_rank = i;
					job->status = MPI_QUEUE_JOB_READY;
				} else {
					continue;
				}
			} else {
				num_unvisited_jobs--;
				if(workers[i]->status == MPI_QUEUE_JOB_BUSY) {
					MPI_Test(&workers[i]->request, &flag, &workers[i]->mpi_status);
					if(flag) {
						op = list_pop_head(workers[i]->operations);
						if(op->output_length) {
							op->output_buffer = malloc(op->output_length);
							MPI_Recv(op->output_buffer, op->output_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->mpi_status);
						}
						
						workers[i]->status = MPI_QUEUE_JOB_READY;

						if(op->type == MPI_QUEUE_OP_WORK || op->result < 0) {
							if(workers[i]->output)
								free(workers[i]->output);
							workers[i]->output = op->output_buffer;
							op->output_buffer = NULL;
							workers[i]->output_length = op->output_length;
							workers[i]->result = op->result;
							if(op->result < 0) {
								workers[i]->status = MPI_QUEUE_JOB_FAILED | op->type;
								op->type = MPI_QUEUE_OP_CLOSE;
								list_push_head(workers[i]->operations, op);
								op = NULL;
							}
						}
						if(op) {
							if(op->buffer)
								free(op->buffer);
							if(op->output_buffer)
								free(op->output_buffer);
							free(op);
						}
					}
				}
			}
			
			if( workers[i]->status != MPI_QUEUE_JOB_BUSY && list_size(workers[i]->operations)) {
				op = list_peek_head(workers[i]->operations);
				
				if(op->type == MPI_QUEUE_OP_CLOSE) {
					itable_remove(active_jobs, workers[i]->jobid);
					list_push_tail(complete_jobs, workers[i]);
					if(!(workers[i]->status & MPI_QUEUE_JOB_FAILED))
						workers[i]->status = MPI_QUEUE_JOB_COMPLETE;
					workers[i] = NULL;
					i--;
					continue;
				}
				
				MPI_Send(op, sizeof(*op), MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD);
				if(op->buffer_length) {
					MPI_Send(op->buffer, op->buffer_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD);
					free(op->buffer);
					op->buffer_length = 0;
					op->buffer = NULL;
				}
				MPI_Irecv(op, sizeof(*op), MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->request);
				workers[i]->status = MPI_QUEUE_JOB_BUSY;
			}
		}
	}


	/** Clean up waiting & complete jobs, send Exit commands to each worker */
	if(!master) {
		// If the master link hasn't been set up yet
		// the workers will be waiting for the working directory
		char line[MPI_QUEUE_LINE_MAX];
		memset(line, 0, MPI_QUEUE_LINE_MAX);
		MPI_Bcast(line, MPI_QUEUE_LINE_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
	} else {
		link_close(master);
	}

	for(i = 1; i < num_workers; i++) {
		struct mpi_queue_operation *op, close;
		memset(&close, 0, sizeof(close));
		close.type = MPI_QUEUE_OP_EXIT;
		
		if(workers[i]) {
			if(workers[i]->status == MPI_QUEUE_JOB_BUSY) {
				MPI_Wait(&workers[i]->request, &workers[i]->mpi_status);
				op = list_peek_head(workers[i]->operations);
				
				if(op->output_length) {
					op->output_buffer = malloc(op->output_length);
					MPI_Recv(op->output_buffer, op->output_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->mpi_status);
				}
			}
			itable_remove(active_jobs, workers[i]->jobid);
			list_push_tail(complete_jobs, workers[i]);
		}
		MPI_Send(&close, sizeof(close), MPI_BYTE, i, 0, MPI_COMM_WORLD);
	}

	itable_firstkey(waiting_jobs);
	while(itable_size(waiting_jobs)) {
		struct mpi_queue_job *job;
		UINT64_T jobid;

		itable_nextkey(waiting_jobs, &jobid, (void **)&job);
		itable_remove(waiting_jobs, jobid);
		list_push_tail(complete_jobs, job);
	}

	while(list_size(complete_jobs)) {
		mpi_queue_job_delete(list_pop_head(complete_jobs));
	}

	MPI_Finalize();
	return abort_flag;
}