Example #1
static batch_job_id_t batch_job_amazon_batch_submit(struct batch_queue* q, const char* cmd, const char* extra_input_files, const char* extra_output_files, struct jx* envlist, const struct rmsummary* resources){
	struct internal_amazon_batch_amazon_ids amazon_ids = initialize(q);
	char* env_var = amazon_ids.master_env_prefix;

	//We have the access keys; now set up the queues and exec environments, or reuse them if they already exist.
	unsigned int jobid = gen_guid();
	char* job_name = string_format("%s_%u",queue_name,jobid);
	
	//makeflow specifics
	struct batch_job_info *info = malloc(sizeof(*info));
	memset(info, 0, sizeof(*info));
	
	//specs
	int cpus=1;
	long int mem=1000;
	char* img = hash_table_lookup(q->options,"amazon-batch-img");
	int disk = 1000;
	if(resources){
		cpus = resources->cores;
		mem = resources->memory;
		disk = resources->disk;
		cpus = cpus > 1? cpus:1;
		mem = mem > 1000? mem:1000;
		disk = disk > 1000 ? disk : 1000;
	}
	//upload files to S3
	upload_input_files_to_s3((char*)extra_input_files,job_name);	
	upload_cmd_file(bucket_name,(char*)extra_input_files,(char*)extra_output_files,(char*)cmd,jobid);
	
	//create the formatted command string to give to the job
	char* fmt_cmd = string_format("%s aws s3 cp s3://%s/COMAND_FILE_%u.sh ./ && sh ./COMAND_FILE_%u.sh",env_var,bucket_name,jobid,jobid);	

	//combine all properties together
	char* properties_string = string_format("{ \\\"image\\\": \\\"%s\\\", \\\"vcpus\\\": %i, \\\"memory\\\": %li, \\\"privileged\\\":true ,\\\"command\\\": [\\\"sh\\\",\\\"-c\\\",\\\"%s\\\"], \\\"environment\\\":[{\\\"name\\\":\\\"AWS_ACCESS_KEY_ID\\\",\\\"value\\\":\\\"%s\\\"},{\\\"name\\\":\\\"AWS_SECRET_ACCESS_KEY\\\",\\\"value\\\":\\\"%s\\\"},{\\\"name\\\":\\\"REGION\\\",\\\"value\\\":\\\"%s\\\"}] }", img,cpus,mem,fmt_cmd,amazon_ids.aws_access_key_id,amazon_ids.aws_secret_access_key,amazon_ids.aws_region);
	
	char* jaid = aws_submit_job(job_name,properties_string);

	itable_insert(amazon_job_ids,jobid,jaid);
	debug(D_BATCH,"Job %u has amazon id: %s",jobid,jaid);
	itable_insert(done_files,jobid,string_format("%s",extra_output_files));
	debug(D_BATCH,"Job %u successfully submitted",jobid);
	
	//let makeflow know
	info->submitted = time(0);
	info->started = time(0);
	itable_insert(q->job_table, jobid, info);
	
	//cleanup
	free(job_name);
	free(fmt_cmd);
	
	return jobid;
	
}
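
Every submit/wait backend on this page repeats the same job_table bookkeeping: allocate a batch_job_info at submit time, key it by jobid in q->job_table, then remove it and copy it out at wait time. A minimal sketch of the two halves, using only the struct batch_job_info fields and itable calls that appear in these examples (the helper names are hypothetical, not cctools API):

/* Hypothetical helper: record a freshly submitted job. */
static void record_submission(struct batch_queue *q, batch_job_id_t jobid)
{
	struct batch_job_info *info = calloc(1, sizeof(*info));
	info->submitted = time(0);
	itable_insert(q->job_table, jobid, info);
}

/* Hypothetical helper: hand a finished job's info back to the caller. */
static void record_completion(struct batch_queue *q, batch_job_id_t jobid, struct batch_job_info *info_out)
{
	struct batch_job_info *info = itable_remove(q->job_table, jobid);
	if(!info)
		return;
	info->finished = time(0);
	memcpy(info_out, info, sizeof(*info));
	free(info);
}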
Example #2
static int chirp_fuse_open(const char *path, struct fuse_file_info *fi)
{
	static int file_number_counter = 1;
	struct chirp_file *file;
	int mode = 0;

	char newpath[CHIRP_PATH_MAX];
	char host[CHIRP_PATH_MAX];

	parsepath(path, newpath, host);

	pthread_mutex_lock(&mutex);
	file = chirp_global_open(host, newpath, fi->flags, mode, time(0) + chirp_fuse_timeout);
	if(file) {
		int file_number = file_number_counter++;
		itable_insert(file_table, file_number, file);
		fi->fh = file_number;
	}

	pthread_mutex_unlock(&mutex);

	if(!file)
		return -errno;

	return 0;

}
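
The integer stored in fi->fh is the itable key used to recover the chirp_file on later FUSE callbacks. A sketch of the matching release path, under the assumption that chirp_global_close takes the file handle and a stoptime like chirp_global_open does (that function is not shown on this page):

static int chirp_fuse_release(const char *path, struct fuse_file_info *fi)
{
	pthread_mutex_lock(&mutex);
	/* drop the handle from the table so the key cannot point at a stale file */
	struct chirp_file *file = itable_remove(file_table, fi->fh);
	pthread_mutex_unlock(&mutex);

	if(!file)
		return -EBADF;

	chirp_global_close(file, time(0) + chirp_fuse_timeout);	/* signature assumed */
	return 0;
}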
Example #3
int histogram_insert(struct histogram *h, double value) {
	uint64_t bucket = bucket_of(h, value);

	struct box_count *box = itable_lookup(h->buckets, bucket);
	if(!box) {
		box = calloc(1, sizeof(*box));
		itable_insert(h->buckets, bucket, box);
	}

	h->total_count++;
	box->count++;

	int mode_count = histogram_count(h, histogram_mode(h));

	if(value > h->max_value || h->total_count == 1) {
		h->max_value = value;
	}

	if(value < h->min_value || h->total_count == 1) {
		h->min_value = value;
	}

	if(box->count > mode_count) {
		h->mode       = end_of(h, bucket);
	}

	return box->count;
}
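
histogram_insert keeps total_count, min, max, and mode up to date on every call, so reading statistics back requires no extra pass. A short usage sketch; histogram_create(double bucket_size) is an assumed constructor signature, since only the insert and lookup side of the API appears in these examples:

struct histogram *h = histogram_create(10.0);	/* assumed constructor: bucket width 10 */
histogram_insert(h, 12.5);
histogram_insert(h, 17.2);	/* lands in the same width-10 bucket as 12.5 */
histogram_insert(h, 42.0);

/* the mode is the bucket that received the most insertions */
printf("mode %f count %d\n", histogram_mode(h), histogram_count(h, histogram_mode(h)));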
Example #4
INT64_T chirp_alloc_open(const char *path, INT64_T flags, INT64_T mode)
{
	struct alloc_state *a;
	int fd = -1;

	if(!alloc_enabled)
		return cfs->open(path, flags, mode);

	a = alloc_state_cache(path);
	if(a) {
		INT64_T filesize = cfs_file_size(path);
		if(filesize < 0)
			filesize = 0;

		fd = cfs->open(path, flags, mode);
		if(fd >= 0) {
			if(!fd_table)
				fd_table = itable_create(0);
			itable_insert(fd_table, fd, xstrdup(path));
			if(flags & O_TRUNC) {
				alloc_state_update(a, -space_consumed(filesize));
			}
		}
	} else {
		fd = -1;
	}
	return fd;
}
Example #5
batch_job_id_t batch_job_submit_mpi_queue( struct batch_queue *q, const char *cmd, const char *args, const char *infile, const char *outfile, const char *errfile, const char *extra_input_files, const char *extra_output_files )
{
	char *command = string_format("%s %s", cmd, args);
	if(infile) {
		char *new = string_format("%s <%s", command, infile);
		free(command);
		command = new;
	}

	struct mpi_queue_task *t = mpi_queue_task_create(command);
	free(command);

	if(infile)
		mpi_queue_task_specify_file(t, infile, MPI_QUEUE_INPUT);
	if(cmd)
		mpi_queue_task_specify_file(t, cmd, MPI_QUEUE_INPUT);

	specify_mpi_queue_task_files(t, extra_input_files, extra_output_files);

	mpi_queue_submit(q->mpi_queue, t);

	if(outfile) {
		itable_insert(q->output_table, t->taskid, strdup(outfile));
	}

	return t->taskid;
}
Example #6
void histogram_set_bucket(struct histogram *h, double value, int count) {
	uint64_t bucket = bucket_of(h, value);

	struct box_count *box = itable_lookup(h->buckets, bucket);
	if(!box) {
		box = calloc(1, sizeof(*box));
		itable_insert(h->buckets, bucket, box);
	}

	box->count = count;
}
Example #7
static int dag_parse_node(struct lexer *bk)
{
	struct token *t = lexer_next_token(bk);
	if(t->type != TOKEN_FILES)
	{
		lexer_report_error(bk, "Error reading rule.");
	}
	lexer_free_token(t);

	struct dag_node *n;
	n = dag_node_create(bk->d, bk->line_number);

	if(verbose_parsing && bk->d->nodeid_counter % parsing_rule_mod_counter == 0)
	{
		fprintf(stdout, "\rRules parsed: %d", bk->d->nodeid_counter + 1);
		fflush(stdout);
	}

	n->category = bk->category;
	list_push_tail(n->category->nodes, n);

	dag_parse_node_filelist(bk, n);

	bk->environment->node = n;

	/* Read variables, if any */
	while((t = lexer_peek_next_token(bk)) && t->type != TOKEN_COMMAND)
	{
		switch (t->type) {
		case TOKEN_VARIABLE:
			dag_parse_variable(bk, n);
			break;
		default:
			lexer_report_error(bk, "Expected COMMAND or VARIABLE, got: %s", lexer_print_token(t));
			break;
		}
	}

	if(!t)
	{
		lexer_report_error(bk, "Rule does not have a command.\n");
	}

	dag_parse_node_command(bk, n);
	bk->environment->node = NULL;

	n->next = bk->d->nodes;
	bk->d->nodes = n;
	itable_insert(bk->d->node_table, n->nodeid, n);

	debug(D_MAKEFLOW_PARSER, "Setting resource category '%s' for rule %d.\n", n->category->label, n->nodeid);
	dag_node_fill_resources(n);
	dag_node_print_debug_resources(n);

	return 1;
}
Example #8
void histogram_attach_data(struct histogram *h, double value, void *data) {
	uint64_t bucket = bucket_of(h, value);

	struct box_count *box = itable_lookup(h->buckets, bucket);
	if(!box) {
		box = calloc(1, sizeof(*box));
		itable_insert(h->buckets, bucket, box);
	}

	box->data = data;
}
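
Examples #3, #6, and #8 all open with the same lookup-or-create idiom: probe the itable for the bucket's box, and allocate a zeroed one on first touch. Factored out as a hypothetical helper (not in cctools) for clarity:

/* Hypothetical helper: find the box for a bucket, creating it on first use. */
static struct box_count *box_of(struct histogram *h, uint64_t bucket)
{
	struct box_count *box = itable_lookup(h->buckets, bucket);
	if(!box) {
		box = calloc(1, sizeof(*box));
		itable_insert(h->buckets, bucket, box);
	}
	return box;
}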
Example #9
static int submit_workers( struct batch_queue *queue, struct itable *job_table, int count )
{
	int i;
	for(i=0;i<count;i++) {
		int jobid = submit_worker(queue);
		if(jobid>0) {
			debug(D_WQ,"worker job %d submitted",jobid);
			itable_insert(job_table,jobid,(void*)1);
		} else {
			break;
		}
	}
	return i;
}
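
submit_workers stores the sentinel (void*)1 rather than a real payload, so job_table acts purely as a set of live worker jobids. A matching teardown sketch, reusing the batch_job_remove call seen in Example #20 (the helper name is hypothetical):

/* Sketch: remove every worker job recorded in job_table. */
static void remove_workers(struct batch_queue *queue, struct itable *job_table)
{
	uint64_t jobid;
	void *value;

	itable_firstkey(job_table);
	while(itable_nextkey(job_table, &jobid, &value)) {
		debug(D_WQ, "removing worker job %" PRIu64, jobid);
		batch_job_remove(queue, jobid);
	}
}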
Example #10
static batch_job_id_t batch_job_dryrun_submit (struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	FILE *log;
	char *escaped_cmd;
	char *env_assignment;
	char *escaped_env_assignment;
	struct batch_job_info *info;
	batch_job_id_t jobid = random();

	fflush(NULL);

	debug(D_BATCH, "started dry run of job %" PRIbjid ": %s", jobid, cmd);

	if ((log = fopen(q->logfile, "a"))) {
		if (!(info = calloc(1, sizeof(*info)))) {
			fclose(log);
			return -1;
		}
		info->submitted = time(0);
		info->started = time(0);
		itable_insert(q->job_table, jobid, info);

		if(envlist && jx_istype(envlist, JX_OBJECT) && envlist->u.pairs) {
			struct jx_pair *p;
			fprintf(log, "env ");
			for(p=envlist->u.pairs;p;p=p->next) {
				if(p->key->type==JX_STRING && p->value->type==JX_STRING) {
					env_assignment = string_format("%s=%s", p->key->u.string_value,p->value->u.string_value);
					escaped_env_assignment = string_escape_shell(env_assignment);
					fprintf(log, "%s", escaped_env_assignment);
					fprintf(log, " ");
					free(env_assignment);
					free(escaped_env_assignment);
				}
			}
		}
		escaped_cmd = string_escape_shell(cmd);
		fprintf(log, "sh -c %s\n", escaped_cmd);
		free(escaped_cmd);
		fclose(log);
		return jobid;
	} else {
		return -1;
	}
}
Example #11
static int batch_job_amazon_batch_remove(struct batch_queue *q, batch_job_id_t jobid){
	struct internal_amazon_batch_amazon_ids amazon_ids = initialize(q);
	char* env_var = amazon_ids.master_env_prefix; 
	if(itable_lookup(done_jobs,jobid+1)==NULL){
		char* name = string_format("%s_%i",queue_name,(int)jobid);
		itable_insert(done_jobs,jobid+1,name);
	}
	char* amazon_id;
	if((amazon_id=itable_lookup(amazon_job_ids,jobid))==NULL){
		return -1;
	}
	char* cmd = string_format("%s aws batch terminate-job --job-id %s --reason \"Makeflow Killed\"",env_var,amazon_id);
	debug(D_BATCH,"Terminating the job: %s\n",cmd);
	sh_system(cmd);
	free(cmd);
	return 0;
	
}
Example #12
static int itable_double_buckets(struct itable *h)
{
	struct itable *hn = itable_create(2 * h->bucket_count);

	if(!hn)
		return 0;

	/* Move pairs to new hash */
	uint64_t key;
	void *value;
	itable_firstkey(h);
	while(itable_nextkey(h, &key, &value))
		if(!itable_insert(hn, key, value))
		{
			itable_delete(hn);
			return 0;
		}

	/* Delete all old pairs */
	struct entry *e, *f;
	int i;
	for(i = 0; i < h->bucket_count; i++) {
		e = h->buckets[i];
		while(e) {
			f = e->next;
			free(e);
			e = f;
		}
	}

	/* Make the old point to the new */
	free(h->buckets);
	h->buckets      = hn->buckets;
	h->bucket_count = hn->bucket_count;
	h->size         = hn->size;
	
	/* Delete reference to new, so old is safe */
	free(hn);

	return 1;
}
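
The resize works because itable_insert simply rehashes each key into the new bucket array; keys and values themselves are untouched. For reference, a round-trip through the same API used everywhere on this page (itable_create(0) is taken to request a default bucket count, as in the other examples):

struct itable *t = itable_create(0);	/* 0 appears to select a default size */
itable_insert(t, 42, "payload");
char *hit  = itable_lookup(t, 42);	/* -> "payload" */
char *gone = itable_remove(t, 42);	/* -> "payload"; the key is now absent */
itable_delete(t);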
Example #13
batch_job_id_t batch_job_submit_simple_local(struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files)
{
    batch_job_id_t jobid;

    fflush(NULL);
    jobid = fork();
    if(jobid > 0) {
        debug(D_BATCH, "started process %" PRIbjid ": %s", jobid, cmd);
        struct batch_job_info *info = malloc(sizeof(*info));
        memset(info, 0, sizeof(*info));
        info->submitted = time(0);
        info->started = time(0);
        itable_insert(q->job_table, jobid, info);
        return jobid;
    } else if(jobid < 0) {
        debug(D_BATCH, "couldn't create new process: %s\n", strerror(errno));
        return -1;
    } else {
        /** The following code works but would duplicate the current process because of the system() function.
        int result = system(cmd);
        if(WIFEXITED(result)) {
        	_exit(WEXITSTATUS(result));
        } else {
        	_exit(1);
        }*/

        /** A note from "man system 3" as of Jan 2012:
         * Do not use system() from a program with set-user-ID or set-group-ID
         * privileges, because strange values for some environment variables
         * might be used to subvert system integrity. Use the exec(3) family of
         * functions instead, but not execlp(3) or execvp(3). system() will
         * not, in fact, work properly from programs with set-user-ID or
         * set-group-ID privileges on systems on which /bin/sh is bash version
         * 2, since bash 2 drops privileges on startup. (Debian uses a modified
         * bash which does not do this when invoked as sh.)
         */
        execlp("sh", "sh", "-c", cmd, (char *) 0);
        _exit(127);	// Failed to execute the cmd.
    }
    return -1;
}
Example #14
struct list *makeflow_wrapper_generate_files( struct list *result, struct list *input, struct dag_node *n, struct makeflow_wrapper *w)
{
	char *f;
	char *nodeid = string_format("%d",n->nodeid);

	struct list *files = list_create();

	list_first_item(input);
	while((f = list_next_item(input)))
	{
		char *filename = string_replace_percents(f, nodeid);
		char *f = xxstrdup(filename);
		free(filename);

		char *remote, *p;
		struct dag_file *file;
		p = strchr(f, '=');
		if(p) {
			*p = 0;
			file = dag_file_lookup_or_create(n->d, f);
			if(!n->local_job && !itable_lookup(w->remote_names, (uintptr_t) file)){
				remote = xxstrdup(p+1);
				itable_insert(w->remote_names, (uintptr_t) file, (void *)remote);
				hash_table_insert(w->remote_names_inv, remote, (void *)file);
			}
			*p = '=';
		} else {
			file = dag_file_lookup_or_create(n->d, f);
		}
		free(f);
		list_push_tail(files, file);
	}
	free(nodeid);

	result = list_splice(result, files);

	return result;
}
Example #15
File: dag.c  Project: dcbradley/cctools
/* Adds remotename to the local name filename in the namespace of
 * the given node. If remotename is NULL, then a new name is
 * found using dag_node_translate_filename. If the remotename
 * given is different from a previously specified one, a warning is
 * written to the debug output, but otherwise this is ignored. */
const char *dag_node_add_remote_name(struct dag_node *n, const char *filename, const char *remotename)
{
	char *oldname;
	struct dag_file *f = dag_file_from_name(n->d, filename);

	if(!f)
		fatal("trying to add remote name %s to unknown file %s.\n", remotename, filename);

	if(!remotename)
		remotename = dag_node_translate_filename(n, filename);
	else
		remotename = xxstrdup(remotename);

	oldname = hash_table_lookup(n->remote_names_inv, remotename);

	if(oldname && strcmp(oldname, filename) == 0)
		debug(D_DEBUG, "Remote name %s for %s already in use for %s\n", remotename, filename, oldname);

	itable_insert(n->remote_names, (uintptr_t) f, remotename);
	hash_table_insert(n->remote_names_inv, remotename, (void *) f);

	return remotename;
}
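
The function maintains the mapping in both directions at once: n->remote_names is an itable keyed by the dag_file pointer, and n->remote_names_inv is a hash_table keyed by the remote name string. A two-line sketch of resolving each direction, assuming those container roles as used above:

const char *remote = itable_lookup(n->remote_names, (uintptr_t) f);		/* file -> remote name */
struct dag_file *back = hash_table_lookup(n->remote_names_inv, remote);	/* remote name -> file */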
Example #16
struct cluster *nearest_neighbor_clustering(struct list *initial_clusters, double (*cmp)(struct cluster *, struct cluster *))
{
	struct cluster *top, *closest, *subtop;
	struct list   *stack;
	struct itable *active_clusters;
	double dclosest, dsubtop;

	int merge = 0;

	list_first_item(initial_clusters);
	top = list_next_item(initial_clusters);

	/* Return immediately if top is NULL, or there is a unique
	 * initial cluster */
	if(list_size(initial_clusters) < 2)
		return top;

	stack = list_create(0);
	list_push_head(stack, top);

	/* Add all of the initial clusters as active clusters. */
	active_clusters = itable_create(0);
	while( (top = list_next_item(initial_clusters)) ) 
		itable_insert(active_clusters, (uintptr_t) top, (void *) top);

	do
	{
		/* closest might be NULL if all of the clusters are in
		 * the stack now. subtop might be NULL if top was the
		 * only cluster in the stack */
		top     = list_pop_head( stack );
		closest = cluster_nearest_neighbor(active_clusters, top, cmp);
		subtop  = list_peek_head( stack );

		dclosest = -1;
		dsubtop  = -1;

		if(closest)
			dclosest = cluster_ward_distance(top, closest);

		if(subtop)
			dsubtop = cluster_ward_distance(top, subtop);

		/* The nearest neighbor of top is either one of the
		 * remaining active clusters, or the second topmost
		 * cluster in the stack */
		if( closest && subtop )
		{
			/* Use pointer address to systematically break ties. */
			if(dclosest < dsubtop || ((dclosest == dsubtop) && (uintptr_t)closest < (uintptr_t)subtop)) 
				merge = 0;
			else 
				merge = 1;
		}
		else if( subtop )
			merge = 1;
		else if( closest )
			merge = 0;
		else
			fatal("Zero clusters?\n"); //We should never reach here.

		if(merge)
		{
			/* If the two topmost clusters in the stack are
			 * mutual nearest neighbors, merge them into a single
			 * cluster */
			subtop = list_pop_head( stack );
			list_push_head(stack, cluster_merge(top, subtop));
		}
		else
		{
			/* Otherwise, push the nearest neighbor of top to the
			 * stack */
			itable_remove(active_clusters, (uintptr_t) closest);
			list_push_head(stack, top);
			list_push_head(stack, closest);
		}

		debug(D_DEBUG, "stack: %d  active: %d  closest: %lf subtop: %lf\n", 
				list_size(stack), itable_size(active_clusters), dclosest, dsubtop);

		/* If there are no more active_clusters, but there is not
		 * a single cluster in the stack, we try again,
		 * converting the clusters in the stack into new active
		 * clusters. */
		if(itable_size(active_clusters) == 0 && list_size(stack) > 3)
		{
			itable_delete(active_clusters);
			return nearest_neighbor_clustering(stack, cmp);
		}

	}while( !(itable_size(active_clusters) == 0 && list_size(stack) == 1) );

	/* top is now the root of a cluster hierarchy, of
	 * cluster->right, cluster->left. */
	top = list_pop_head(stack);

	list_delete(stack);
	itable_delete(active_clusters);

	return top;
}
Example #17
int dag_parse_node(struct lexer_book *bk, char *line_org)
{
	struct dag *d = bk->d;
	char *line;
	char *outputs = NULL;
	char *inputs = NULL;
	struct dag_node *n;

	n = dag_node_create(bk->d, bk->line_number);

	n->category = bk->category;
	list_push_tail(n->category->nodes, n);

	line = xxstrdup(line_org);

	outputs = line;

	inputs = strchr(line, ':');
	if(!inputs) {
		dag_parse_error(bk, "node");
		free(line);
		return 0;
	}
	*inputs = 0;
	inputs = inputs + 1;

	inputs = string_trim_spaces(inputs);
	outputs = string_trim_spaces(outputs);

	dag_parse_node_filelist(bk, n, outputs, 0);
	dag_parse_node_filelist(bk, n, inputs, 1);

	int ok;
	char *comment;
	//parse variables and comments
	while((line = dag_parse_readline(bk, n)) != NULL) {
		if(line[0] == '@' && strchr(line, '=')) {
			ok = dag_parse_variable(bk, n, line);
			free(line);

			if(ok) {
				continue;
			} else {
				dag_parse_error(bk, "node variable");
				return 0;
			}
		}

		comment = strchr(line, '#');
		if(comment)
		{
			*comment = '\0';
			int leading = strspn(line, " \t");
			int len = strlen(line);
			*comment = '#';

			/* make sure that only spaces and tabs appear before the hash */
			if(leading == len) {
				continue;
			}
		}

		/* not a comment or a variable, so we break to parse the command */
		break;
	}

	ok = dag_parse_node_command(bk, n, line);
	free(line);

	if(ok) {
		n->next = d->nodes;
		d->nodes = n;
		itable_insert(d->node_table, n->nodeid, n);
	} else {
		dag_parse_error(bk, "node command");
		return 0;
	}


	debug(D_DEBUG, "Setting resource category '%s' for rule %d.\n", n->category->label, n->nodeid);
	dag_task_fill_resources(n);
	dag_task_print_debug_resources(n);

	return 1;
}
Example #18
static batch_job_id_t batch_job_condor_submit (struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	FILE *file;
	int njobs;
	int jobid;
	const char *options = hash_table_lookup(q->options, "batch-options");

	if(setup_condor_wrapper("condor.sh") < 0) {
		debug(D_BATCH, "could not create condor.sh: %s", strerror(errno));
		return -1;
	}

	if(!string_istrue(hash_table_lookup(q->options, "skip-afs-check"))) {
		char *cwd = path_getcwd();
		if(!strncmp(cwd, "/afs", 4)) {
			debug(D_NOTICE|D_BATCH, "The working directory is '%s':", cwd);
			debug(D_NOTICE|D_BATCH, "This won't work because Condor is not able to write to files in AFS.");
			debug(D_NOTICE|D_BATCH, "Instead, run makeflow from a local disk like /tmp.");
			debug(D_NOTICE|D_BATCH, "Or, use the Work Queue with -T wq and condor_submit_workers.");
			free(cwd);
			exit(EXIT_FAILURE);
		}
		free(cwd);
	}

	file = fopen("condor.submit", "w");
	if(!file) {
		debug(D_BATCH, "could not create condor.submit: %s", strerror(errno));
		return -1;
	}

	fprintf(file, "universe = vanilla\n");
	fprintf(file, "executable = condor.sh\n");
	char *escaped = string_escape_condor(cmd);
	fprintf(file, "arguments = %s\n", escaped);
	free(escaped);
	if(extra_input_files)
		fprintf(file, "transfer_input_files = %s\n", extra_input_files);
	// Note that we do not use transfer_output_files, because that causes the job
	// to get stuck in a system hold if the files are not created.
	fprintf(file, "should_transfer_files = yes\n");
	fprintf(file, "when_to_transfer_output = on_exit\n");
	fprintf(file, "notification = never\n");
	fprintf(file, "copy_to_spool = true\n");
	fprintf(file, "transfer_executable = true\n");
	fprintf(file, "keep_claim_idle = 30\n");
	fprintf(file, "log = %s\n", q->logfile);

	const char *c_req = batch_queue_get_option(q, "condor-requirements");
	char *bexp = blacklisted_expression(q);

	if(c_req && bexp) {
		fprintf(file, "requirements = %s && %s\n", c_req, bexp);
	} else if(c_req) {
		fprintf(file, "requirements = %s\n", c_req);
	} else if(bexp) {
		fprintf(file, "requirements = %s\n", bexp);
	}

	if(bexp)
		free(bexp);

	/*
	Getting environment variables formatted for a condor submit
	file is very hairy, due to some strange quoting rules.
	To avoid problems, we simply export vars to the environment,
	and then tell condor getenv=true, which pulls in the environment.
	*/

	fprintf(file, "getenv = true\n");

	if(envlist) {
		jx_export(envlist);
	}

	if(options)
		fprintf(file, "%s\n", options);

	/* set the same defaults as condor_submit_workers */
	int64_t cores  = 1;
	int64_t memory = 1024;
	int64_t disk   = 1024;

	if(resources) {
		cores  = resources->cores  > -1 ? resources->cores  : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk   = resources->disk   > -1 ? resources->disk   : disk;
	}

	/* convert disk to KB */
	disk *= 1024;

	if(batch_queue_get_option(q, "autosize")) {
		fprintf(file, "request_cpus   = ifThenElse(%" PRId64 " > TotalSlotCpus, %" PRId64 ", TotalSlotCpus)\n", cores, cores);
		fprintf(file, "request_memory = ifThenElse(%" PRId64 " > TotalSlotMemory, %" PRId64 ", TotalSlotMemory)\n", memory, memory);
		fprintf(file, "request_disk   = ifThenElse((%" PRId64 ") > TotalSlotDisk, (%" PRId64 "), TotalSlotDisk)\n", disk, disk);
	}
	else {
		fprintf(file, "request_cpus = %" PRId64 "\n", cores);
		fprintf(file, "request_memory = %" PRId64 "\n", memory);
		fprintf(file, "request_disk = %" PRId64 "\n", disk);
	}

	fprintf(file, "queue\n");
	fclose(file);

	file = popen("condor_submit condor.submit", "r");
	if(!file)
		return -1;

	char line[BATCH_JOB_LINE_MAX];
	while(fgets(line, sizeof(line), file)) {
		if(sscanf(line, "%d job(s) submitted to cluster %d", &njobs, &jobid) == 2) {
			pclose(file);
			debug(D_BATCH, "job %d submitted to condor", jobid);
			struct batch_job_info *info;
			info = malloc(sizeof(*info));
			memset(info, 0, sizeof(*info));
			info->submitted = time(0);
			itable_insert(q->job_table, jobid, info);
			return jobid;
		}
	}

	pclose(file);
	debug(D_BATCH, "failed to submit job to condor!");
	return -1;
}
Example #19
static batch_job_id_t batch_job_condor_wait (struct batch_queue * q, struct batch_job_info * info_out, time_t stoptime)
{
	static FILE *logfile = 0;

	if(!logfile) {
		logfile = fopen(q->logfile, "r");
		if(!logfile) {
			debug(D_NOTICE, "couldn't open logfile %s: %s\n", q->logfile, strerror(errno));
			return -1;
		}
	}

	while(1) {
		/*
		   Note: clearerr is necessary to clear any cached end-of-file condition,
		   otherwise some implementations of fgets (i.e. darwin) will read to end
		   of file once and then never look for any more data.
		 */

		clearerr(logfile);

		char line[BATCH_JOB_LINE_MAX];
		while(fgets(line, sizeof(line), logfile)) {
			int type, proc, subproc;
			batch_job_id_t jobid;
			time_t current;
			struct tm tm;

			struct batch_job_info *info;
			int logcode, exitcode;

			if(sscanf(line, "%d (%" SCNbjid ".%d.%d) %d/%d %d:%d:%d", &type, &jobid, &proc, &subproc, &tm.tm_mon, &tm.tm_mday, &tm.tm_hour, &tm.tm_min, &tm.tm_sec) == 9) {
				tm.tm_year = 2008 - 1900;
				tm.tm_isdst = 0;

				current = mktime(&tm);

				info = itable_lookup(q->job_table, jobid);
				if(!info) {
					info = malloc(sizeof(*info));
					memset(info, 0, sizeof(*info));
					itable_insert(q->job_table, jobid, info);
				}

				debug(D_BATCH, "line: %s", line);

				if(type == 0) {
					info->submitted = current;
				} else if(type == 1) {
					info->started = current;
					debug(D_BATCH, "job %" PRIbjid " running now", jobid);
				} else if(type == 9) {
					itable_remove(q->job_table, jobid);

					info->finished = current;
					info->exited_normally = 0;
					info->exit_signal = SIGKILL;

					debug(D_BATCH, "job %" PRIbjid " was removed", jobid);

					memcpy(info_out, info, sizeof(*info));
					free(info);
					return jobid;
				} else if(type == 5) {
					itable_remove(q->job_table, jobid);

					info->finished = current;

					if(!fgets(line, sizeof(line), logfile))
						line[0] = 0;
					if(sscanf(line, " (%d) Normal termination (return value %d)", &logcode, &exitcode) == 2) {
						debug(D_BATCH, "job %" PRIbjid " completed normally with status %d.", jobid, exitcode);
						info->exited_normally = 1;
						info->exit_code = exitcode;
					} else if(sscanf(line, " (%d) Abnormal termination (signal %d)", &logcode, &exitcode) == 2) {
						debug(D_BATCH, "job %" PRIbjid " completed abnormally with signal %d.", jobid, exitcode);
						info->exited_normally = 0;
						info->exit_signal = exitcode;
					} else {
						debug(D_BATCH, "job %" PRIbjid " completed with unknown status.", jobid);
						info->exited_normally = 0;
						info->exit_signal = 0;
					}

					memcpy(info_out, info, sizeof(*info));
					free(info);
					return jobid;
				}
			}
		}


		if(itable_size(q->job_table) <= 0)
			return 0;

		if(stoptime != 0 && time(0) >= stoptime)
			return -1;

		if(process_pending())
			return -1;

		sleep(1);
	}

	return -1;
}
Example #20
int main( int argc, char *argv[] )
{
	signed char c;

	const char *progname = "wavefront";

	debug_config(progname);

	progress_log_file = stdout;

	struct option long_options[] = {
		{"help",  no_argument, 0, 'h'},
		{"version", no_argument, 0, 'v'},
		{"debug", required_argument, 0, 'd'},
		{"jobs", required_argument, 0, 'n'},
		{"block-size", required_argument, 0, 'b'},
		{"debug-file", required_argument, 0, 'o'},
		{"log-file", required_argument, 0, 'l'},
		{"bitmap", required_argument, 0, 'B'},
		{"bitmap-interval", required_argument, 0, 'i'},
		{"auto", no_argument, 0, 'A'},
		{"local", no_argument, 0, 'L'},
		{"batch-type", required_argument, 0, 'T'},
		{"verify", no_argument, 0, 'V'},
		{0,0,0,0}
	};

	while((c=getopt_long(argc,argv,"n:b:d:o:l:B:i:qALDT:VX:Y:vh", long_options, NULL)) > -1) {
		switch(c) {
			case 'n':
				manual_max_jobs_running = atoi(optarg);
				break;
			case 'b':
				manual_block_size = atoi(optarg);
				break;
			case 'd':
				debug_flags_set(optarg);
				break;
			case 'o':
				debug_config_file(optarg);
				break;
			case 'B':
				progress_bitmap_file = optarg;
				break;
			case 'i':
				progress_bitmap_interval = atoi(optarg);
				break;
			case 'l':
				progress_log_file = fopen(optarg,"w");
				if(!progress_log_file) {
					fprintf(stderr,"couldn't open %s: %s\n",optarg,strerror(errno));
					return 1;
				}
				break;
			case 'A':
				wavefront_mode = WAVEFRONT_MODE_AUTO;
				break;
			case 'L':
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
				break;
			case 'T':
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
				batch_system_type = batch_queue_type_from_string(optarg);
				if(batch_system_type==BATCH_QUEUE_TYPE_UNKNOWN) {
					fprintf(stderr,"unknown batch system type: %s\n",optarg);
					exit(1);
				}
				break;
			case 'V':
				verify_mode = 1;
				break;
			case 'X':
				xstart = atoi(optarg);
				break;
			case 'Y':
				ystart = atoi(optarg);
				break;
			case 'v':
				cctools_version_print(stdout, progname);
				exit(0);
				break;
			case 'h':
				show_help(progname);
				exit(0);
				break;
		}
	}

	cctools_version_debug(D_DEBUG, argv[0]);

	if( (argc-optind<3) ) {
		show_help(progname);
		exit(1);
	}

	function = argv[optind];
	xsize=atoi(argv[optind+1]);
	ysize=atoi(argv[optind+2]);
	total_cells = xsize*ysize;

	if(!verify_mode && !check_configuration(function,xsize,ysize)) exit(1);

	int ncpus = load_average_get_cpus();

	if(wavefront_mode!=WAVEFRONT_MODE_MULTICORE) {
		double task_time = measure_task_time();
		printf("Each function takes %.02lfs to run.\n",task_time);

		block_size = find_best_block_size(xsize,1000,2,task_time,average_dispatch_time);
		double distributed_time = wavefront_distributed_model(xsize,1000,2,task_time,block_size,average_dispatch_time);
		double multicore_time = wavefront_multicore_model(xsize,ncpus,task_time);
		double ideal_multicore_time = wavefront_multicore_model(xsize,xsize,task_time);
		double sequential_time = wavefront_multicore_model(xsize,1,task_time);

		printf("---------------------------------\n");
		printf("This workload would take:\n");
		printf("%.02lfs sequentially\n",sequential_time);
		printf("%.02lfs on this %d-core machine\n",multicore_time,ncpus);
		printf("%.02lfs on a %d-core machine\n",ideal_multicore_time,xsize);
		printf("%.02lfs on a 1000-node distributed system with block size %d\n",distributed_time,block_size);
		printf("---------------------------------\n");

		if(wavefront_mode==WAVEFRONT_MODE_AUTO) {
			if(multicore_time < distributed_time*2) {
				wavefront_mode = WAVEFRONT_MODE_MULTICORE;
			} else {
				wavefront_mode = WAVEFRONT_MODE_DISTRIBUTED;
			}
		}
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		batch_system_type = BATCH_QUEUE_TYPE_LOCAL;
		max_jobs_running = ncpus;
	} else {
		max_jobs_running = 1000;
	}

	if(manual_block_size!=0) {
		block_size = manual_block_size;
	}

	if(manual_max_jobs_running!=0) {
		max_jobs_running = manual_max_jobs_running;
	}

	if(wavefront_mode==WAVEFRONT_MODE_MULTICORE) {
		printf("Running in multicore mode with %d CPUs.\n",max_jobs_running);
	} else {
		printf("Running in distributed mode with block size %d on up to %d CPUs\n",block_size,max_jobs_running);
	}

	batch_q = batch_queue_create(batch_system_type);

	if(verify_mode) exit(0);

	struct bitmap * b = bitmap_create(xsize+1,ysize+1);
	struct list *ready_list = list_create();
	struct itable *running_table = itable_create(0);

	struct batch_job_info info;
	UINT64_T jobid;
	struct wavefront_task *task;

	wavefront_task_initialize(b,ready_list);

	printf("Starting workload...\n");

	fprintf(progress_log_file,"# elapsed time : waiting jobs / running jobs / cells complete (percent complete)\n");

	while(1) {

		if(abort_mode) {
			while((task=list_pop_tail(ready_list))) {
				wavefront_task_delete(task);
			}

			itable_firstkey(running_table);
			while(itable_nextkey(running_table,&jobid,(void**)&task)) {
				batch_job_remove(batch_q,jobid);
			}
		}

		if(list_size(ready_list)==0 && itable_size(running_table)==0) break;

		while(1) {
			if(itable_size(running_table)>=max_jobs_running) break;

			task = list_pop_tail(ready_list);
			if(!task) break;
			
			jobid = wavefront_task_submit(task);
			if(jobid>0) {
				itable_insert(running_table,jobid,task);
				wavefront_task_mark_range(task,b,WAVEFRONT_TASK_STATE_RUNNING);
			} else {
				/* submission failed; wait a moment and requeue the task */
				debug(D_NOTICE,"job submission failed; retrying");
				sleep(1);
				list_push_head(ready_list,task);
			}
		}


		save_status(b,ready_list,running_table);

		jobid = batch_job_wait(batch_q,&info);
		if(jobid>0) {
			task = itable_remove(running_table,jobid);
			if(task) {
				if(info.exited_normally && info.exit_code==0) {
					total_dispatch_time += info.started-info.submitted;
					total_execute_time += MAX(info.finished-info.started,1);
					total_cells_complete+=task->width*task->height;
					total_jobs_complete++;

					average_dispatch_time = 1.0*total_dispatch_time / total_jobs_complete;
					average_task_time = 1.0*total_execute_time / total_cells_complete;

					wavefront_task_complete(b,ready_list,task);
				} else {
					printf("job %" PRIu64 " failed, aborting this workload\n",jobid);
					abort_mode = 1;
				}
			}
		}
	}

	save_status(b,ready_list,running_table);

	if(abort_mode) {
		printf("Workload was aborted.\n");
	} else {
		printf("Workload complete.\n");
	}

	return 0;
}
Example #21
int master_main(const char *host, int port, const char *addr) {
	time_t idle_stoptime;
	struct link *master = NULL;
	int num_workers, i;
	struct mpi_queue_job **workers;

	struct itable *active_jobs = itable_create(0);
	struct itable *waiting_jobs = itable_create(0);
	struct list   *complete_jobs = list_create();

	MPI_Comm_size(MPI_COMM_WORLD, &num_workers);

	workers = malloc(num_workers * sizeof(*workers));
	memset(workers, 0, num_workers * sizeof(*workers));	
	
	idle_stoptime = time(0) + idle_timeout;

	while(!abort_flag) {
		char line[MPI_QUEUE_LINE_MAX];

		if(time(0) > idle_stoptime) {
			if(master) {
				printf("mpi master: gave up after waiting %ds to receive a task.\n", idle_timeout);
			} else {
				printf("mpi master: gave up after waiting %ds to connect to %s port %d.\n", idle_timeout, host, port);
			}
			break;
		}


		if(!master) {
			char working_dir[MPI_QUEUE_LINE_MAX];
			master = link_connect(addr, port, idle_stoptime);
			if(!master) {
				sleep(5);
				continue;
			}

			link_tune(master, LINK_TUNE_INTERACTIVE);
			
			link_readline(master, line, sizeof(line), time(0) + active_timeout);

			memset(working_dir, 0, MPI_QUEUE_LINE_MAX);
			if(sscanf(line, "workdir %s", working_dir) == 1) {
				MPI_Bcast(working_dir, MPI_QUEUE_LINE_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
			} else {
				link_close(master);
				master = NULL;
				continue;
			}
		}
		
		if(link_readline(master, line, sizeof(line), time(0) + short_timeout)) {
			struct mpi_queue_operation *op;
			int jobid, mode;
			INT64_T length;
			char path[MPI_QUEUE_LINE_MAX];
			op = NULL;
			
			debug(D_MPI, "received: %s\n", line);

			if(!strcmp(line, "get results")) {
				struct mpi_queue_job *job;
				debug(D_MPI, "results requested: %d available\n", list_size(complete_jobs));
				link_putfstring(master, "num results %d\n", time(0) + active_timeout, list_size(complete_jobs));
				while(list_size(complete_jobs)) {
					job = list_pop_head(complete_jobs);
					link_putfstring(master, "result %d %d %d %lld\n", time(0) + active_timeout, job->jobid, job->status, job->result, job->output_length);
					if(job->output_length) {
						link_write(master, job->output, job->output_length, time(0)+active_timeout);
					}
					mpi_queue_job_delete(job);
				}

			} else if(sscanf(line, "work %d %lld", &jobid, &length) == 2) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_WORK;
				op->buffer_length = length+1;
				op->buffer = malloc(length+1);
				op->buffer[length] = 0;
				link_read(master, op->buffer, length, time(0) + active_timeout);
				op->result = -1;
				
			} else if(sscanf(line, "stat %d %s", &jobid, path) == 2) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_STAT;
				sprintf(op->args, "%s", path);
				op->result = -1;
				
			} else if(sscanf(line, "unlink %d %s", &jobid, path) == 2) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_UNLINK;
				sprintf(op->args, "%s", path);
				op->result = -1;
				
			} else if(sscanf(line, "mkdir %d %s %o", &jobid, path, &mode) == 3) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_MKDIR;
				sprintf(op->args, "%s %o", path, mode);
				op->result = -1;
				
			} else if(sscanf(line, "close %d", &jobid) == 1) {
				op = malloc(sizeof(*op));
				memset(op, 0, sizeof(*op));
				op->type = MPI_QUEUE_OP_CLOSE;
				op->result = -1;
				
//			} else if(sscanf(line, "symlink %d %s %s", &jobid, path, filename) == 3) {
//			} else if(sscanf(line, "put %d %s %lld %o", &jobid, filename, &length, &mode) == 4) {
//			} else if(sscanf(line, "rget %d %s", &jobid, filename) == 2) {
//			} else if(sscanf(line, "get %d %s", &jobid, filename) == 2) {
//			} else if(sscanf(line, "thirdget %d %d %s %[^\n]", &jobid, &mode, filename, path) == 4) {
//			} else if(sscanf(line, "thirdput %d %d %s %[^\n]", &jobid, &mode, filename, path) == 4) {
			} else if(!strcmp(line, "exit")) {
				break;
			} else {
				abort_flag = 1;
				continue;
			}
			if(op) {
				struct mpi_queue_job *job;
				job = itable_lookup(active_jobs, jobid);
				if(!job) {
					job = itable_lookup(waiting_jobs, jobid);
				}
				if(!job) {
					job = malloc(sizeof(*job));
					memset(job, 0, sizeof(*job));
					job->jobid = jobid;
					job->operations = list_create();
					job->status = MPI_QUEUE_JOB_WAITING;
					job->worker_rank = -1;
					itable_insert(waiting_jobs, jobid, job);
				}
				list_push_tail(job->operations, op);
			}
			idle_stoptime = time(0) + idle_timeout;
		} else {
			link_close(master);
			master = 0;
			sleep(5);
		}
		
		int num_waiting_jobs = itable_size(waiting_jobs);
		int num_unvisited_jobs = itable_size(active_jobs);
		for(i = 1; i < num_workers && (num_unvisited_jobs > 0 || num_waiting_jobs > 0); i++) {
			struct mpi_queue_job *job;
			struct mpi_queue_operation *op;
			int flag = 0;
			UINT64_T jobid;

			if(!workers[i]) {
				if(num_waiting_jobs) {
					itable_firstkey(waiting_jobs);
					itable_nextkey(waiting_jobs, &jobid, (void **)&job);
					itable_remove(waiting_jobs, jobid);
					itable_insert(active_jobs, jobid, job);
					workers[i] = job;
					num_waiting_jobs--;
					job->worker_rank = i;
					job->status = MPI_QUEUE_JOB_READY;
				} else {
					continue;
				}
			} else {
				num_unvisited_jobs--;
				if(workers[i]->status == MPI_QUEUE_JOB_BUSY) {
					MPI_Test(&workers[i]->request, &flag, &workers[i]->mpi_status);
					if(flag) {
						op = list_pop_head(workers[i]->operations);
						if(op->output_length) {
							op->output_buffer = malloc(op->output_length);
							MPI_Recv(op->output_buffer, op->output_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->mpi_status);
						}
						
						workers[i]->status = MPI_QUEUE_JOB_READY;

						if(op->type == MPI_QUEUE_OP_WORK || op->result < 0) {
							if(workers[i]->output)
								free(workers[i]->output);
							workers[i]->output = op->output_buffer;
							op->output_buffer = NULL;
							workers[i]->output_length = op->output_length;
							workers[i]->result = op->result;
							if(op->result < 0) {
								workers[i]->status = MPI_QUEUE_JOB_FAILED | op->type;
								op->type = MPI_QUEUE_OP_CLOSE;
								list_push_head(workers[i]->operations, op);
								op = NULL;
							}
						}
						if(op) {
							if(op->buffer)
								free(op->buffer);
							if(op->output_buffer)
								free(op->output_buffer);
							free(op);
						}
					}
				}
			}
			
			if( workers[i]->status != MPI_QUEUE_JOB_BUSY && list_size(workers[i]->operations)) {
				op = list_peek_head(workers[i]->operations);
				
				if(op->type == MPI_QUEUE_OP_CLOSE) {
					itable_remove(active_jobs, workers[i]->jobid);
					list_push_tail(complete_jobs, workers[i]);
					if(!(workers[i]->status & MPI_QUEUE_JOB_FAILED))
						workers[i]->status = MPI_QUEUE_JOB_COMPLETE;
					workers[i] = NULL;
					i--;
					continue;
				}
				
				MPI_Send(op, sizeof(*op), MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD);
				if(op->buffer_length) {
					MPI_Send(op->buffer, op->buffer_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD);
					free(op->buffer);
					op->buffer_length = 0;
					op->buffer = NULL;
				}
				MPI_Irecv(op, sizeof(*op), MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->request);
				workers[i]->status = MPI_QUEUE_JOB_BUSY;
			}
		}
	}


	/** Clean up waiting & complete jobs, send Exit commands to each worker */
	if(!master) {
		// If the master link hasn't been set up yet
		// the workers will be waiting for the working directory
		char line[MPI_QUEUE_LINE_MAX];
		memset(line, 0, MPI_QUEUE_LINE_MAX);
		MPI_Bcast(line, MPI_QUEUE_LINE_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
	} else {
		link_close(master);
	}

	for(i = 1; i < num_workers; i++) {
		struct mpi_queue_operation *op, close;
		memset(&close, 0, sizeof(close));
		close.type = MPI_QUEUE_OP_EXIT;
		
		if(workers[i]) {
			if(workers[i]->status == MPI_QUEUE_JOB_BUSY) {
				MPI_Wait(&workers[i]->request, &workers[i]->mpi_status);
				op = list_peek_head(workers[i]->operations);
				
				if(op->output_length) {
					op->output_buffer = malloc(op->output_length);
					MPI_Recv(op->output_buffer, op->output_length, MPI_BYTE, workers[i]->worker_rank, 0, MPI_COMM_WORLD, &workers[i]->mpi_status);
				}
			}
			itable_remove(active_jobs, workers[i]->jobid);
			list_push_tail(complete_jobs, workers[i]);
		}
		MPI_Send(&close, sizeof(close), MPI_BYTE, i, 0, MPI_COMM_WORLD);
	}

	itable_firstkey(waiting_jobs);
	while(itable_size(waiting_jobs)) {
		struct mpi_queue_job *job;
		UINT64_T jobid;

		itable_nextkey(waiting_jobs, &jobid, (void **)&job);
		itable_remove(waiting_jobs, jobid);
		list_push_tail(complete_jobs, job);
	}

	while(list_size(complete_jobs)) {
		mpi_queue_job_delete(list_pop_head(complete_jobs));
	}

	MPI_Finalize();
	return abort_flag;
}
Example #22
static batch_job_id_t batch_job_mesos_submit (struct batch_queue *q, const char *cmd, 
	const char *extra_input_files, const char *extra_output_files, 
	struct jx *envlist, const struct rmsummary *resources )
{

	// Get the path to mesos python site-packages
	if (!is_mesos_py_path_known) {
		mesos_py_path = batch_queue_get_option(q, "mesos-path");
		if (mesos_py_path != NULL) {
			debug(D_INFO, "Get mesos_path %s from command line\n", mesos_py_path);
		}
		is_mesos_py_path_known = 1;
	}

	// Get the mesos master address
	if (!is_mesos_master_known) {
		mesos_master = batch_queue_get_option(q, "mesos-master");
		if (mesos_master == NULL) {
			fatal("Please specify the hostname of mesos master by using --mesos-master");
		} else {
			debug(D_INFO, "Get mesos_master %s from command line\n", mesos_master);
			is_mesos_master_known = 1;
		}
	}

	mesos_preload = batch_queue_get_option(q, "mesos-preload");

	if (is_mesos_py_path_known && 
		is_mesos_master_known && 
		!is_scheduler_running ) {
		// start mesos scheduler if it is not running
		start_mesos_scheduler(q);
		is_scheduler_running = 1;
	}

	int task_id = ++counter;

	debug(D_BATCH, "task %d is ready", task_id);
	struct batch_job_info *info = calloc(1, sizeof(*info));
	info->started = time(0);
	info->submitted = time(0);
	itable_insert(q->job_table, task_id, info);

	// write the ready task information as  
	// "task_id, task_cmd, inputs, outputs" to 
	// mesos_task_info, which will be scanned by 
	// mf_mesos_scheduler later. 

	FILE *task_info_fp;

	if(access(FILE_TASK_INFO, F_OK) != -1) {
		task_info_fp = fopen(FILE_TASK_INFO, "a+");
	} else {
		task_info_fp = fopen(FILE_TASK_INFO, "w+");
	}
	
	struct mesos_task *mt = mesos_task_create(task_id, cmd, extra_input_files, extra_output_files);

	fprintf(task_info_fp, "%d,%s,", mt->task_id, mt->task_cmd);

	if (extra_input_files != NULL && strlen(extra_input_files) != 0) {

		int j = 0;
		int num_input_files = text_list_size(mt->task_input_files);
		for (j = 0; j < (num_input_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_input_files, j));
		}
		fprintf(task_info_fp, "%s,", text_list_get(mt->task_input_files, num_input_files-1));

	} else {
		fprintf(task_info_fp, ",");
	}
    
	if (extra_output_files != NULL && strlen(extra_output_files) != 0) {
		int j = 0;
		int num_output_files = text_list_size(mt->task_output_files);
		for (j = 0; j < (num_output_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_output_files, j));
		}
		fprintf(task_info_fp, "%s,",text_list_get(mt->task_output_files, num_output_files-1));
	} else {
		fprintf(task_info_fp, ",");	
	}

	// The default resource requirements for each task
	int64_t cores = -1;
	int64_t memory = -1;
	int64_t disk = -1;

	if (resources) {
		cores  = resources->cores  > -1 ? resources->cores  : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk   = resources->disk   > -1 ? resources->disk   : disk;
	}

	fprintf(task_info_fp, "%" PRId64 ",%" PRId64 ",%" PRId64 ",", cores, memory, disk);

	fputs("submitted\n", task_info_fp);

	mesos_task_delete(mt);
	fclose(task_info_fp);

	return task_id;
}
Example #23
static batch_job_id_t batch_job_amazon_batch_wait(struct batch_queue *q, struct batch_job_info *info_out, time_t stoptime){
	struct internal_amazon_batch_amazon_ids amazon_ids = initialize(q);
	//succeeded check
	int done  = 0;
	char* env_var = amazon_ids.master_env_prefix;
	itable_firstkey(amazon_job_ids);
	char* jaid;
	UINT64_T jobid;
 	while(itable_nextkey(amazon_job_ids,&jobid,(void**)&jaid)){
		done = describe_aws_job(jaid,env_var);
		char* jobname = string_format("%s_%u",queue_name,(unsigned int)jobid);
		unsigned int id = (unsigned int)jobid;
		if(done == DESCRIBE_AWS_JOB_SUCCESS){
			if(itable_lookup(done_jobs,id+1) == NULL){
				//id is done, returning here
				debug(D_BATCH,"Inserting id: %u into done_jobs",id);
				itable_insert(done_jobs,id+1,jobname);
				itable_remove(amazon_job_ids,jobid);
				
				//pull files from s3
				char* output_files = itable_lookup(done_files,id);
				struct list* file_list = extract_file_names_from_list(output_files);
				if(list_size(file_list)> 0){
					list_first_item(file_list);
					char* cur_file = NULL;
					while((cur_file=list_next_item(file_list)) != NULL){
						debug(D_BATCH,"Copying over %s",cur_file);
						char* get_from_s3_cmd = string_format("%s aws s3 cp s3://%s/%s.txz ./%s.txz && tar -xvf %s.txz && rm %s.txz",env_var,bucket_name,cur_file,cur_file, cur_file, cur_file);
						int outputcode = sh_system(get_from_s3_cmd);
						debug(D_BATCH,"output code from calling S3 to pull file %s: %i",cur_file,outputcode);
						FILE* tmpOut = fopen(cur_file,"r");
						if(tmpOut){
							debug(D_BATCH,"File does indeed exist: %s",cur_file);
							fclose(tmpOut);
						}else{
							debug(D_BATCH,"File doesn't exist: %s",cur_file);
						}
						free(get_from_s3_cmd);
					}
				}
				list_free(file_list);
				list_delete(file_list);
				
				//Let Makeflow know we're all done!
				debug(D_BATCH,"Removing the job from the job_table");
				struct batch_job_info* info = itable_remove(q->job_table, id);//got from batch_job_amazon.c
				info->finished = time(0);//get now
				info->exited_normally=1;
				info->exit_code=finished_aws_job_exit_code(jaid,env_var);
				debug(D_BATCH,"copying over the data to info_out");
				memcpy(info_out, info, sizeof(struct batch_job_info));
				free(info);

				char* jobdef = aws_job_def(jaid);
				del_job_def(jobdef);
				free(jobdef);

				return id;
			}
		}else if(done == DESCRIBE_AWS_JOB_FAILED || done == DESCRIBE_AWS_JOB_NON_EXIST){
			if(itable_lookup(done_jobs,id+1)==NULL){
				//id is done, returning here
				itable_insert(done_jobs,id+1,jobname);
				itable_remove(amazon_job_ids,jobid);
				
				debug(D_BATCH,"Failed job: %i",id);
				
				struct batch_job_info* info = itable_remove(q->job_table, id);//got from batch_job_amazon.c
				info->finished = time(0); //get now
				info->exited_normally=0;
				int exc = finished_aws_job_exit_code(jaid,env_var);
				info->exit_code= exc == 0 ? -1 : exc;
				memcpy(info_out, info, sizeof(*info));
				free(info);

				char* jobdef = aws_job_def(jaid);
				del_job_def(jobdef);
				free(jobdef);

				return id;
			}
		}else{
			continue;
		}
	}
	return -1;
}
Example #24
static batch_job_id_t batch_job_mesos_wait (struct batch_queue * q, struct batch_job_info * info_out, time_t stoptime)
{
		
	char line[MAX_BUF_SIZE];
	FILE *task_state_fp;
	int last_pos = 0;
	int curr_pos = 0;
	int read_len = 0;

	if(!finished_tasks) {
		finished_tasks = itable_create(0);
	}

	// spin until the scheduler has created the task state file
	while(access(FILE_TASK_STATE, F_OK) == -1) {}

	task_state_fp = fopen(FILE_TASK_STATE, "r");

	while(1) {

		char *task_id_str;
		char *task_stat_str;
		const char *task_exit_code;
		int task_id;
				
		while(fgets(line, MAX_BUF_SIZE, task_state_fp) != NULL) {
			
			curr_pos = ftell(task_state_fp);
			read_len = curr_pos - last_pos;
			last_pos = curr_pos;

			// trim the newline character
			if (line[read_len-1] == '\n') {
				line[read_len-1] = '\0';
				--read_len;
			}

			task_id_str = strtok(line, ",");
			task_id = atoi(task_id_str);

			// There is a new task finished
			if(itable_lookup(finished_tasks, task_id) == NULL) {

				struct batch_job_info *info = itable_remove(q->job_table, task_id);
			    	
				info->finished = time(0);
				task_stat_str = strtok(NULL, ",");

				if (strcmp(task_stat_str, "finished") == 0) {
					info->exited_normally = 1;
				} else if (strcmp(task_stat_str, "failed") == 0) {
					info->exited_normally = 0;
					task_exit_code = strtok(NULL, ",");

					// 444 is an arbitrary exit code set in mf_mesos_scheduler,
					// which means the task failed to retrieve the outputs
					if(atoi(task_exit_code) == 444) {
						info->exit_code = 444;
						debug(D_BATCH, "Task %s failed to retrieve the output.", task_id_str);
					}
					info->exit_code = atoi(task_exit_code);
				} else {
					info->exited_normally = 0;
				}

				memcpy(info_out, info, sizeof(*info));
				free(info);
				fclose(task_state_fp);

				int itable_val = 1;
				itable_insert(finished_tasks, task_id, &itable_val);

				return task_id;
			}
		}
		sleep(1);

		if(stoptime != 0 && time(0) >= stoptime) {
			fclose(task_state_fp);
			return -1;
		}
	}

}
Example #25
static batch_job_id_t batch_job_cluster_submit (struct batch_queue * q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	batch_job_id_t jobid;
	struct batch_job_info *info;
	const char *options = hash_table_lookup(q->options, "batch-options");

	if(!setup_batch_wrapper(q, cluster_name)) {
		debug(D_NOTICE|D_BATCH,"couldn't setup wrapper file: %s",strerror(errno));
		return -1;
	}

	/* Use the first word in the command line as a name for the job. */

	char *name = xxstrdup(cmd);
	{
		char *s = strchr(name, ' ');
		if(s)
			*s = 0;
	}

	/*
	Experiment shows that passing environment variables
	through the command-line doesn't work, due to multiple
	levels of quote interpretation.  So, we export all
	variables into the environment, and rely upon the -V
	option to load the environment into the job.
	*/

	if(envlist) {
		jx_export(envlist);
	}

	/*
	Pass the command to run through the environment as well.
	*/
	setenv("BATCH_JOB_COMMAND", cmd, 1);

	char *command = string_format("%s %s %s '%s' %s %s.wrapper",
		cluster_submit_cmd,
		cluster_options,
		cluster_jobname_var,
		path_basename(name),
		options ? options : "",
		cluster_name);

	free(name);

	debug(D_BATCH, "%s", command);

	FILE *file = popen(command, "r");
	free(command);
	if(!file) {
		debug(D_BATCH, "couldn't submit job: %s", strerror(errno));
		return -1;
	}

	char line[BATCH_JOB_LINE_MAX] = "";
	while(fgets(line, sizeof(line), file)) {
		if(sscanf(line, "Your job %" SCNbjid, &jobid) == 1
		|| sscanf(line, "Submitted batch job %" SCNbjid, &jobid) == 1
		|| sscanf(line, "%" SCNbjid, &jobid) == 1 ) {
			debug(D_BATCH, "job %" PRIbjid " submitted", jobid);
			pclose(file);
			info = malloc(sizeof(*info));
			memset(info, 0, sizeof(*info));
			info->submitted = time(0);
			itable_insert(q->job_table, jobid, info);
			return jobid;
		}
	}

	if(strlen(line)) {
		debug(D_NOTICE, "job submission failed: %s", line);
	} else {
		debug(D_NOTICE, "job submission failed: no output from %s", cluster_name);
	}
	pclose(file);
	return -1;
}
Example #26
struct mpi_queue_task *mpi_queue_wait(struct mpi_queue *q, int timeout)
{
	struct mpi_queue_task *t;
	time_t stoptime;
	int result;

	if(timeout == MPI_QUEUE_WAITFORTASK) {
		stoptime = 0;
	} else {
		stoptime = time(0) + timeout;
	}


	while(1) {
		// If a task is already complete, return it
		t = list_pop_head(q->complete_list);
		if(t)
			return t;

		if(list_size(q->ready_list) == 0 && itable_size(q->active_list) == 0)
			break;

		// Wait no longer than the caller's patience.
		int msec;
		int sec;
		if(stoptime) {
			sec = MAX(0, stoptime - time(0));
			msec = sec * 1000;
		} else {
			sec = 5;
			msec = 5000;
		}

		if(!q->mpi_link) {
			q->mpi_link = link_accept(q->master_link, stoptime);
			if(q->mpi_link) {
				char working_dir[MPI_QUEUE_LINE_MAX];
				link_tune(q->mpi_link, LINK_TUNE_INTERACTIVE);
				link_usleep(q->mpi_link, msec, 0, 1);
				getcwd(working_dir, MPI_QUEUE_LINE_MAX);
				link_putfstring(q->mpi_link, "workdir %s\n", stoptime, working_dir);
				result = link_usleep(q->mpi_link, msec, 1, 1);
			} else {
				result = 0;
			}
		} else {
			debug(D_MPI, "Waiting for link to be ready\n");
			result = link_usleep(q->mpi_link, msec, 1, 1);
		}

		// If nothing was awake, restart the loop or return without a task.
		if(result <= 0) {
			if(stoptime && time(0) >= stoptime) {
				return 0;
			} else {
				continue;
			}
		}

		debug(D_MPI, "sending %d tasks to the MPI master process\n", list_size(q->ready_list));
		// Send all ready tasks to the MPI master process
		while(list_size(q->ready_list)) {
			struct mpi_queue_task *t = list_pop_head(q->ready_list);
			result = dispatch_task(q->mpi_link, t, msec/1000);
			if(result <= 0)
				return 0;
			itable_insert(q->active_list, t->taskid, t);
		}

		// Receive any results back
		result = get_results(q->mpi_link, q->active_list, q->complete_list, msec/1000);
		if(result < 0) {
			return 0;
		}
	}

	return 0;
}
Example #27
void makeflow_wrapper_generate_files( struct batch_task *task, struct list *input, struct list *output, struct dag_node *n, struct makeflow_wrapper *w)
{
	char *f;
	char *nodeid = string_format("%d",n->nodeid);

	list_first_item(input);
	while((f = list_next_item(input)))
	{
		char *filename = string_replace_percents(f, nodeid);
		char *f = xxstrdup(filename);
		free(filename);

		char *remote, *p;
		struct dag_file *file;
		p = strchr(f, '=');
		if(p) {
			*p = 0;
			file = dag_file_lookup_or_create(n->d, f);
			if(!n->local_job && !itable_lookup(w->remote_names, (uintptr_t) file)){
				remote = xxstrdup(p+1);
				itable_insert(w->remote_names, (uintptr_t) file, (void *)remote);
				hash_table_insert(w->remote_names_inv, remote, (void *)file);
				makeflow_hook_add_input_file(n->d, task, f, remote, file->type);
			} else {
				makeflow_hook_add_input_file(n->d, task, f, NULL, file->type);
			}
			*p = '=';
		} else {
			file = dag_file_lookup_or_create(n->d, f);
			makeflow_hook_add_input_file(n->d, task, f, NULL, file->type);
		}
		free(f);
	}

	list_first_item(output);
	while((f = list_next_item(output)))
	{
		char *filename = string_replace_percents(f, nodeid);
		char *f = xxstrdup(filename);
		free(filename);

		char *remote, *p;
		struct dag_file *file;
		p = strchr(f, '=');
		if(p) {
			*p = 0;
			file = dag_file_lookup_or_create(n->d, f);
			if(!n->local_job && !itable_lookup(w->remote_names, (uintptr_t) file)){
				remote = xxstrdup(p+1);
				itable_insert(w->remote_names, (uintptr_t) file, (void *)remote);
				hash_table_insert(w->remote_names_inv, remote, (void *)file);
				makeflow_hook_add_output_file(n->d, task, f, remote, file->type);
			} else {
				makeflow_hook_add_output_file(n->d, task, f, NULL, file->type);
			}
			*p = '=';
		} else {
			file = dag_file_lookup_or_create(n->d, f);
			makeflow_hook_add_output_file(n->d, task, f, NULL, file->type);
		}
		free(f);
	}
	free(nodeid);

}