Example #1
/* Report the queue's configured working directory through the caller's
   buffer, following the getcwd(3) convention: ERANGE if it does not fit. */
static int batch_fs_dryrun_getcwd (struct batch_queue *q, char *buf, size_t size) {
	const char *cwd = batch_queue_get_option(q, "cwd");
	size_t pathlength = strlen(cwd);
	if (pathlength + 1 > size) {
		errno = ERANGE;
		return -1;
	} else {
		strcpy(buf, cwd);
		return 0;
	}
}
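A minimal, self-contained sketch of the same buffer contract (copy a path into a caller-supplied buffer, set ERANGE on overflow, as getcwd(3) does). The get_path() helper is a hypothetical stand-in for batch_queue_get_option(q, "cwd"):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for batch_queue_get_option(q, "cwd"). */
static const char *get_path(void) { return "/tmp/work"; }

static int copy_path(char *buf, size_t size) {
	const char *p = get_path();
	if (strlen(p) + 1 > size) {
		errno = ERANGE;	/* buffer too small, same convention as getcwd(3) */
		return -1;
	}
	strcpy(buf, p);
	return 0;
}

int main(void) {
	char small[4], big[64];
	int r = copy_path(small, sizeof(small));
	printf("small buffer: rc=%d errno=%d\n", r, r ? errno : 0);
	r = copy_path(big, sizeof(big));
	printf("big buffer:   rc=%d cwd=%s\n", r, big);
	return 0;
}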
Example #2
/* On queue teardown, record the final queue state in MESOS_DONE_FILE
   ("aborted", "failed", or "finished") for the Mesos scheduler to read. */
static int batch_queue_mesos_free(struct batch_queue *q)
{
	FILE *fp;
	fp = fopen(MESOS_DONE_FILE, "w");

	if(fp == NULL) {
		fatal("Fail to clean up batch queue. %s\n", strerror(errno));
	}

	int batch_queue_abort_flag = atoi(batch_queue_get_option(q, "batch-queue-abort-flag"));
	int batch_queue_failed_flag = atoi(batch_queue_get_option(q, "batch-queue-failed-flag"));

	if(batch_queue_abort_flag) {
		fprintf(fp, "aborted");
	} else if(batch_queue_failed_flag) {
		fprintf(fp, "failed");
	} else {
		fprintf(fp, "finished");
	}

	fclose(fp);
	return 0;
}
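A self-contained sketch of the same done-file protocol; the file name queue.done and the use of stderr/exit in place of cctools' fatal() are assumptions for the standalone version:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical sentinel name; the real code writes MESOS_DONE_FILE. */
#define DONE_FILE "queue.done"

static void write_done_file(int aborted, int failed) {
	FILE *fp = fopen(DONE_FILE, "w");
	if (fp == NULL) {
		fprintf(stderr, "Failed to clean up batch queue. %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}
	/* Exactly one terminal state is recorded, checked in priority order. */
	if (aborted)
		fputs("aborted", fp);
	else if (failed)
		fputs("failed", fp);
	else
		fputs("finished", fp);
	fclose(fp);
}

int main(void) {
	write_done_file(0, 1);	/* records "failed" */
	return 0;
}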
Example #3
/* Submit one job to Condor: generate condor.sh and condor.submit, run
   condor_submit, and parse the cluster id from its output. */
static batch_job_id_t batch_job_condor_submit (struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	FILE *file;
	int njobs;
	int jobid;
	const char *options = hash_table_lookup(q->options, "batch-options");

	if(setup_condor_wrapper("condor.sh") < 0) {
		debug(D_BATCH, "could not create condor.sh: %s", strerror(errno));
		return -1;
	}

	if(!string_istrue(hash_table_lookup(q->options, "skip-afs-check"))) {
		char *cwd = path_getcwd();
		if(!strncmp(cwd, "/afs", 4)) {
			debug(D_NOTICE|D_BATCH, "The working directory is '%s':", cwd);
			debug(D_NOTICE|D_BATCH, "This won't work because Condor is not able to write to files in AFS.");
			debug(D_NOTICE|D_BATCH, "Instead, run makeflow from a local disk like /tmp.");
			debug(D_NOTICE|D_BATCH, "Or, use the Work Queue with -T wq and condor_submit_workers.");
			free(cwd);
			exit(EXIT_FAILURE);
		}
		free(cwd);
	}

	file = fopen("condor.submit", "w");
	if(!file) {
		debug(D_BATCH, "could not create condor.submit: %s", strerror(errno));
		return -1;
	}

	fprintf(file, "universe = vanilla\n");
	fprintf(file, "executable = condor.sh\n");
	char *escaped = string_escape_condor(cmd);
	fprintf(file, "arguments = %s\n", escaped);
	free(escaped);
	if(extra_input_files)
		fprintf(file, "transfer_input_files = %s\n", extra_input_files);
	// Note that we do not use transfer_output_files, because that causes the job
	// to get stuck in a system hold if the files are not created.
	fprintf(file, "should_transfer_files = yes\n");
	fprintf(file, "when_to_transfer_output = on_exit\n");
	fprintf(file, "notification = never\n");
	fprintf(file, "copy_to_spool = true\n");
	fprintf(file, "transfer_executable = true\n");
	fprintf(file, "keep_claim_idle = 30\n");
	fprintf(file, "log = %s\n", q->logfile);

	const char *c_req = batch_queue_get_option(q, "condor-requirements");
	char *bexp = blacklisted_expression(q);

	if(c_req && bexp) {
		fprintf(file, "requirements = %s && %s\n", c_req, bexp);
	} else if(c_req) {
		fprintf(file, "requirements = %s\n", c_req);
	} else if(bexp) {
		fprintf(file, "requirements = %s\n", bexp);
	}

	if(bexp)
		free(bexp);

	/*
	Getting environment variables formatted for a condor submit
	file is very hairy, due to some strange quoting rules.
	To avoid problems, we simply export vars to the environment,
	and then tell condor getenv=true, which pulls in the environment.
	*/

	fprintf(file, "getenv = true\n");

	if(envlist) {
		jx_export(envlist);
	}

	if(options)
		fprintf(file, "%s\n", options);

	/* Set the same defaults as condor_submit_workers */
	int64_t cores  = 1;
	int64_t memory = 1024;
	int64_t disk   = 1024;

	if(resources) {
		cores  = resources->cores  > -1 ? resources->cores  : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk   = resources->disk   > -1 ? resources->disk   : disk;
	}

	/* convert disk to KB */
	disk *= 1024;

	if(batch_queue_get_option(q, "autosize")) {
		fprintf(file, "request_cpus   = ifThenElse(%" PRId64 " > TotalSlotCpus, %" PRId64 ", TotalSlotCpus)\n", cores, cores);
		fprintf(file, "request_memory = ifThenElse(%" PRId64 " > TotalSlotMemory, %" PRId64 ", TotalSlotMemory)\n", memory, memory);
		fprintf(file, "request_disk   = ifThenElse((%" PRId64 ") > TotalSlotDisk, (%" PRId64 "), TotalSlotDisk)\n", disk, disk);
	}
	else {
		fprintf(file, "request_cpus = %" PRId64 "\n", cores);
		fprintf(file, "request_memory = %" PRId64 "\n", memory);
		fprintf(file, "request_disk = %" PRId64 "\n", disk);
	}

	fprintf(file, "queue\n");
	fclose(file);

	file = popen("condor_submit condor.submit", "r");
	if(!file)
		return -1;

	char line[BATCH_JOB_LINE_MAX];
	while(fgets(line, sizeof(line), file)) {
		if(sscanf(line, "%d job(s) submitted to cluster %d", &njobs, &jobid) == 2) {
			pclose(file);
			debug(D_BATCH, "job %d submitted to condor", jobid);
			struct batch_job_info *info = calloc(1, sizeof(*info));
			info->submitted = time(0);
			itable_insert(q->job_table, jobid, info);
			return jobid;
		}
	}

	pclose(file);
	debug(D_BATCH, "failed to submit job to condor!");
	return -1;
}
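The cluster id is recovered by pattern-matching the text condor_submit prints; here is that parse in isolation (the sample line imitates typical condor_submit output):

#include <stdio.h>

int main(void) {
	const char *line = "1 job(s) submitted to cluster 42.";
	int njobs, jobid;
	/* Ordinary characters in a scanf format must match the input exactly,
	   so "job(s) submitted to cluster" anchors the two numbers. */
	if (sscanf(line, "%d job(s) submitted to cluster %d", &njobs, &jobid) == 2) {
		printf("parsed: %d job(s), cluster %d\n", njobs, jobid);
	} else {
		printf("no match\n");
	}
	return 0;
}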
Example #4
/* Wait up to stoptime for a Work Queue task to complete. On completion,
   fill *info from the task and return its taskid; return 0 if the queue
   is empty, or -1 on timeout or error. */
static batch_job_id_t batch_job_wq_wait (struct batch_queue * q, struct batch_job_info * info, time_t stoptime)
{
	static int try_open_log = 0;
	int timeout, taskid = -1;

	if(!try_open_log)
	{
		try_open_log = 1;
		if(!work_queue_specify_log(q->data, q->logfile))
		{
			return -1;
		}

		const char *transactions = batch_queue_get_option(q, "batch_log_transactions_name");
		if(transactions) {
			work_queue_specify_transactions_log(q->data, transactions);
		}
	}

	if(stoptime == 0) {
		timeout = WORK_QUEUE_WAITFORTASK;
	} else {
		timeout = MAX(0, stoptime - time(0));
	}

	struct work_queue_task *t = work_queue_wait(q->data, timeout);
	if(t) {
		info->submitted = t->time_when_submitted / 1000000;
		info->started   = t->time_when_commit_end / 1000000;
		info->finished  = t->time_when_done / 1000000;
		info->exited_normally = 1;
		info->exit_code = t->return_status;
		info->exit_signal = 0;
		info->disk_allocation_exhausted = t->disk_allocation_exhausted;

		/*
		   If the standard output of the job is not empty,
		   then print it, because this is analogous to a Unix
		   job, and would otherwise be lost.  Important for
		   capturing errors from the program.
		 */

		if(t->output && t->output[0]) {
			if(t->output[1] || t->output[0] != '\n') {
				string_chomp(t->output);
				printf("%s\n", t->output);
			}
		}

		char *outfile = itable_remove(q->output_table, t->taskid);
		if(outfile) {
			FILE *file = fopen(outfile, "w");
			if(file) {
				/* t->output may be NULL when the task produced no output. */
				if(t->output)
					fwrite(t->output, strlen(t->output), 1, file);
				fclose(file);
			}
			free(outfile);
		}

		taskid = t->taskid;
		work_queue_task_delete(t);
	}

	if(taskid >= 0) {
		return taskid;
	}

	if(work_queue_empty(q->data)) {
		return 0;
	} else {
		return -1;
	}
}
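The timeout computation in isolation: stoptime == 0 means block until a task finishes, otherwise wait the remaining seconds, clamped at zero. A minimal sketch, with WAIT_FOREVER standing in for WORK_QUEUE_WAITFORTASK:

#include <stdio.h>
#include <time.h>

#define WAIT_FOREVER (-1)	/* stand-in for WORK_QUEUE_WAITFORTASK */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int wait_timeout(time_t stoptime) {
	if (stoptime == 0)
		return WAIT_FOREVER;	/* block until a task finishes */
	return (int) MAX(0, stoptime - time(0));	/* never negative */
}

int main(void) {
	printf("block:  %d\n", wait_timeout(0));
	printf("future: %d\n", wait_timeout(time(0) + 30));
	printf("past:   %d\n", wait_timeout(time(0) - 30));
	return 0;
}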
Example #5
/* Submit a task to the Mesos backend by appending its description to the
   task info file consumed by mf_mesos_scheduler. */
static batch_job_id_t batch_job_mesos_submit (struct batch_queue *q, const char *cmd,
	const char *extra_input_files, const char *extra_output_files,
	struct jx *envlist, const struct rmsummary *resources )
{

	// Get the path to mesos python site-packages
	if (!is_mesos_py_path_known) {
		mesos_py_path = batch_queue_get_option(q, "mesos-path");
		if (mesos_py_path != NULL) {
			debug(D_INFO, "Got mesos-path %s from the command line\n", mesos_py_path);
		}
		is_mesos_py_path_known = 1;
	}

	// Get the mesos master address
	if (!is_mesos_master_known) {
		mesos_master = batch_queue_get_option(q, "mesos-master");
		if (mesos_master == NULL) {
			fatal("Please specify the hostname of mesos master by using --mesos-master");
		} else {
			debug(D_INFO, "Get mesos_path %s from command line\n", mesos_py_path);
			is_mesos_master_known = 1;
		}
	}

	mesos_preload = batch_queue_get_option(q, "mesos-preload");

	if (is_mesos_py_path_known && 
		is_mesos_master_known && 
		!is_scheduler_running ) {
		// start mesos scheduler if it is not running
		start_mesos_scheduler(q);
		is_scheduler_running = 1;
	}

	int task_id = ++counter;

	debug(D_BATCH, "task %d is ready", task_id);
	struct batch_job_info *info = calloc(1, sizeof(*info));
	info->started = time(0);
	info->submitted = time(0);
	itable_insert(q->job_table, task_id, info);

	// Write the ready task information as
	// "task_id,task_cmd,inputs,outputs,cores,memory,disk,status" to
	// mesos_task_info, which will be scanned by
	// mf_mesos_scheduler later.

	FILE *task_info_fp;

	if(access(FILE_TASK_INFO, F_OK) != -1) {
		task_info_fp = fopen(FILE_TASK_INFO, "a+");
	} else {
		task_info_fp = fopen(FILE_TASK_INFO, "w+");
	}

	if(task_info_fp == NULL) {
		fatal("Failed to open %s. %s\n", FILE_TASK_INFO, strerror(errno));
	}
	
	struct mesos_task *mt = mesos_task_create(task_id, cmd, extra_input_files, extra_output_files);

	fprintf(task_info_fp, "%d,%s,", mt->task_id, mt->task_cmd);

	if (extra_input_files != NULL && strlen(extra_input_files) != 0) {

		int j = 0;
		int num_input_files = text_list_size(mt->task_input_files);
		for (j = 0; j < (num_input_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_input_files, j));
		}
		fprintf(task_info_fp, "%s,", text_list_get(mt->task_input_files, num_input_files-1));

	} else {
		fprintf(task_info_fp, ",");
	}
    
	if (extra_output_files != NULL && strlen(extra_output_files) != 0) {
		int j = 0;
		int num_output_files = text_list_size(mt->task_output_files);
		for (j = 0; j < (num_output_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_output_files, j));
		}
		fprintf(task_info_fp, "%s,",text_list_get(mt->task_output_files, num_output_files-1));
	} else {
		fprintf(task_info_fp, ",");	
	}

	// The default resource requirements for each task; -1 means unspecified
	int64_t cores = -1;
	int64_t memory = -1;
	int64_t disk = -1;

	if (resources) {
		cores  = resources->cores  > -1 ? resources->cores  : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk   = resources->disk   > -1 ? resources->disk   : disk;
	}

	fprintf(task_info_fp, "%" PRId64 ",%" PRId64 ",%" PRId64 ",", cores, memory, disk);

	fputs("submitted\n", task_info_fp);

	mesos_task_delete(mt);
	fclose(task_info_fp);

	return task_id;
}
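A self-contained sketch of the comma-separated record this function appends per task; the write_record() helper is hypothetical, but the field order (task_id,cmd,inputs,outputs,cores,memory,disk,status) follows the code above, with space-separated file names inside the inputs and outputs fields:

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical writer mirroring the record layout above. */
static void write_record(FILE *fp, int id, const char *cmd,
		const char *inputs, const char *outputs,
		int64_t cores, int64_t memory, int64_t disk) {
	fprintf(fp, "%d,%s,", id, cmd);
	fprintf(fp, "%s,", inputs ? inputs : "");	/* empty field if no inputs */
	fprintf(fp, "%s,", outputs ? outputs : "");	/* empty field if no outputs */
	fprintf(fp, "%" PRId64 ",%" PRId64 ",%" PRId64 ",", cores, memory, disk);
	fputs("submitted\n", fp);
}

int main(void) {
	write_record(stdout, 1, "./sim input.dat", "input.dat sim", "out.dat", -1, -1, -1);
	return 0;
}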