static int batch_fs_dryrun_getcwd (struct batch_queue *q, char *buf, size_t size)
{
	const char *cwd = batch_queue_get_option(q, "cwd");
	size_t pathlength = strlen(cwd);
	if (pathlength + 1 > size) {
		errno = ERANGE;
		return -1;
	} else {
		strcpy(buf, cwd);
		return 0;
	}
}
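/*
A minimal caller sketch (an assumed usage pattern, mirroring POSIX getcwd
semantics): failure with errno == ERANGE means the buffer was too small,
so the caller retries with a larger one. The buffer size is illustrative.

	char buf[4096];
	if(batch_fs_dryrun_getcwd(q, buf, sizeof(buf)) == 0) {
		// buf now holds the queue's configured working directory
	} else if(errno == ERANGE) {
		// buffer too small: allocate a larger one and retry
	}
*/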
static int batch_queue_mesos_free(struct batch_queue *q)
{
	FILE *fp;
	fp = fopen(MESOS_DONE_FILE, "w");
	if(fp == NULL) {
		fatal("Failed to clean up batch queue. %s\n", strerror(errno));
	}

	int batch_queue_abort_flag = atoi(batch_queue_get_option(q, "batch-queue-abort-flag"));
	int batch_queue_failed_flag = atoi(batch_queue_get_option(q, "batch-queue-failed-flag"));

	if(batch_queue_abort_flag) {
		fprintf(fp, "aborted");
	} else if(batch_queue_failed_flag) {
		fprintf(fp, "failed");
	} else {
		fprintf(fp, "finished");
	}

	fclose(fp);
	return 0;
}
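/*
The done file holds a single status word: "aborted", "failed", or "finished".
A hypothetical consumer on the scheduler side could read it back like this
sketch (the buffer size and the check shown are illustrative, not part of
this module):

	char status[16] = "";
	FILE *fp = fopen(MESOS_DONE_FILE, "r");
	if(fp) {
		fscanf(fp, "%15s", status);
		fclose(fp);
	}
	if(strcmp(status, "finished") == 0) {
		// the batch queue shut down cleanly
	}
*/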
static batch_job_id_t batch_job_condor_submit (struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	FILE *file;
	int njobs;
	int jobid;
	const char *options = hash_table_lookup(q->options, "batch-options");

	if(setup_condor_wrapper("condor.sh") < 0) {
		debug(D_BATCH, "could not create condor.sh: %s", strerror(errno));
		return -1;
	}

	if(!string_istrue(hash_table_lookup(q->options, "skip-afs-check"))) {
		char *cwd = path_getcwd();
		if(!strncmp(cwd, "/afs", 4)) {
			debug(D_NOTICE|D_BATCH, "The working directory is '%s':", cwd);
			debug(D_NOTICE|D_BATCH, "This won't work because Condor is not able to write to files in AFS.");
			debug(D_NOTICE|D_BATCH, "Instead, run makeflow from a local disk like /tmp.");
			debug(D_NOTICE|D_BATCH, "Or, use the Work Queue with -T wq and condor_submit_workers.");
			free(cwd);
			exit(EXIT_FAILURE);
		}
		free(cwd);
	}

	file = fopen("condor.submit", "w");
	if(!file) {
		debug(D_BATCH, "could not create condor.submit: %s", strerror(errno));
		return -1;
	}

	fprintf(file, "universe = vanilla\n");
	fprintf(file, "executable = condor.sh\n");

	char *escaped = string_escape_condor(cmd);
	fprintf(file, "arguments = %s\n", escaped);
	free(escaped);

	if(extra_input_files)
		fprintf(file, "transfer_input_files = %s\n", extra_input_files);

	// Note that we do not use transfer_output_files, because that causes the job
	// to get stuck in a system hold if the files are not created.
	fprintf(file, "should_transfer_files = yes\n");
	fprintf(file, "when_to_transfer_output = on_exit\n");
	fprintf(file, "notification = never\n");
	fprintf(file, "copy_to_spool = true\n");
	fprintf(file, "transfer_executable = true\n");
	fprintf(file, "keep_claim_idle = 30\n");
	fprintf(file, "log = %s\n", q->logfile);

	const char *c_req = batch_queue_get_option(q, "condor-requirements");
	char *bexp = blacklisted_expression(q);

	if(c_req && bexp) {
		fprintf(file, "requirements = %s && %s\n", c_req, bexp);
	} else if(c_req) {
		fprintf(file, "requirements = %s\n", c_req);
	} else if(bexp) {
		fprintf(file, "requirements = %s\n", bexp);
	}

	if(bexp)
		free(bexp);

	/*
	Getting environment variables formatted for a condor submit
	file is very hairy, due to some strange quoting rules.
	To avoid problems, we simply export vars to the environment,
	and then tell condor getenv=true, which pulls in the environment.
	*/
	fprintf(file, "getenv = true\n");

	if(envlist) {
		jx_export(envlist);
	}

	if(options)
		fprintf(file, "%s\n", options);

	/* set the same defaults as condor_submit_workers */
	int64_t cores = 1;
	int64_t memory = 1024;
	int64_t disk = 1024;
	if(resources) {
		cores = resources->cores > -1 ? resources->cores : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk = resources->disk > -1 ? resources->disk : disk;
	}
	/* convert disk to KB */
	disk *= 1024;

	if(batch_queue_get_option(q, "autosize")) {
		fprintf(file, "request_cpus = ifThenElse(%" PRId64 " > TotalSlotCpus, %" PRId64 ", TotalSlotCpus)\n", cores, cores);
		fprintf(file, "request_memory = ifThenElse(%" PRId64 " > TotalSlotMemory, %" PRId64 ", TotalSlotMemory)\n", memory, memory);
		fprintf(file, "request_disk = ifThenElse((%" PRId64 ") > TotalSlotDisk, (%" PRId64 "), TotalSlotDisk)\n", disk, disk);
	} else {
		fprintf(file, "request_cpus = %" PRId64 "\n", cores);
		fprintf(file, "request_memory = %" PRId64 "\n", memory);
		fprintf(file, "request_disk = %" PRId64 "\n", disk);
	}

	fprintf(file, "queue\n");
	fclose(file);

	file = popen("condor_submit condor.submit", "r");
	if(!file)
		return -1;

	char line[BATCH_JOB_LINE_MAX];
	while(fgets(line, sizeof(line), file)) {
		if(sscanf(line, "%d job(s) submitted to cluster %d", &njobs, &jobid) == 2) {
			pclose(file);
			debug(D_BATCH, "job %d submitted to condor", jobid);
			struct batch_job_info *info;
			info = malloc(sizeof(*info));
			memset(info, 0, sizeof(*info));
			info->submitted = time(0);
			itable_insert(q->job_table, jobid, info);
			return jobid;
		}
	}

	pclose(file);
	debug(D_BATCH, "failed to submit job to condor!");
	return -1;
}
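/*
For reference, with the default resources above (cores = 1, memory = 1024 MB,
disk = 1024 MB = 1048576 KB), no extra options, and autosize disabled, the
generated condor.submit looks roughly like this (the arguments and log
values are placeholders for the escaped command and q->logfile):

	universe = vanilla
	executable = condor.sh
	arguments = <escaped command>
	should_transfer_files = yes
	when_to_transfer_output = on_exit
	notification = never
	copy_to_spool = true
	transfer_executable = true
	keep_claim_idle = 30
	log = <q->logfile>
	getenv = true
	request_cpus = 1
	request_memory = 1024
	request_disk = 1048576
	queue
*/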
static batch_job_id_t batch_job_wq_wait (struct batch_queue * q, struct batch_job_info * info, time_t stoptime)
{
	static int try_open_log = 0;
	int timeout, taskid = -1;

	if(!try_open_log) {
		try_open_log = 1;
		if(!work_queue_specify_log(q->data, q->logfile)) {
			return -1;
		}
		const char *transactions = batch_queue_get_option(q, "batch_log_transactions_name");
		if(transactions) {
			work_queue_specify_transactions_log(q->data, transactions);
		}
	}

	if(stoptime == 0) {
		timeout = WORK_QUEUE_WAITFORTASK;
	} else {
		timeout = MAX(0, stoptime - time(0));
	}

	struct work_queue_task *t = work_queue_wait(q->data, timeout);
	if(t) {
		info->submitted = t->time_when_submitted / 1000000;
		info->started = t->time_when_commit_end / 1000000;
		info->finished = t->time_when_done / 1000000;
		info->exited_normally = 1;
		info->exit_code = t->return_status;
		info->exit_signal = 0;
		info->disk_allocation_exhausted = t->disk_allocation_exhausted;

		/*
		If the standard output of the job is not empty,
		then print it, because this is analogous to a Unix
		job, and would otherwise be lost. Important for
		capturing errors from the program.
		*/
		if(t->output && t->output[0]) {
			if(t->output[1] || t->output[0] != '\n') {
				string_chomp(t->output);
				printf("%s\n", t->output);
			}
		}

		char *outfile = itable_remove(q->output_table, t->taskid);
		if(outfile) {
			FILE *file = fopen(outfile, "w");
			if(file) {
				/* guard against a NULL output before writing it out */
				if(t->output)
					fwrite(t->output, strlen(t->output), 1, file);
				fclose(file);
			}
			free(outfile);
		}

		taskid = t->taskid;
		work_queue_task_delete(t);
	}

	if(taskid >= 0) {
		return taskid;
	}

	if(work_queue_empty(q->data)) {
		return 0;
	} else {
		return -1;
	}
}
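/*
An assumed caller sketch showing the return-value contract implemented
above: a positive id means that task completed, 0 means the queue is
empty, and -1 means the wait timed out with tasks still pending. Note
that stoptime == 0 blocks until a task completes (WORK_QUEUE_WAITFORTASK),
while a stoptime already in the past yields a non-blocking poll. The
30-second stoptime here is illustrative.

	struct batch_job_info info;
	batch_job_id_t id = batch_job_wq_wait(q, &info, time(0) + 30);
	if(id > 0) {
		// task `id` finished; info holds its timing and exit status
	} else if(id == 0) {
		// no tasks remain in the queue
	} else {
		// timed out; call again later
	}
*/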
static batch_job_id_t batch_job_mesos_submit (struct batch_queue *q, const char *cmd, const char *extra_input_files, const char *extra_output_files, struct jx *envlist, const struct rmsummary *resources )
{
	// Get the path to mesos python site-packages
	if (!is_mesos_py_path_known) {
		mesos_py_path = batch_queue_get_option(q, "mesos-path");
		if (mesos_py_path != NULL) {
			debug(D_INFO, "Get mesos_path %s from command line\n", mesos_py_path);
		}
		is_mesos_py_path_known = 1;
	}

	// Get the mesos master address
	if (!is_mesos_master_known) {
		mesos_master = batch_queue_get_option(q, "mesos-master");
		if (mesos_master == NULL) {
			fatal("Please specify the hostname of mesos master by using --mesos-master");
		} else {
			debug(D_INFO, "Get mesos_master %s from command line\n", mesos_master);
			is_mesos_master_known = 1;
		}
	}

	mesos_preload = batch_queue_get_option(q, "mesos-preload");

	// start mesos scheduler if it is not running
	if (is_mesos_py_path_known && is_mesos_master_known && !is_scheduler_running) {
		start_mesos_scheduler(q);
		is_scheduler_running = 1;
	}

	int task_id = ++counter;
	debug(D_BATCH, "task %d is ready", task_id);

	struct batch_job_info *info = calloc(1, sizeof(*info));
	info->started = time(0);
	info->submitted = time(0);
	itable_insert(q->job_table, task_id, info);

	// write the ready task information as
	// "task_id, task_cmd, inputs, outputs" to
	// mesos_task_info, which will be scanned by
	// mf_mesos_scheduler later.
	FILE *task_info_fp;

	if(access(FILE_TASK_INFO, F_OK) != -1) {
		task_info_fp = fopen(FILE_TASK_INFO, "a+");
	} else {
		task_info_fp = fopen(FILE_TASK_INFO, "w+");
	}

	/* guard against fopen failure before writing task info */
	if(task_info_fp == NULL) {
		fatal("Failed to open %s. %s\n", FILE_TASK_INFO, strerror(errno));
	}

	struct mesos_task *mt = mesos_task_create(task_id, cmd, extra_input_files, extra_output_files);

	fprintf(task_info_fp, "%d,%s,", mt->task_id, mt->task_cmd);

	// input files are space-separated within their comma-separated field
	if (extra_input_files != NULL && strlen(extra_input_files) != 0) {
		int j = 0;
		int num_input_files = text_list_size(mt->task_input_files);
		for (j = 0; j < (num_input_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_input_files, j));
		}
		fprintf(task_info_fp, "%s,", text_list_get(mt->task_input_files, num_input_files-1));
	} else {
		fprintf(task_info_fp, ",");
	}

	if (extra_output_files != NULL && strlen(extra_output_files) != 0) {
		int j = 0;
		int num_output_files = text_list_size(mt->task_output_files);
		for (j = 0; j < (num_output_files-1); j++) {
			fprintf(task_info_fp, "%s ", text_list_get(mt->task_output_files, j));
		}
		fprintf(task_info_fp, "%s,", text_list_get(mt->task_output_files, num_output_files-1));
	} else {
		fprintf(task_info_fp, ",");
	}

	// The default resource requirements for each task
	int64_t cores = -1;
	int64_t memory = -1;
	int64_t disk = -1;

	if (resources) {
		cores = resources->cores > -1 ? resources->cores : cores;
		memory = resources->memory > -1 ? resources->memory : memory;
		disk = resources->disk > -1 ? resources->disk : disk;
	}

	fprintf(task_info_fp, "%" PRId64 ",%" PRId64 ",%" PRId64 ",", cores, memory, disk);
	fputs("submitted\n", task_info_fp);

	mesos_task_delete(mt);
	fclose(task_info_fp);

	return task_id;
}
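/*
Each submit appends one comma-separated record to FILE_TASK_INFO, with the
input and output file lists space-separated inside their fields:

	<task_id>,<task_cmd>,<inputs>,<outputs>,<cores>,<memory>,<disk>,submitted

For example (hypothetical command and file names; -1 means the resource was
not specified):

	1,./simulate arg1,in1.txt in2.txt,out.txt,-1,-1,-1,submitted
*/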