/** * @brief * executes a job * * @param[in] job - The fully qualified job id. * @param[in] server - The name of the server that manages the job. * @param[in] location - location indicating where to run job * * @return - Void * * @File Variables: * exitstatus Set to two if an error occurs. * */ static void execute(char *job, char *server, char *location) { int ct; /* Connection to the server */ int err; /* Error return from pbs_run */ int out; /* Stores the size of err_msg_buf*/ int located = FALSE; char *errmsg; char err_msg_buf[COMMENT_BUF_SIZE] = {'\0'}; /* generic buffer - comments & logging*/ char rmt_server[MAXSERVERNAME]; cnt: if ((ct = cnt2server(server)) > 0) { if (async) err = pbs_asyrunjob(ct, job, location, NULL); else err = pbs_runjob(ct, job, location, NULL); if (err && (pbs_errno != PBSE_UNKJOBID)) { errmsg = pbs_geterrmsg(ct); if (errmsg != NULL) { if (pbs_errno == PBSE_UNKNODE) { out = snprintf(err_msg_buf, sizeof(err_msg_buf),"qrun: %s %s",errmsg, location); if (out >= sizeof(err_msg_buf)) { fprintf(stderr,"%s...\n", err_msg_buf); } else { fprintf(stderr, "%s\n", err_msg_buf); } } else { prt_job_err("qrun", ct, job); } } else { fprintf(stderr, "qrun : Server returned error %d for job ", pbs_errno); } exitstatus = 2; } else if (err && (pbs_errno == PBSE_UNKJOBID) && !located) { located = TRUE; if (locate_job(job, server, rmt_server)) { pbs_disconnect(ct); strcpy(server, rmt_server); goto cnt; } prt_job_err("qrun", ct, job); exitstatus = 2; } pbs_disconnect(ct); } else { fprintf(stderr, "qrun: could not connect to server %s (%d)\n", server, pbs_errno); exitstatus = 2; } }
/*
 *
 *	run_update_job - run the job and update the job information
 *
 *	  pbs_sd - connection to pbs_server
 *	  sinfo  - server job is on
 *	  qinfo  - queue job resides in
 *	  jinfo  - the job to run
 *
 *	The job comment is set before the run request is sent.  When load
 *	balancing is enabled, find_best_node() picks the node to run on;
 *	otherwise no node is passed to pbs_runjob().
 *
 *	returns the value returned by pbs_runjob(): 0 on success, non-zero
 *	on failure - see pbs_errno for more info
 *
 */
int run_update_job(int pbs_sd, server_info *sinfo, queue_info *qinfo,
                   job_info *jinfo)
{
  int ret;				/* return code from pbs_runjob() */
  node_info *best_node = NULL;		/* best node to run job on */
  char *best_node_name = NULL;		/* name of best node */
  char buf[256] = {'\0'};		/* generic buffer - comments & logging*/
  char timebuf[128];			/* buffer to hold the time and date */
  resource_req *res;			/* ptr to the resource of ncpus */
  int ncpus;				/* numeric amount of resource ncpus */
  char *errmsg;				/* used for pbs_geterrmsg() */
  struct tm *now;			/* broken-down current time */

  /* localtime() may fail and strftime() returns 0 on overflow; in both
   * cases timebuf would be unusable, so fall back to a plain word.
   */
  now = localtime(&cstat.current_time);
  if (now == NULL ||
      strftime(timebuf, sizeof(timebuf), "started on %a %b %d at %H:%M", now) == 0)
    snprintf(timebuf, sizeof(timebuf), "started");

  if (cstat.load_balancing || cstat.load_balancing_rr)
  {
    best_node = find_best_node(jinfo, sinfo -> timesharing_nodes);
    if (best_node != NULL)
    {
      best_node_name = best_node -> name;
      /* bounded: the node name length is not known here */
      snprintf(buf, sizeof(buf), "Job run on node %s - %s",
               best_node_name, timebuf);
    }
  }

  if (best_node == NULL)
    snprintf(buf, sizeof(buf), "Job %s", timebuf);

  update_job_comment(pbs_sd, jinfo, buf);
  buf[0] = '\0';

  ret = pbs_runjob(pbs_sd, jinfo -> name, best_node_name, NULL);

  if (ret == 0)
  {
    /* If a job is 100% efficient, it will raise the load average by 1 per
     * cpu it uses.  Temporarily inflate load average by that value
     */
    if (cstat.load_balancing && best_node != NULL)
    {
      if ((res = find_resource_req(jinfo -> resreq, "ncpus")) == NULL)
        ncpus = 1;
      else
        ncpus = res -> amount;

      best_node -> loadave += ncpus;
    }

    if (cstat.help_starving_jobs && jinfo == cstat.starving_job)
      jinfo -> sch_priority = 0;

    sched_log(PBSEVENT_SCHED, PBS_EVENTCLASS_JOB, jinfo -> name, "Job Run");

    update_server_on_run(sinfo, qinfo, jinfo);
    update_queue_on_run(qinfo, jinfo);
    update_job_on_run(pbs_sd, jinfo);

    if (cstat.fair_share)
      update_usage_on_run(jinfo);

    /* rebuild the running-job lists now that this job has been updated */
    free(sinfo -> running_jobs);
    sinfo -> running_jobs = job_filter(sinfo -> jobs, sinfo -> sc.total,
                                       check_run_job, NULL);

    free(qinfo -> running_jobs);
    qinfo -> running_jobs = job_filter(qinfo -> jobs, qinfo -> sc.total,
                                       check_run_job, NULL);
  }
  else
  {
    /* pbs_geterrmsg() may return NULL; never hand NULL to %s */
    errmsg = pbs_geterrmsg(pbs_sd);
    snprintf(buf, sizeof(buf), "Not Running - PBS Error: %s",
             errmsg != NULL ? errmsg : "(no error message)");
    update_job_comment(pbs_sd, jinfo, buf);
  }

  return ret;
}
/*
 * schd_run_job_on - start (or resume) a job, optionally moving it to a
 * destination queue first, and update the local job/queue bookkeeping.
 *
 *   job         - the job to run; job->queue is used as the source queue.
 *   destq       - queue to run the job in.  May be NULL for a job that is
 *                 not suspended, in which case the job is not moved.
 *   exechost    - host passed to pbs_runjob(); only used for logging when
 *                 the job is being resumed.
 *   set_comment - if non-zero, set a "Started on <date>" comment on the job.
 *
 * Returns 0 if the server accepted the run/resume request, -1 otherwise.
 */
int
schd_run_job_on(Job *job, Queue *destq, char *exechost, int set_comment)
{
    char       *id = "schd_run_job_on";
    char        reason[128];
    const char *action;		/* "started" or "resumed", for the log */
    const char *qname;		/* queue name used in log messages */
    const char *host;		/* printable form of exechost */
    char       *date;
    size_t      datelen;
    Queue      *srcq = NULL;
    int         ret = 0;

    /* Never hand NULL to %s; "(null)" is what glibc would have printed. */
    host = (exechost != NULL) ? exechost : "(null)";

    /* Get the datestamp from 'ctime()'.  Remove the trailing '\n'. */
    date = ctime(&schd_TimeNow);

    if (date != NULL) {
	datelen = strlen(date);

	if (datelen > 0 && date[datelen - 1] == '\n')
	    date[datelen - 1] = '\0';
    }

    if (set_comment) {
	/* Build the whole comment in one bounded call. */
	(void)snprintf(reason, sizeof(reason), "Started on %s%s%s",
	    (date != NULL) ? date : "(unknown time)",
	    (job->flags & JFLAGS_PRIORITY) ?
		" (EXPRESS/high priority job)" : "",
	    (job->flags & JFLAGS_WAITING) ? " (long-waiting job)" : "");

	schd_comment_job(job, reason, JOB_COMMENT_REQUIRED);
    }

    /* If this is NOT a suspended job... */
    if (!(job->flags & JFLAGS_SUSPENDED)) {

	/*
	 * If a destination Queue is provided, and it is different from the
	 * source queue, then ask PBS to move the job to that queue before
	 * running it.
	 */
	srcq = job->queue;

	/* destq may be NULL here, so log the queue the job will run in. */
	qname = (destq != NULL) ? destq->qname : srcq->qname;

	/*
	 * Move the job from its queue to the specified run queue.
	 */
	if ((destq != NULL) && (strcmp(destq->qname, srcq->qname) != 0)) {
	    if (pbs_movejob(connector, job->jobid, destq->qname, NULL)) {
		(void)sprintf(log_buffer, "move job %s to queue %s failed, %d",
		    job->jobid, destq->qname, pbs_errno);
		log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
		    log_buffer);
		DBPRT(("%s: %s\n", id, log_buffer));
		return (-1);
	    }

	    schd_move_job_to(job, destq);
	}

	/*
	 * Give the job handle (JOBID) to PBS to run.
	 */
	if (pbs_runjob(connector, job->jobid, exechost, NULL)) {
	    (void)sprintf(log_buffer,
		"failed start job %s on queue %s@%s, %d",
		job->jobid, qname, host, pbs_errno);
	    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	    DBPRT(("%s: %s\n", id, log_buffer));

	    /*
	     * Running failed!  Move the job back to the source queue (if
	     * applicable) before returning.  This prevents jobs being left
	     * in execution queues.
	     *
	     * NOTE(review): this also runs when the job was never moved
	     * (destq NULL or same queue as the source) - confirm that a
	     * move to the job's own queue is harmless.
	     */
	    if (srcq) {
		DBPRT(("Attempting to move job %s back to queue %s\n",
		    job->jobid, srcq->qname));

		if (pbs_movejob(connector, job->jobid, srcq->qname, NULL)) {
		    (void)sprintf(log_buffer,
			"failed to move job %s back to queue %s, %d",
			job->jobid, srcq->qname, pbs_errno);
		    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
			log_buffer);
		    DBPRT(("%s: %s\n", id, log_buffer));
		}

		schd_move_job_to(job, srcq);
	    }

	    return (-1);
	}

	action = "started";

    } else {	/* it IS a suspended job */

	schd_move_job_to(job, destq);

	ret = pbs_sigjob(connector, job->jobid, "resume", NULL);

	if (ret) {
	    /* Record the failure; it was previously formatted but dropped. */
	    (void)sprintf(log_buffer, "resume of job %s FAILED (%d)",
		job->jobid, ret);
	    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	    DBPRT(("%s: %s\n", id, log_buffer));
	    return (-1);
	}

	job->flags &= ~JFLAGS_SUSPENDED;
	qname = (destq != NULL) ? destq->qname : job->queue->qname;
	action = "resumed";
    }

    /* PBS accepted the job (and presumably will run it).  Log the fact. */
    (void)sprintf(log_buffer, "job %s %s on %s@%s", job->jobid, action,
	qname, host);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    /*
     * Change the state of the local representation of the job to "Running".
     */
    job->state = 'R';

    /*
     * Account for the job on this queue's statistics.  'queued' will be
     * bumped up if the queued job was moved to a new destination queue.
     */
    job->queue->queued --;
    job->queue->running ++;

    /* The queue is no longer idle.  Unset the idle timer. */
    job->queue->idle_since = 0;

    return (0);	/* Job successfully started. */
}