job *svr_find_job(

  char *jobid,      /* I */
  int   get_subjob) /* I */

  {
  char *at;
  char *comp;
  int   different = FALSE;
  char *dash = NULL;
  char *dot = NULL;
  char  without_dash[PBS_MAXSVRJOBID + 1];

  job  *pj = NULL;

  if (LOGLEVEL >= 10)
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, jobid);

  if ((at = strchr(jobid, '@')) != NULL)
    *at = '\0'; /* strip off @server_name */

  /* jobid-0.server indicates the external sub-job of a heterogeneous
   * job. For this case we want to get jobid.server, find that, and
   * then get the external sub-job */
  if (get_subjob == TRUE)
    {
    dot = strchr(jobid, '.');

    if (((dash = strchr(jobid, '-')) != NULL) &&
        (dot != NULL) &&
        (dash < dot))
      {
      *dash = '\0';
      snprintf(without_dash, sizeof(without_dash), "%s%s", jobid, dash + 2);
      jobid = without_dash;
      }
    else
      dash = NULL;
    }

  if ((is_svr_attr_set(SRV_ATR_display_job_server_suffix)) ||
      (is_svr_attr_set(SRV_ATR_job_suffix_alias)))
    {
    comp = get_correct_jobname(jobid);
    different = TRUE;

    if (comp == NULL)
      return(NULL);
    }
  else
    {
    comp = jobid;
    }

  if (strstr(jobid, "[]") == NULL)
    {
    /* if we're searching for the external we want find_job_by_array to
     * return the parent, but if we're searching for the cray subjob then
     * we want find_job_by_array to return the sub job */
    pj = find_job_by_array(&alljobs, comp, (dash != NULL) ? FALSE : get_subjob);
    }

  /* when remotely routing jobs, they are removed from the
   * regular job list first and the array summary after.
   * Attempt to find them there if NULL
   * OR it's an array, try to find the job */
  if (pj == NULL)
    {
    /* see the comment on the above call to find_job_by_array() */
    pj = find_job_by_array(&array_summary, comp, (dash != NULL) ? FALSE : get_subjob);
    }

  if (at)
    *at = '@'; /* restore @server_name */

  if ((get_subjob == TRUE) &&
      (pj != NULL))
    {
    if (dash != NULL)
      {
      *dash = '-';

      if (pj->ji_external_clone != NULL)
        {
        pj = pj->ji_external_clone;

        lock_ji_mutex(pj, __func__, NULL, 0);
        unlock_ji_mutex(pj->ji_parent_job, __func__, NULL, 0);

        if (pj->ji_being_recycled == TRUE)
          {
          unlock_ji_mutex(pj, __func__, NULL, 0);
          pj = NULL;
          }
        }
      else
        {
        unlock_ji_mutex(pj, __func__, NULL, 0);
        pj = NULL;
        }
      }
    }

  if (different)
    free(comp);

  return(pj); /* may be NULL */
  }  /* END svr_find_job() */
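/*
 * Illustrative usage sketch (not part of the original source): svr_find_job()
 * returns the job with its ji_mutex held, so the caller owns the lock and
 * must release it, as the callers later in this file do.  A minimal caller
 * looks like:
 *
 *   job *pjob = svr_find_job(jobid, TRUE);
 *
 *   if (pjob != NULL)
 *     {
 *     ... inspect or modify pjob ...
 *     unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
 *     }
 */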
int req_rerunjob(

  struct batch_request *preq)

  {
  int   rc = PBSE_NONE;
  job  *pjob;
  int   Force;
  int   MgrRequired = TRUE;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */
  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */
    /* chk_job_request calls req_reject() */
    rc = PBSE_SYSTEM;
    return rc; /* This needs to be fixed to return an accurate error */
    }

  /* the job must be running or completed */
  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */
      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */
    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE: should force override this constraint? maybe (???) */
    /*   no, the user is saying that the job will break, and IEEE Std 1003.1
     *   specifically says rerun is to be rejected if rerunable==FALSE -garrick */
    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    {
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend &&
      !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {
    case -1:

      /* completed job was requeued */
      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */
      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate... */

      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;

      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);

        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;

        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);

          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);

          log_err(-1, __func__, log_buf);

          strcat(log_buf, ", previous output files may be lost");

          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

          rel_resc(pjob); /* free resc assigned to job */

          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up */
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);
            }

          pjob->ji_modified = 1; /* force full job save */

          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags =
      (pjob->ji_qs.ji_svrflags &
       ~(JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
         JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    reply_ack(preq);

    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
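/*
 * Illustrative sketch (not in the original source): the Force path above is
 * driven by the request's extend string.  Assuming the standard IFL call
 * pbs_rerunjob(), a client-side force-rerun might look like this (error
 * handling elided):
 *
 *   int conn = pbs_connect(server_name);
 *
 *   if (conn >= 0)
 *     {
 *     // extend == RERUNFORCE makes the server requeue even if the MOM
 *     // rejects or cannot be reached
 *     pbs_rerunjob(conn, job_id, (char *)RERUNFORCE);
 *     pbs_disconnect(conn);
 *     }
 */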
void req_stat_job_step2(

  struct stat_cntl *cntl) /* I/O (free'd on return) */

  {
  batch_request        *preq = cntl->sc_origrq;
  svrattrl             *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                  *pjob = NULL;

  struct batch_reply   *preply = &preq->rq_reply;
  int                   rc = 0;
  enum TJobStatTypeEnum type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                  exec_only = false;

  int                   bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                   job_array_index = -1;
  job_array            *pa = NULL;
  all_jobs_iterator    *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT: { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) ||
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */
      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) &&
          (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }

    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
int relay_to_mom(

  job                 **pjob_ptr,
  struct batch_request *request, /* the request to send */
  void                (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char           *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;

  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
    free(job_momname);
    return(PBSE_NORELYMOM);
    }

  free(job_momname);

  if ((node != NULL) &&
      (node->nd_state & INUSE_DOWN))
    {
    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    sprintf(log_buf, "momaddr=%s", tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  unlock_node(node, __func__, "after svr_connect", LOGLEVEL);

  handle = svr_connect(
             pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
             pjob->ji_qs.ji_un.ji_exect.ji_momport,
             &local_errno,
             NULL,
             NULL,
             ToServerDIS);

  if (handle < 0)
    {
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, "", msg_norelytomom);
    return(PBSE_NORELYMOM);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
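/*
 * Illustrative caller sketch (not in the original source): relay_to_mom()
 * unlocks the job, sends the request, then re-finds the job by id, so
 * *pjob_ptr may come back NULL (for instance, if the job was purged while
 * unlocked).  Callers such as req_signaljob() below follow this pattern:
 *
 *   rc = relay_to_mom(&pjob, dup_req, NULL);
 *
 *   if (pjob != NULL)
 *     unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
 *
 *   if (rc != PBSE_NONE)
 *     req_reject(rc, 0, preq, NULL, NULL);  // unable to reach the MOM
 */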
int req_signaljob(

  void *vp) /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */
  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      return(PBSE_NONE);
      }
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */
  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return(PBSE_NONE);
    }
#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu",
      pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, "req_signaljob", log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
void *delete_all_work(

  void *vp)

  {
  batch_request *preq = (batch_request *)vp;
  batch_request *preq_dup = duplicate_request(preq);
  job           *pjob;
  int            iter = -1;
  int            failed_deletes = 0;
  int            total_jobs = 0;
  int            rc = PBSE_NONE;
  char           tmpLine[MAXLINE];
  char          *Msg = preq->rq_extend;

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);

      /* mark this as NULL because it has been freed */
      if (rc == PURGE_SUCCESS)
        {
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }
      else
        preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE_SUCCESS means this was qdel -p all. In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine, sizeof(tmpLine), "Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* preq_dup is re-created at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(NULL);
  }  /* END delete_all_work() */
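/*
 * Illustrative sketch (not in the original source) of the duplicate-request
 * pattern used above: each call that may consume or free its batch_request
 * gets its own copy, so the original preq survives for the final reply.
 *
 *   batch_request *preq_dup = duplicate_request(preq);
 *   preq_dup->rq_noreply = TRUE;          // replies are sent on preq only
 *
 *   rc = execute_job_delete(pjob, Msg, preq_dup);
 *   // preq_dup is now owned (and eventually freed) by the delete path
 */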
void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int newstate;
  int newsub;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  switch (status)
    {
    case LOCUTION_SUCCESS:

      /* normal return, job was routed */
      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob);

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);

        if (pjob != NULL)
          svr_job_purge(pjob); /* need to remove server job struct */
        }

      break;

    case LOCUTION_FAIL:

      /* permanent rejection (or signal) */
      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

        svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return;
        }

      add_dest(pjob); /* else mark destination as bad */

      /* fall through */

    default:

      /* try routing again */
      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      /* force re-eval of job state out of Transit */
      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);

      if ((status = job_route(pjob)) == PBSE_ROUTEREJ)
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
      else if (status != 0)
        job_abt(&pjob, msg_routexceed);
      else
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

      break;
    }  /* END switch (status) */

  return;
  }  /* END finish_routing_processing() */
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
          &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */
  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request may not match the form the server and MOM
    ** store when the server attribute 'display_job_server_suffix' is
    ** FALSE (the server suffix is dropped from the displayed id).
    ** Therefore, pass the server's form of the jobid to the MOM so she
    ** can find the job to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, preq->rq_ind.rq_hold.rq_orig.rq_objname,
          PBS_MAXSVRJOBID))
      snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname,
        sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s",
        pjob->ji_qs.ji_jobid);

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold; /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

        job_save(pjob, SAVEJOB_QUICK, 0);

        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
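/*
 * Illustrative note (not in the original source): the hold attribute is a
 * bitmask, which is why req_holdjob() ORs the decoded bits in and compares
 * against the saved value afterwards.  Assuming the usual HOLD_* flag
 * macros, releasing a single hold type is the inverse operation (this is
 * the same pattern the slot-limit code later in this file uses for HOLD_l):
 *
 *   long *hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
 *
 *   *hold_val &= ~HOLD_u;                  // clear just the user hold
 *
 *   if (*hold_val == 0)                    // no holds left at all
 *     pjob->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
 */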
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

        job_save(pjob, SAVEJOB_QUICK, 0);

        snprintf(log_buf, sizeof(log_buf), "checkpoint request relayed to mom");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */
    snprintf(log_buf, sizeof(log_buf), "job is not checkpointable");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */
job *job_recov(

  char *filename) /* I */ /* pathname to job save file */

  {
  int   fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */
    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */
    return(NULL);
    }

  /* read in job quick save sub-structure */
  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);

      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */
  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */
  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to recover %s (file is likely corrupted)",
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif

    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for its node */
    /*
    char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
      namebuf);

    log_err(-1, __func__, log_buf);
    }
#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
       job array struct for this array, and we also have to link this job
       into the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was
         on hold when the server went down the
         ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even
         though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is
         JOB_SUBSTATE_HELD */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */
  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
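/*
 * Illustrative sketch (not in the original source): job_recov() recovers a
 * single saved job file, so server startup conceptually walks the jobs
 * directory and recovers each entry.  A minimal, hedged loop (the suffix
 * check shown is an assumption about the on-disk naming):
 *
 *   DIR           *dir = opendir(path_jobs);
 *   struct dirent *ent;
 *
 *   while ((dir != NULL) && ((ent = readdir(dir)) != NULL))
 *     {
 *     if (strstr(ent->d_name, JOB_FILE_SUFFIX) != NULL)
 *       {
 *       job *pj = job_recov(ent->d_name);
 *
 *       if (pj == NULL)
 *         continue;   // corrupt or ghost file was logged and skipped
 *       }
 *     }
 *
 *   if (dir != NULL)
 *     closedir(dir);
 */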
int req_locatejob(

  struct batch_request *preq)

  {
  int   rc = PBSE_NONE;
  char *at;
  int   i;
  job  *pjob;
  char *location = (char *)0;

  if ((at = strchr(preq->rq_ind.rq_locate, (int)'@')))
    *at = '\0'; /* strip off @server_name */

  pjob = svr_find_job(preq->rq_ind.rq_locate, FALSE);

  if (pjob)
    {
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    location = server_name;
    }
  else
    {
    for (i = 0; i < server.sv_tracksize; i++)
      {
      if ((server.sv_track + i)->tk_mtime &&
          !strcmp((server.sv_track + i)->tk_jobid, preq->rq_ind.rq_locate))
        {
        location = (server.sv_track + i)->tk_location;

        break;
        }
      }
    }

  if (location != NULL)
    {
    preq->rq_reply.brp_code = 0;
    preq->rq_reply.brp_auxcode = 0;
    preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Locate;

    snprintf(preq->rq_reply.brp_un.brp_locate,
      sizeof(preq->rq_reply.brp_un.brp_locate), "%s", location);

    reply_send_svr(preq);
    }
  else
    {
    if (LOGLEVEL >= 7)
      {
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        preq->rq_ind.rq_locate,
        "cannot find job in server tracking list");
      }

    rc = PBSE_UNKJOBID;
    req_reject(rc, 0, preq, NULL, NULL);
    }

  return rc;
  }  /* END req_locatejob() */
int relay_to_mom(

  job                 **pjob_ptr,
  struct batch_request *request, /* the request to send */
  void                (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char           *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  std::string     node_name;

  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
    snprintf(log_buf, sizeof(log_buf),
      "attempting to send a request to %s's mom but no exec_host list?",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;

  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
    free(job_momname);
    return(PBSE_NORELYMOM);
    }

  free(job_momname);

  if ((node != NULL) &&
      ((node->nd_state & INUSE_NOT_READY) ||
       (node->nd_power_state != POWER_STATE_RUNNING)))
    {
    node->unlock_node(__func__, "no relay mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    sprintf(log_buf, "momaddr=%s", tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  node_name = node->get_name();
  node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
  *pjob_ptr = NULL;

  handle = svr_connect(addr, port, &local_errno, NULL, NULL);

  if (handle < 0)
    {
    update_failure_counts(node_name.c_str(), -1);
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, "", msg_norelytomom);
    return(PBSE_NORELYMOM);
    }

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request, true);

  if (request->rq_reply.brp_code == PBSE_TIMEOUT)
    update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
  else
    update_failure_counts(node_name.c_str(), 0);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
void process_hold_reply(

  batch_request *preq)

  {
  job           *pjob;
  pbs_attribute  temphold;
  int            newstate;
  int            newsub;
  int            rc;
  char          *pset;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  /* preq was handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn; /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_hold.rq_orig.rq_objname,
      msg_postmomnojob);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }
  else if (preq->rq_reply.brp_code != 0)
    {
    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
           &temphold, DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */

    pjob->ji_modified = 1; /* indicate attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
      log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
      }
    else
      {
      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code
     * will never fire.  And if these flags are not set, start_exec will
     * not try to run the job from the checkpoint image file. */
    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1; /* indicate attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */

    reply_ack(preq);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
  }  /* END process_hold_reply() */
int req_releasearray(

  void *vp) /* I */

  {
  job                  *pjob;
  job_array            *pa;
  char                 *range;
  int                   rc;
  int                   index;

  struct batch_request *preq = (struct batch_request *)vp;

  pa = get_array(preq->rq_ind.rq_release.rq_objname);

  if (pa == NULL)
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, "Cannot find array");
    return(PBSE_NONE);
    }

  while (TRUE)
    {
    if (((index = first_job_index(pa)) == -1) ||
        (pa->job_ids[index] == NULL))
      {
      unlock_ai_mutex(pa, __func__, (char *)"1", LOGLEVEL);

      return(PBSE_NONE);
      }

    if ((pjob = svr_find_job(pa->job_ids[index], FALSE)) == NULL)
      {
      free(pa->job_ids[index]);
      pa->job_ids[index] = NULL;
      }
    else
      break;
    }

  if (svr_authorize_jobreq(preq, pjob) == -1)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);

    unlock_ai_mutex(pa, __func__, (char *)"2", LOGLEVEL);
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

  range = preq->rq_extend;

  if ((range != NULL) &&
      (strstr(range, ARRAY_RANGE) != NULL))
    {
    /* parse the array range */
    if ((rc = release_array_range(pa, preq, range)) != 0)
      {
      unlock_ai_mutex(pa, __func__, (char *)"3", LOGLEVEL);

      req_reject(rc, 0, preq, NULL, NULL);

      return(PBSE_NONE);
      }
    }
  else if ((rc = release_whole_array(pa, preq)) != 0)
    {
    unlock_ai_mutex(pa, __func__, (char *)"4", LOGLEVEL);

    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  unlock_ai_mutex(pa, __func__, (char *)"5", LOGLEVEL);

  reply_ack(preq);

  return(PBSE_NONE);
  }  /* END req_releasearray() */
void purge_completed_jobs(

  struct batch_request *preq) /* I */

  {
  job    *pjob;
  char   *time_str;
  time_t  purge_time = 0;
  int     iter;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);

  purge_time = strtol(time_str, NULL, 10);

  /*
   * Clean unreported capability is only for operators and managers.
   * Check if request is authorized
   */
  if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                        ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL,
      "must have operator or manager privilege to use -c parameter");
    return;
    }

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf, "Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time, preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  reply_ack(preq);

  iter = -1;

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        sprintf(log_buf, "Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

      job_save(pjob, SAVEJOB_FULL, 0);
      }

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  return;
  }  /* END purge_completed_jobs() */
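/*
 * Illustrative note (not in the original source): the purge cutoff arrives
 * concatenated directly after the PURGECOMP keyword in the request's extend
 * string, which is why the code above advances time_str by
 * strlen(PURGECOMP) before calling strtol().  Schematically (the epoch
 * value is made up):
 *
 *   // preq->rq_extend == PURGECOMP "1400000000"
 *   time_str = preq->rq_extend + strlen(PURGECOMP);
 *   purge_time = strtol(time_str, NULL, 10);   // -> 1400000000
 */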
/*
 * delete_array_range()
 *
 * deletes a range from a specific array
 *
 * @param pa - the array whose jobs are deleted
 * @param range_str - the user-given range to delete
 * @return - the number of jobs skipped, -1 if range error
 */

int delete_array_range(

  job_array *pa,
  char      *range_str)

  {
  tlist_head          tl;
  array_request_node *rn;
  array_request_node *to_free;
  job                *pjob;
  char               *range;
  int                 i;
  int                 num_skipped = 0;
  int                 deleted;

  /* get just the numeric range specified, '=' should
   * always be there since we put it there in qdel */
  range = strchr(range_str, '=');
  range++; /* move past the '=' */

  CLEAR_HEAD(tl);

  if (parse_array_request(range, &tl) > 0)
    {
    /* don't delete jobs if range error */
    return(-1);
    }

  rn = (array_request_node *)GET_NEXT(tl);

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      /* don't stomp on other memory */
      if (i >= pa->ai_qs.array_size)
        continue;

      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
          {
          /* invalid state for request, skip */
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          continue;
          }

        pthread_mutex_unlock(pa->ai_mutex);

        deleted = attempt_delete(pjob);

        if (deleted == FALSE)
          {
          /* if the job was deleted, this mutex would be taken care of
           * elsewhere. When it fails, release it here */
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          num_skipped++;
          }

        pthread_mutex_lock(pa->ai_mutex);
        }
      }

    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);

    /* release mem */
    free(to_free);
    }

  return(num_skipped);
  }  /* END delete_array_range() */
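/*
 * Illustrative sketch (not in the original source): the range strings
 * handled above arrive as "<keyword>=<spec>", e.g. an extend string whose
 * spec part reads "1-5,7,10-20".  parse_array_request() turns the spec into
 * a list of array_request_node entries, each holding an inclusive
 * [start, end] pair:
 *
 *   tlist_head tl;
 *   CLEAR_HEAD(tl);
 *
 *   if (parse_array_request("1-5,7,10-20", &tl) == 0)
 *     {
 *     array_request_node *rn;
 *
 *     for (rn = (array_request_node *)GET_NEXT(tl);
 *          rn != NULL;
 *          rn = (array_request_node *)GET_NEXT(rn->request_tokens_link))
 *       {
 *       // rn->start .. rn->end is one sub-range, e.g. 1..5
 *       }
 *     }
 */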
int execute_job_delete(

  job                  *pjob, /* M */
  char                 *Msg,  /* I */
  struct batch_request *preq) /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  const char       *sigt = "SIGTERM";

  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob, preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pjob->ji_qs.ji_jobid);

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3)
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it
     * going - retry in one second.
     * If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the job is being
     * requeued. Wait until finished. */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */
        /* did the mom ever get it? delete it anyways... */
        cycle_check_jid[0] = '\0';
        cycle_check_when = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */
        cycle_check_jid[0] = '\0';
        cycle_check_when = 0;
        }
      }  /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */
      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq, FALSE);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /* Log delete and if requesting client is not job owner, send mail. */
  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE: should incorporate job delete message */
  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    int len = strlen(log_buf);

    snprintf(log_buf + len, sizeof(log_buf) - len, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete the job and the job delete
     * has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */
    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */
    if (pjob->ji_has_delete_nanny == TRUE)
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }
    else
      job_mutex.set_unlock_on_exit(false);

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        job_mutex.set_unlock_on_exit(false);
        return(-1);
        }

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    job_mutex.set_unlock_on_exit(false);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit_task,
        strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      job_mutex.set_unlock_on_exit(false);
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
int release_array_range(

  job_array            *pa,
  struct batch_request *preq,
  char                 *range_str)

  {
  tlist_head          tl;
  int                 i;
  int                 rc;
  job                *pjob;
  array_request_node *rn;
  array_request_node *to_free;

  char *range = strchr(range_str, '=');

  if (range == NULL)
    return(PBSE_IVALREQ);

  range++; /* move past the '=' */

  CLEAR_HEAD(tl);

  if (parse_array_request(range, &tl) > 0)
    {
    /* don't hold the jobs if range error */
    return(PBSE_IVALREQ);
    }

  /* hold just that range from the array */
  rn = (array_request_node *)GET_NEXT(tl);

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      /* don't stomp on other memory */
      if (i >= pa->ai_qs.array_size)
        continue;

      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        if ((rc = release_job(preq, pjob)))
          {
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          return(rc);
          }

        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
        }
      }

    /* release mem */
    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  return(PBSE_NONE);
  }  /* END release_array_range() */
int local_move(

  job                  *pjob,
  int                  *my_err,
  struct batch_request *req)

  {
  pbs_queue *dest_que = NULL;
  char      *destination = pjob->ji_qs.ji_destin;
  int        mtype;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  char       job_id[PBS_MAXSVRJOBID + 1];
  int        rc;
  bool       reservation = false;

  /* Sometimes multiple threads are trying to route the same job. Protect
   * against this by making sure that the destination queue and the current
   * queue are different.  If they are the same then consider it done
   * correctly */
  if (!strcmp(pjob->ji_qs.ji_queue, pjob->ji_qs.ji_destin))
    return(PBSE_NONE);

  if (LOGLEVEL >= 8)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /*
   * if being moved at specific request of administrator, then
   * checks on queue availability, etc. are skipped;
   * otherwise all checks are enforced.
   */
  if (req == 0)
    {
    mtype = MOVE_TYPE_Route; /* route */
    }
  else if (req->rq_perm & (ATR_DFLAG_MGRD | ATR_DFLAG_MGWR))
    {
    mtype = MOVE_TYPE_MgrMv; /* privileged move */
    }
  else
    {
    mtype = MOVE_TYPE_Move; /* non-privileged move */
    }

  strcpy(job_id, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  dest_que = find_queuebyname(destination);

  if (dest_que == NULL)
    {
    /* this should never happen */
    sprintf(log_buf, "queue %s does not exist\n", destination);
    log_err(-1, __func__, log_buf);

    *my_err = PBSE_UNKQUE;
    return(-1);
    }

  mutex_mgr dest_que_mutex = mutex_mgr(dest_que->qu_mutex, true);

  if ((pjob = svr_find_job(job_id, TRUE)) == NULL)
    {
    /* job disappeared while locking queue */
    return(PBSE_JOB_RECYCLED);
    }

  /* check the destination */
  if ((*my_err = svr_chkque(pjob, dest_que, get_variable(pjob, pbs_o_host), mtype, NULL)))
    {
    /* should this queue be retried? */
    return(should_retry_route(*my_err));
    }

  reservation = have_reservation(pjob, dest_que);

  /* dequeue job from present queue, update destination and */
  /* queue_rank for new queue and enqueue into destination */
  dest_que_mutex.unlock();
  rc = svr_dequejob(pjob, FALSE);

  if (rc)
    return(rc);

  snprintf(pjob->ji_qs.ji_queue, sizeof(pjob->ji_qs.ji_queue), "%s", destination);

  pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank;

  if ((*my_err = svr_enquejob(pjob, FALSE, NULL, reservation, false)) == PBSE_JOB_RECYCLED)
    return(-1);

  if (*my_err != PBSE_NONE)
    {
    return(-1); /* should never ever get here */
    }

  if (pjob != NULL)
    {
    pjob->ji_lastdest = 0; /* reset in case of another route */

    job_save(pjob, SAVEJOB_FULL, 0);
    }

  return(PBSE_NONE);
  }  /* END local_move() */
int modify_array_range(

  job_array            *pa,             /* I/O */
  char                 *range,          /* I */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc;
  int                 mom_relay = 0;
  job                *pjob;
  array_request_node *rn;
  array_request_node *to_free;

  CLEAR_HEAD(tl);

  if (parse_array_request(range, &tl) > 0)
    {
    /* don't hold the jobs if range error */
    return(FAILURE);
    }
  else
    {
    /* hold just that range from the array */
    rn = (array_request_node *)GET_NEXT(tl);

    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->job_ids[i] == NULL))
          continue;

        if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          pthread_mutex_unlock(pa->ai_mutex);

          rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

          pa = get_jobs_array(&pjob);

          if (pjob != NULL)
            {
            if (rc == PBSE_RELAYED_TO_MOM)
              {
              struct batch_request *array_req = NULL;

              /* We told modify_job not to call relay_to_mom so we need to
               * contact the mom */
              if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
                {
                return(rc);
                }

              preq->rq_refcount++;

              if (mom_relay == 0)
                {
                preq->rq_refcount++;
                }

              mom_relay++;

              /* The array_req is freed in relay_to_mom (failure)
               * or in issue_Drequest (success) */
              if ((rc = relay_to_mom(&pjob, array_req, NULL)))
                {
                snprintf(log_buf, sizeof(log_buf),
                  "Unable to relay information to mom for job '%s'\n",
                  pjob->ji_qs.ji_jobid);

                log_err(rc, __func__, log_buf);

                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

                return(rc); /* unable to get to MOM */
                }
              else
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

                post_modify_arrayreq(array_req);
                }
              }
            else
              unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }
          else
            pa->job_ids[i] = NULL;
          }
        }

      /* release mem */
      to_free = rn;
      rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;

    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  }  /* END modify_array_range() */
void finish_moving_processing(

  job                  *pjob,
  struct batch_request *req,
  int                   status)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  newstate;
  int  newsub;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buf, "bad request type %d\n", req->rq_type);
    log_err(-1, __func__, log_buf);

    return;
    }

  if (pjob == NULL)
    return;

  switch (status)
    {
    case LOCUTION_SUCCESS:

      /* purge server's job structure */
      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob);

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);
        }

      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf),
        msg_manager, req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      if (pjob != NULL)
        svr_job_purge(pjob);

      reply_ack(req);

      break;

    default:

      status = PBSE_ROUTEREJ;

      if (pjob != NULL)
        {
        /* force re-eval of job state out of Transit */
        svr_evaljobstate(*pjob, newstate, newsub, 1);
        svr_setjobstate(pjob, newstate, newsub, FALSE);

        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        }

      req_reject(status, 0, req, NULL, NULL);

      break;
    }  /* END switch (status) */
  }  /* END finish_moving_processing() */
/**
 * update_array_values()
 *
 * updates internal bookkeeping values for job arrays
 * @param pa - array to update
 * @param pjob - the pjob that an event happened on
 * @param event - code for what event just happened
 */

void update_array_values(

  job_array           *pa,        /* I */
  int                  old_state, /* I */
  enum ArrayEventsEnum event,     /* I */
  char                *job_id,
  long                 job_atr_hold,
  int                  job_exit_status)

  {
  long moab_compatible;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */
      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      if (job_exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */
      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      if (moab_compatible != FALSE)
        {
        /* only need to update if the job wasn't previously held */
        if ((job_atr_hold & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */
          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->job_ids[i] == NULL)
              continue;

            if (!strcmp(pa->job_ids[i], job_id))
              continue;

            if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
              {
              free(pa->job_ids[i]);
              pa->job_ids[i] = NULL;
              }
            else
              {
              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
                {
                pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

                if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                  {
                  pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                  }

                svr_evaljobstate(pj, &newstate, &newsub, 1);
                svr_setjobstate(pj, newstate, newsub, FALSE);
                job_save(pj, SAVEJOB_FULL, 0);

                unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);

                break;
                }

              unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
              }
            }
          }
        }

      break;

    default:

      /* log error? */
      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);
  }  /* END update_array_values() */
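/*
 * Context note (not in the original source): HOLD_l is the slot-limit hold
 * used to cap how many array subjobs run concurrently; the limit typically
 * originates from the array submission, e.g.
 *
 *   qsub -t 0-99%5 job.sh     // a hundred subjobs, at most five running
 *
 * Each aeTerminate event above releases HOLD_l on at most one held sibling
 * (note the break after the first match), which keeps the number of
 * runnable subjobs at the configured limit.
 */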
/*
 * process_request - read a batch request from the network, authenticate and
 * authorize it, and dispatch it to the proper processing function. The
 * connection-table entry is locked only long enough to copy out the fields
 * this request needs.
 */

int process_request(

  struct tcp_chan *chan) /* file descriptor (socket) to get request */

  {
  int                   rc = PBSE_NONE;
  struct batch_request *request = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  acl_enable = FALSE;
  long                  state = SV_STATE_DOWN;
  time_t                time_now = time(NULL);
  int                   free_request = TRUE;
  char                  tmpLine[MAXLINE];
  char                 *auth_err = NULL;
  enum conn_type        conn_active;
  unsigned short        conn_socktype;
  unsigned short        conn_authen;
  unsigned long         conn_addr;
  int                   sfds = chan->sock;

  pthread_mutex_lock(svr_conn[sfds].cn_mutex);

  conn_active = svr_conn[sfds].cn_active;
  conn_socktype = svr_conn[sfds].cn_socktype;
  conn_authen = svr_conn[sfds].cn_authen;
  conn_addr = svr_conn[sfds].cn_addr;
  svr_conn[sfds].cn_lasttime = time_now;

  pthread_mutex_unlock(svr_conn[sfds].cn_mutex);

  if ((request = alloc_br(0)) == NULL)
    {
    snprintf(tmpLine, sizeof(tmpLine),
      "cannot allocate memory for request from %lu",
      conn_addr);

    req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine);

    free_request = FALSE;
    rc = PBSE_SYSTEM;
    goto process_request_cleanup;
    }

  request->rq_conn = sfds;

  /*
   * Read in the request and decode it to the internal request structure.
   */

  if (conn_active == FromClientDIS || conn_active == ToServerDIS)
    {
#ifdef ENABLE_UNIX_SOCKETS
    if ((conn_socktype & PBS_SOCK_UNIX) &&
        (conn_authen != PBS_NET_CONN_AUTHENTICATED))
      {
      /* get_creds interestingly always returns 0 */
      get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname);
      }
#endif /* END ENABLE_UNIX_SOCKETS */

    rc = dis_request_read(chan, request);
    }
  else
    {
    char out[80];

    snprintf(tmpLine, MAXLINE,
      "request on invalid type of connection: %d, sock type: %d, from address %s",
      conn_active, conn_socktype, netaddr_long(conn_addr, out));
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", tmpLine);

    snprintf(tmpLine, sizeof(tmpLine),
      "request on invalid type of connection (%d) from %s",
      conn_active, netaddr_long(conn_addr, out));

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (rc == -1)
    {
    /* FAILURE */
    /* premature end of file */
    rc = PBSE_PREMATURE_EOF;
    goto process_request_cleanup;
    }

  if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE))
    {
    /* FAILURE */
    /* read error, likely cannot send reply so just disconnect */
    /* ??? not sure about this ??? */
    goto process_request_cleanup;
    }

  if (rc > 0)
    {
    /* FAILURE */
    /*
     * request didn't decode, either garbage or unknown
     * request type, in either case, return reject-reply
     */
    req_reject(rc, 0, request, NULL, "cannot decode message");

    free_request = FALSE;
    goto process_request_cleanup;
    }

  if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0)
    {
    sprintf(log_buf, "%s: %lu", pbse_to_txt(PBSE_BADHOST), conn_addr);
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf);

    snprintf(tmpLine, sizeof(tmpLine),
      "cannot determine hostname for connection from %lu",
      conn_addr);

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buf, msg_request,
      reqtype_to_txt(request->rq_type),
      request->rq_user,
      request->rq_host,
      sfds);

    log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf);
    }

  /* is the request from a host acceptable to the server? */
  if (conn_socktype & PBS_SOCK_UNIX)
    {
    strcpy(request->rq_host, server_name);
    }

  get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable);

  if (acl_enable)
    {
    /* acl enabled, check it; always allow myself and nodes */
    struct array_strings *pas = NULL;
    struct pbsnode       *isanode;

    get_svr_attr_arst(SRV_ATR_acl_hosts, &pas);
    isanode = PGetNodeFromAddr(conn_addr);

    if ((isanode == NULL) &&
        (strcmp(server_host, request->rq_host) != 0) &&
        (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0))
      {
      char tmpLine[MAXLINE];

      snprintf(tmpLine, sizeof(tmpLine),
        "request not authorized from host %s",
        request->rq_host);

      req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

      free_request = FALSE;
      rc = PBSE_BADHOST;
      goto process_request_cleanup;
      }

    if (isanode != NULL)
      unlock_node(isanode, "process_request", NULL, LOGLEVEL);
    }

  /*
   * determine source (user client or another server) of request.
   * set the permissions granted to the client
   */

  if (conn_authen == PBS_NET_CONN_FROM_PRIVIL)
    {
    /* request came from another server */
    request->rq_fromsvr = 1;
    request->rq_perm =
      ATR_DFLAG_USRD | ATR_DFLAG_USWR |
      ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
      ATR_DFLAG_SvWR;
    }
  else
    {
    /* request not from another server */
    conn_credent[sfds].timestamp = time_now;
    request->rq_fromsvr = 0;

    /*
     * The client must be authenticated by an Authenticate User Request; if
     * not, reject the request and close the connection.  The following is
     * retained for compatibility with old commands.  The exception is the
     * Connect Request, which cannot have been authenticated because it
     * contains the needed ticket, so trap it here.  There is no prior
     * authentication on the Authenticate User request either, but it comes
     * over a reserved port and appears to be from another server, hence it
     * is automatically granted authentication.
     *
     * The above is only true with inet sockets.  With unix domain sockets,
     * the user creds were read before the first dis_request_read call above.
     * We automatically grant authentication because we can trust the socket
     * creds.  Authorization is still granted in svr_get_privilege below.
     */

    if (request->rq_type == PBS_BATCH_Connect)
      {
      req_connect(request);

      if (conn_socktype == PBS_SOCK_INET)
        {
        rc = PBSE_IVALREQ;
        req_reject(rc, 0, request, NULL, NULL);

        free_request = FALSE;
        goto process_request_cleanup;
        }
      }

    if (conn_socktype & PBS_SOCK_UNIX)
      {
      pthread_mutex_lock(svr_conn[sfds].cn_mutex);
      svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED;
      pthread_mutex_unlock(svr_conn[sfds].cn_mutex);
      }

    if (ENABLE_TRUSTED_AUTH == TRUE)
      rc = PBSE_NONE;  /* bypass the authentication of the user--trust the client completely */
    else if (munge_on)
      {
      /* If munge_on is true we will validate the connection now */
      if (request->rq_type == PBS_BATCH_AltAuthenUser)
        {
        rc = req_altauthenuser(request);

        free_request = FALSE;
        goto process_request_cleanup;
        }
      else
        {
        rc = authenticate_user(request, &conn_credent[sfds], &auth_err);
        }
      }
    else if (conn_authen != PBS_NET_CONN_AUTHENTICATED)
      /* skip checking user if we did not get an authenticated credential */
      rc = PBSE_BADCRED;
    else
      rc = authenticate_user(request, &conn_credent[sfds], &auth_err);

    if (rc != 0)
      {
      req_reject(rc, 0, request, NULL, auth_err);

      if (auth_err != NULL)
        free(auth_err);

      free_request = FALSE;
      goto process_request_cleanup;
      }

    /*
     * pbs_mom and checkpoint restart scripts both need the authority to do
     * alters and releases on checkpointable jobs.  Allow manager permission
     * for root on the job's execution node.
     */

    if (((request->rq_type == PBS_BATCH_ModifyJob) ||
         (request->rq_type == PBS_BATCH_ReleaseJob)) &&
        (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0))
      {
      job  *pjob;
      char *dptr;
      int   skip = FALSE;
      char  short_host[PBS_MAXHOSTNAME + 1];

      /* make short host name */
      strcpy(short_host, request->rq_host);

      if ((dptr = strchr(short_host, '.')) != NULL)
        {
        *dptr = '\0';
        }

      if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0)
        {
        if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
          {
          /* grant manager permission when the job is checkpointable and the
           * request comes from the job's execution host */
          if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) &&
              ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) &&
              (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL))
            {
            request->rq_perm = svr_get_privilege(request->rq_user, server_host);
            skip = TRUE;
            }
          }

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      if (!skip)
        {
        request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
        }
      }
    else
      {
      request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
      }
    }  /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */

  /* if the server is shutting down, disallow new jobs and new running */
  get_svr_attr_l(SRV_ATR_State, &state);

  if (state > SV_STATE_RUN)
    {
    switch (request->rq_type)
      {
      case PBS_BATCH_AsyrunJob:
      case PBS_BATCH_JobCred:
      case PBS_BATCH_MoveJob:
      case PBS_BATCH_QueueJob:
      case PBS_BATCH_RunJob:
      case PBS_BATCH_StageIn:
      case PBS_BATCH_jobscript:

        req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL);

        rc = PBSE_SVRDOWN;
        free_request = FALSE;
        goto process_request_cleanup;

        /*NOTREACHED*/
        break;
      }
    }

  /*
   * dispatch the request to the correct processing function.
   * The processing function must call reply_send() to free
   * the request structure.
   */

  rc = dispatch_request(sfds, request);

  return(rc);

process_request_cleanup:

  if (free_request == TRUE)
    free_br(request);

  return(rc);
  }  /* END process_request() */
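/*
 * update_array_statuses()
 *
 * For each job array returned by next_array_check(), tallies the states of
 * the array's sub-jobs (pruning ids whose jobs no longer exist) and then
 * sets the state of the parent array job: running if any sub-job runs, held
 * or complete if that state dominates with nothing queued, queued otherwise.
 *
 * @param owned - an array whose ai_mutex the caller already holds, or NULL;
 *                that lock is re-taken before returning so the caller's
 *                lock state is preserved.
 */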
void update_array_statuses(

  job_array *owned)

  {
  job_array    *pa;
  job          *pj;
  job          *pjob;
  int           i;
  int           iter = -1;
  unsigned int  running;
  unsigned int  queued;
  unsigned int  held;
  unsigned int  complete;
  char          log_buf[LOCAL_LOG_BUF_SIZE];

  while ((pa = next_array_check(&iter, owned)) != NULL)
    {
    running = 0;
    queued = 0;
    held = 0;
    complete = 0;

    /* tally the states of this array's sub-jobs */
    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->job_ids[i] != NULL)
        {
        if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
          {
          /* the sub-job no longer exists - drop it from the array */
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (pj->ji_qs.ji_state == JOB_STATE_RUNNING)
            {
            running++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_QUEUED)
            {
            queued++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_HELD)
            {
            held++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_COMPLETE)
            {
            complete++;
            }

          unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
          }
        }
      }

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    if ((pjob = svr_find_job(pa->ai_qs.parent_id, TRUE)) != NULL)
      {
      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((held > 0) && (queued == 0) && (complete == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_HELD, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((complete > 0) && (queued == 0) && (held == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }

      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      }

    if (pa == owned)
      {
      lock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
  }  /* END update_array_statuses() */
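/*
 * post_signal_req()
 *
 * Completion handler invoked after a signal request has been relayed to the
 * MOM.  Rejects the original request if the MOM rejected the signal;
 * otherwise updates suspend/resume bookkeeping (JOB_SVFLG_Suspend, the state
 * character, and node assignment) and acknowledges the request.
 *
 * @param preq - the original signal batch request; NULL means the request
 *               has already been handled elsewhere.
 */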
void post_signal_req(

  batch_request *preq)

  {
  char *jobid;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  /* request has been handled elsewhere */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_REQUEST,
      preq->rq_ind.rq_signal.rq_jid,
      pbse_to_txt(PBSE_MOMREJECT));

    errno = 0;

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    }
  else
    {
    if ((jobid = preq->rq_extra) == NULL)
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory! FAILURE");
      return;
      }

    if ((pjob = svr_find_job(jobid, FALSE)) != NULL)
      {
      if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) == 0)
        {
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
          {
          pjob->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend;

          set_statechar(pjob);

          job_save(pjob, SAVEJOB_QUICK, 0);

          /* release resources allocated to suspended job - NORWAY */
          free_nodes(pjob);
          }
        }
      else if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) == 0)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend)
          {
          /* re-allocate assigned nodes to resumed job - NORWAY */
          set_old_nodes(pjob);

          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;

          set_statechar(pjob);

          job_save(pjob, SAVEJOB_QUICK, 0);
          }
        }

      unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
      }
    else
      {
      /* job is gone */
      snprintf(log_buf, sizeof(log_buf),
        "Cannot find job '%s', assuming success", jobid);

      log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    free(jobid);

    reply_ack(preq);
    }

  return;
  }  /* END post_signal_req() */
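/*
 * issue_signal()
 *
 * Builds a PBS_BATCH_SignalJob batch request for *pjob_ptr and relays it to
 * the job's MOM.  On success, func() is invoked with the new request and
 * *pjob_ptr is looked up again (the job mutex is dropped in between).  If
 * the relay fails but extend is RERUNFORCE, the job is requeued locally,
 * since no obit will arrive from the unresponsive MOM.
 *
 * A minimal calling sketch (the caller shown here is hypothetical; the name
 * post_handler and the strdup'd extra value are illustrative, not part of
 * this file):
 *
 *   char *extra = strdup("rerun");
 *   rc = issue_signal(&pjob, "SIGKILL", post_handler, extra, NULL);
 *
 * @param pjob_ptr (M) - the job to signal; may be re-resolved or set to NULL
 * @param signame  (I) - name of the signal to send
 * @param func     (I) - completion handler invoked with the signal request
 * @param extra    (I) - extra parameter to be stored in the signal request
 * @param extend   (I) - value for the extended part of the request
 */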
int issue_signal(

  job        **pjob_ptr,
  const char  *signame, /* name of the signal to send */
  void       (*func)(struct batch_request *),
  void        *extra,   /* extra parameter to be stored in sig request */
  char        *extend)  /* parameter to put in extended part of request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;
  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */
  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */
    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;
  newreq->rq_extend = extend;

  if (extend != NULL)
    {
    newreq->rq_extsz = strlen(extend);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame,
    sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) && (pjob != NULL))
    {
    strcpy(jobid, pjob->ji_qs.ji_jobid);

    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else if ((extend != NULL) && (!strcmp(extend, RERUNFORCE)))
    {
    if (pjob == NULL)
      {
      *pjob_ptr = svr_find_job((char *)jobid, TRUE);
      pjob = *pjob_ptr;
      }

    /* The job state is normally set when the obit arrives.  But since the
     * MOM is not responding we need to set the state here. */

    if (pjob != NULL)
      {
      /* We are rerunning the job: if it is not checkpointed, clear
       * "resources_used" and requeue the job. */
      if ((pjob->ji_qs.ji_svrflags &
           (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE)) == 0)
        {
        job_attr_def[JOB_ATR_resc_used].at_free(&pjob->ji_wattr[JOB_ATR_resc_used]);
        }
      else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        {
        /* non-migratable checkpoint (cray): leave it there
         * and just requeue the job */
        rel_resc(pjob);

        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

        pjob->ji_momhandle = -1;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);

        return(PBSE_SYSTEM);
        }

      rel_resc(pjob);  /* free resources assigned to the job */

      /* now re-queue the job */
      pjob->ji_modified = 1;  /* force full job save */
      pjob->ji_momhandle = -1;
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

      unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

      func(newreq);

      rc = PBSE_NONE;
      }
    else
      rc = PBSE_JOBNOTFOUND;
    }
  else
    {
    free_br(newreq);

    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */
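/*
 * req_stat_job()
 *
 * Services a batch status request naming a job, a job array, a queue, or
 * the whole server.  The request's extend field selects the "truncated" and
 * "summarize_arrays" variants, and a trailing 'C' requests condensed
 * output.  The real work of building the reply happens in
 * req_stat_job_step2().
 *
 * @param preq - pointer to the decoded status request
 * @return PBSE_NONE on success, or the PBSE_* code used to reject it
 */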
int req_stat_job(

  struct batch_request *preq) /* ptr to the decoded request */

  {
  struct stat_cntl      cntl; /* see svrfunc.h */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  bool                  condensed = false;
  enum TJobStatTypeEnum type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* FORMAT: name = { <JOBID> | <QUEUEID> | '' } */
  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */
    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */
      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }

    if (preq->rq_extend[strlen(preq->rq_extend) - 1] == 'C')
      {
      condensed = true;
      }
    }  /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */
    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */
    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  set_reply_type(&preq->rq_reply, BATCH_REPLY_CHOICE_Status);

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  if ((type == tjstTruncatedQueue) || (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  memset(&cntl, 0, sizeof(cntl));

  cntl.sc_type = (int)type;
  cntl.sc_conn = -1;
  cntl.sc_pque = pque;
  cntl.sc_origrq = preq;
  cntl.sc_post = req_stat_job_step2;
  cntl.sc_jobid[0] = '\0';  /* cause "start from beginning" */
  cntl.sc_condensed = condensed;

  req_stat_job_step2(&cntl);  /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END req_stat_job() */
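/*
 * req_holdarray()
 *
 * Places a hold on every sub-job of a job array, or only on the sub-jobs
 * named by an ARRAY_RANGE specification in the request's extend field.  The
 * requester must be authorized for the array, and holds other than HOLD_u
 * require additional privilege.
 *
 * @param vp (I) - the hold batch request (a struct batch_request *)
 */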
int req_holdarray(

  void *vp) /* I */

  {
  int                   i;
  struct batch_request *preq = (struct batch_request *)vp;
  char                 *pset;
  char                 *range_str;
  int                   rc;
  pbs_attribute         temphold;
  char                  owner[PBS_MAXUSER + 1];
  job_array            *pa;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  pa = get_array(preq->rq_ind.rq_hold.rq_orig.rq_objname);

  if (pa == NULL)
    {
    /* this shouldn't happen since we verify that this is a valid
     * array just prior to calling this function */
    req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
    return(PBSE_NONE);
    }

  get_jobowner(pa->ai_qs.owner, owner);

  if (svr_authorize_req(preq, owner, pa->ai_qs.submit_host) == -1)
    {
    sprintf(log_buf, msg_permlog,
      preq->rq_type,
      "Array",
      preq->rq_ind.rq_delete.rq_objname,
      preq->rq_user,
      preq->rq_host);

    log_event(PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_delete.rq_objname, log_buf);

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }

    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted");
    return(PBSE_NONE);
    }

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0)
    {
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }

    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  /* if anything other than HOLD_u is being set, the requester must have privilege */
  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }

    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  /* get the range of jobs to iterate over */
  range_str = preq->rq_extend;

  if ((range_str != NULL) &&
      (strstr(range_str, ARRAY_RANGE) != NULL))
    {
    if ((rc = hold_array_range(pa, range_str, &temphold)) != 0)
      {
      pthread_mutex_unlock(pa->ai_mutex);

      req_reject(rc, 0, preq, NULL, "Error in specified array range");
      return(PBSE_NONE);
      }
    }
  else
    {
    /* hold the entire array */
    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        /* the sub-job no longer exists - drop it from the array */
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        hold_job(&temphold, pjob);

        if (LOGLEVEL >= 7)
          {
          sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
          }

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
      }
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
    }

  pthread_mutex_unlock(pa->ai_mutex);

  reply_ack(preq);

  return(PBSE_NONE);
  }  /* END req_holdarray() */
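/*
 * queue_route()
 *
 * Thread entry point that services one routing queue indefinitely: under
 * reroute_job_mutex it iterates the queue's jobs and calls reroute_job() on
 * each job whose req_commit phase has finished (ji_commit_done set), then
 * sleeps route_retry_interval seconds and repeats.  It returns only if the
 * queue disappears.
 *
 * @param vp (I) - name of the queue to service (a char * owned and freed here)
 */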
void *queue_route(

  void *vp)

  {
  pbs_queue *pque;
  job       *pjob = NULL;
  char      *queue_name;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  all_jobs_iterator *iter = NULL;

  queue_name = (char *)vp;

  if (queue_name == NULL)
    {
    sprintf(log_buf, "NULL queue name");
    log_err(-1, __func__, log_buf);
    return(NULL);
    }

  while (1)
    {
    pthread_mutex_lock(reroute_job_mutex);

    /* Before we attempt to service this queue, make sure we can find it. */
    pque = find_queuebyname(queue_name);

    if (pque == NULL)
      {
      sprintf(log_buf, "Could not find queue %s", queue_name);
      log_err(-1, __func__, log_buf);

      /* release the reroute lock before bailing out */
      pthread_mutex_unlock(reroute_job_mutex);

      free(queue_name);
      return(NULL);
      }

    mutex_mgr que_mutex(pque->qu_mutex, true);

    pque->qu_jobs->lock();
    iter = pque->qu_jobs->get_iterator();
    pque->qu_jobs->unlock();

    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf),
        "routing any ready jobs in queue: %s", queue_name);
      log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf);
      }

    while ((pjob = next_job(pque->qu_jobs, iter)) != NULL)
      {
      /* We only want to try if routing has been tried at least once - this
       * is to let req_commit have the first crack at routing always. */

      if (pjob->ji_commit_done == 0) /* when req_commit is done it will set ji_commit_done to 1 */
        {
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        continue;
        }

      /* queue must be unlocked when calling reroute_job */
      que_mutex.unlock();

      reroute_job(pjob);

      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

      /* need to re-lock the queue before calling next_job */
      pque = find_queuebyname(queue_name);

      if (pque == NULL)
        {
        sprintf(log_buf, "Could not find queue %s", queue_name);
        log_err(-1, __func__, log_buf);

        /* release the reroute lock before bailing out */
        pthread_mutex_unlock(reroute_job_mutex);

        free(queue_name);
        delete iter;
        return(NULL);
        }

      que_mutex.mark_as_locked();
      }

    /* we come out of the while loop with the queue locked; we don't want
     * it locked while we sleep */
    que_mutex.unlock();
    pthread_mutex_unlock(reroute_job_mutex);

    delete iter;

    sleep(route_retry_interval);
    }

  /*NOTREACHED*/
  free(queue_name);
  return(NULL);
  }  /* END queue_route() */
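/*
 * get_parent_dest_queues()
 *
 * Releases the caller's job and parent-queue locks, then, while holding
 * allques_mutex, resolves and locks both the parent and destination queues
 * so a job move sees a consistent pair.  The job is re-acquired by id
 * before returning, since it may have vanished in the window.
 *
 * @return PBSE_NONE with *parent and *dest locked on success, -1 otherwise
 *         (*pjob_ptr is NULL if the job disappeared)
 */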
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

  {
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  strcpy(jobid, pjob->ji_qs.ji_jobid);

  if ((queue_parent_name != NULL) && (queue_dest_name != NULL))
    {
    if (!strcmp(queue_parent_name, queue_dest_name))
      {
      /* parent and destination are the same; the job is already in the
       * destination queue, so there is nothing to do */
      snprintf(log_buf, sizeof(log_buf),
        "parent and destination queues are the same: parent %s - dest %s. jobid: %s",
        queue_parent_name,
        queue_dest_name,
        jobid);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

      return(-1);
      }
    }
  else
    return(-1);

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
  unlock_queue(*parent, __func__, (char *)NULL, 0);

  *parent = NULL;
  *dest = NULL;

  pthread_mutex_lock(svr_queues.allques_mutex);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) || (index_dest < 0))
    {
    rc = -1;
    }
  else
    {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) || (pque_dest == NULL))
      {
      rc = -1;
      }
    else
      {
      /* SUCCESS! both queues are locked while allques_mutex is held */
      lock_queue(pque_parent, __func__, (char *)NULL, 0);
      lock_queue(pque_dest, __func__, (char *)NULL, 0);

      *parent = pque_parent;
      *dest = pque_dest;

      rc = PBSE_NONE;
      }
    }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
  }  /* END get_parent_dest_queues() */
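/*
 * A minimal calling sketch for get_parent_dest_queues() (illustrative only;
 * the caller and the dest_name variable shown here are hypothetical, and
 * ji_qhdr is assumed to point at the job's current queue):
 *
 *   pbs_queue *parent = pjob->ji_qhdr;   // caller holds pjob and parent locks
 *   pbs_queue *dest   = NULL;
 *
 *   if (get_parent_dest_queues(parent->qu_qs.qu_name, dest_name,
 *                              &parent, &dest, &pjob) == PBSE_NONE)
 *     {
 *     // move the job, then unlock both queues and the job
 *     }
 *   else
 *     {
 *     // on failure neither queue is locked and pjob may now be NULL
 *     }
 */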