int
job_resume(struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int rep;

    if (jp->jobSpecs.actPid)
        return 0;

    if (jobsig(jp, SIGCONT, FALSE) < 0)
        return -1;

    SBD_SET_STATE(jp, JOB_STAT_RUN);
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    rep = status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus, ERR_NO_ERROR);
    if (rep < 0)
        jp->notReported++;
    else if (jp->notReported > 0)
        jp->notReported = 0;

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC))
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s", fname,
                  lsb_jobid2str(jp->jobSpecs.jobId));

    return 0;
}
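/*
 * Illustrative sketch (editor addition, not part of the original source):
 * job_resume() above follows a "report now or defer" pattern -- when the
 * status report fails, it bumps notReported so a later sweep can retry, and
 * a successful report clears the backlog.  The standalone toy below shows
 * the same bookkeeping with hypothetical names (toy_job, report_status,
 * report_or_defer); it compiles on its own and only makes the pattern
 * explicit.
 */
#include <stdio.h>

struct toy_job { int pending_reports; };

/* Pretend transport: returns -1 on failure, 0 on success. */
static int report_status(struct toy_job *j, int ok)
{
    (void)j;
    return ok ? 0 : -1;
}

static void report_or_defer(struct toy_job *j, int transport_ok)
{
    if (report_status(j, transport_ok) < 0)
        j->pending_reports++;           /* remember to retry later    */
    else if (j->pending_reports > 0)
        j->pending_reports = 0;         /* success clears the backlog */
}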
/**
 * @brief
 *	Support function for req_stat_job() and stat_a_jobidname().
 *	Builds the status reply for a normal job, an Array job, and, if
 *	requested, all of the subjobs of the array (but not a single subjob
 *	or a range of subjobs).
 * @par
 *	Note: if dohistjobs is not set and the job is a history job, neither
 *	status nor an error is returned.  If an error return is needed, the
 *	caller must make that check.
 *
 * @param[in,out]	preq	- pointer to the stat job batch request, reply updated
 * @param[in]		pjob	- pointer to the job to be statused
 * @param[in]		dohistjobs - flag to include the job even if it is a history job
 * @param[in]		dosubjobs  - flag to expand an Array job to include all subjobs
 *
 * @return	int
 * @retval	PBSE_NONE (0)	: no error
 * @retval	non-zero	: PBS error code to return to client
 */
static int
do_stat_of_a_job(struct batch_request *preq, job *pjob, int dohistjobs, int dosubjobs)
{
    int indx;
    svrattrl *pal;
    int rc;
    struct batch_reply *preply = &preq->rq_reply;

    /* if history job and not asking for them, just return */
    if ((!dohistjobs) &&
        ((pjob->ji_qs.ji_state == JOB_STATE_FINISHED) ||
         (pjob->ji_qs.ji_state == JOB_STATE_MOVED))) {
        return (PBSE_NONE); /* just return nothing */
    }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) == 0) {
        /* this is not a subjob, go ahead and */
        /* build the status reply for this job */

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, &bad);

        /* expand the subjobs unless the parent failed a permission check */
        if (dosubjobs &&
            (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) &&
            (rc != PBSE_PERM) &&
            (pjob->ji_ajtrk != NULL)) {
            for (indx = 0; indx < pjob->ji_ajtrk->tkm_ct; ++indx) {
                rc = status_subjob(pjob, preq, pal, indx,
                                   &preply->brp_un.brp_status, &bad);
                if (rc && (rc != PBSE_PERM))
                    break;
            }
        }
        if (rc && (rc != PBSE_PERM)) {
            return (rc);
        }
    }
    return (PBSE_NONE);
}
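/*
 * Illustrative sketch (editor addition): the subjob-expansion loop in
 * do_stat_of_a_job() statuses the parent first, then each tracked subjob,
 * tolerating permission errors but stopping on anything else.  The toy
 * below mirrors that control flow with hypothetical stand-ins (status_one,
 * TOY_PERM); it is not the PBS API.
 */
enum { TOY_OK = 0, TOY_PERM = 1, TOY_FAIL = 2 };

/* Pretend per-job status call: index -1 stands for the parent itself. */
static int status_one(int idx) { return (idx == 2) ? TOY_PERM : TOY_OK; }

static int status_parent_and_subjobs(int subjob_count)
{
    int rc = status_one(-1);                 /* status the parent first */

    for (int i = 0; rc != TOY_PERM && i < subjob_count; ++i) {
        rc = status_one(i);
        if (rc && rc != TOY_PERM)
            break;                           /* hard error: stop early  */
    }

    return (rc && rc != TOY_PERM) ? rc : TOY_OK; /* PERM is not fatal   */
}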
static void
req_stat_job_step2(
    struct stat_cntl *cntl) /* I/O (free'd on return) */
{
    svrattrl              *pal;
    job                   *pjob = NULL;
    struct batch_request  *preq;
    struct batch_reply    *preply;
    int                    rc = 0;
    enum TJobStatTypeEnum  type;
    pbs_queue             *pque = NULL;
    int                    exec_only = 0;
    int                    bad = 0;
    long                   DTime; /* delta time - only report full pbs_attribute list if J->MTime > DTime */
    static svrattrl       *dpal = NULL;
    int                    job_array_index = 0;
    job_array             *pa = NULL;
    char                   log_buf[LOCAL_LOG_BUF_SIZE];
    all_jobs_iterator     *iter;

    preq   = cntl->sc_origrq;
    type   = (enum TJobStatTypeEnum)cntl->sc_type;
    preply = &preq->rq_reply;

    /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

    if (dpal == NULL)
    {
        /* build 'delta' pbs_attribute list */
        svrattrl   *tpal;
        tlist_head  dalist;
        int         aindex;

        int atrlist[] =
        {
            JOB_ATR_jobname,
            JOB_ATR_resc_used,
            JOB_ATR_LAST
        };

        CLEAR_LINK(dalist);

        for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
        {
            if ((tpal = attrlist_create("", "", 23)) == NULL)
            {
                return;
            }

            tpal->al_valln = atrlist[aindex];

            if (dpal == NULL)
                dpal = tpal;

            append_link(&dalist, &tpal->al_link, tpal);
        }
    } /* END if (dpal == NULL) */

    if (type == tjstArray)
    {
        pa = get_array(preq->rq_ind.rq_status.rq_id);

        if (pa == NULL)
        {
            req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
            return;
        }
    }

    {
        all_jobs *ajptr = NULL;

        if (type == tjstQueue)
            ajptr = cntl->sc_pque->qu_jobs;
        else if (type == tjstSummarizeArraysQueue)
            ajptr = cntl->sc_pque->qu_jobs_array_sum;
        else if (type == tjstSummarizeArraysServer)
            ajptr = &array_summary;
        else
            ajptr = &alljobs;

        ajptr->lock();
        iter = ajptr->get_iterator();
        ajptr->unlock();
    }

    /*
     * now ready for part 3, building the status reply,
     * loop through again
     */

    if ((type == tjstSummarizeArraysQueue) ||
        (type == tjstSummarizeArraysServer))
    {
        /* No array can be owned for these options */
        update_array_statuses();
    }

    if (type == tjstJob)
        pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
    else if (type == tjstQueue)
        pjob = next_job(cntl->sc_pque->qu_jobs, iter);
    else if (type == tjstSummarizeArraysQueue)
        pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
    else if (type == tjstSummarizeArraysServer)
        pjob = next_job(&array_summary, iter);
    else if (type == tjstArray)
    {
        job_array_index = -1;
        pjob = NULL;

        /* increment job_array_index until we find a non-null pointer or hit the end */
        while (++job_array_index < pa->ai_qs.array_size)
        {
            if (pa->job_ids[job_array_index] != NULL)
            {
                if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                    break;
                }
            }
        }
    }
    else
        pjob = next_job(&alljobs, iter);

    DTime = 0;

    if (preq->rq_extend != NULL)
    {
        char *ptr;

        /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

        if (strstr(preq->rq_extend, EXECQUEONLY))
            exec_only = 1;

        ptr = strstr(preq->rq_extend, "DELTA:");

        if (ptr != NULL)
        {
            ptr += strlen("delta:");

            DTime = strtol(ptr, NULL, 10);
        }
    }

    if ((type == tjstTruncatedServer) ||
        (type == tjstTruncatedQueue))
    {
        long                 sentJobCounter;
        long                 qjcounter;
        long                 qmaxreport;
        all_queues_iterator *iter = NULL;

        svr_queues.lock();
        iter = svr_queues.get_iterator();
        svr_queues.unlock();

        /* loop through all queues */
        while ((pque = next_queue(&svr_queues, iter)) != NULL)
        {
            qjcounter = 0;

            if ((exec_only == 1) &&
                (pque->qu_qs.qu_type != QTYPE_Execution))
            {
                /* ignore routing queues */
                unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
                continue;
            }

            if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
                (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
            {
                qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
            }
            else
            {
                qmaxreport = TMAX_JOB;
            }

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
                        qmaxreport,
                        pque->qu_qs.qu_name);

                log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
            }

            sentJobCounter = 0;

            /* loop through jobs in queue */
            if (pjob != NULL)
                unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

            all_jobs_iterator *jobiter = NULL;
            pque->qu_jobs->lock();
            jobiter = pque->qu_jobs->get_iterator();
            pque->qu_jobs->unlock();

            while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
            {
                if ((qjcounter >= qmaxreport) &&
                    (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
                {
                    /* max_report of queued jobs reached for queue */
                    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
                    continue;
                }

                pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

                rc = status_job(
                         pjob,
                         preq,
                         (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
                         &preply->brp_un.brp_status,
                         &bad);

                if ((rc != 0) && (rc != PBSE_PERM))
                {
                    req_reject(rc, bad, preq, NULL, NULL);

                    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
                    unlock_queue(pque, __func__, "perm", LOGLEVEL);

                    delete iter;

                    return;
                }

                sentJobCounter++;

                if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
                    qjcounter++;

                unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
            } /* END foreach (pjob from pque) */

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
                        sentJobCounter,
                        pque->qu_qs.qu_name);

                log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
            }

            unlock_queue(pque, __func__, "end while", LOGLEVEL);
        } /* END for (pque) */

        reply_send_svr(preq);

        delete iter;

        return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

    while (pjob != NULL)
    {
        /* go ahead and build the status reply for this job */

        if (exec_only)
        {
            if (cntl->sc_pque != NULL)
            {
                if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
                    goto nextjob;
            }
            else
            {
                if (pa != NULL)
                    pthread_mutex_unlock(pa->ai_mutex);

                pque = get_jobs_queue(&pjob);

                if (pa != NULL)
                    pthread_mutex_lock(pa->ai_mutex);

                if ((pjob == NULL) ||
                    (pque == NULL))
                    goto nextjob;

                mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

                if (pque->qu_qs.qu_type != QTYPE_Execution)
                {
                    goto nextjob;
                }
            }
        }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
                 pjob,
                 preq,
                 pal,
                 &preply->brp_un.brp_status,
                 &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
        {
            if (pa != NULL)
            {
                unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }

            unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

            req_reject(rc, bad, preq, NULL, NULL);

            delete iter;

            return;
        }

        /* get next job */

nextjob:

        if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

        if (type == tjstJob)
            break;

        if (type == tjstQueue)
            pjob = next_job(cntl->sc_pque->qu_jobs, iter);
        else if (type == tjstSummarizeArraysQueue)
            pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
        else if (type == tjstSummarizeArraysServer)
            pjob = next_job(&array_summary, iter);
        else if (type == tjstArray)
        {
            pjob = NULL;

            /* increment job_array_index until we find a non-null pointer or hit the end */
            while (++job_array_index < pa->ai_qs.array_size)
            {
                if (pa->job_ids[job_array_index] != NULL)
                {
                    if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                    {
                        break;
                    }
                }
            }
        }
        else
            pjob = next_job(&alljobs, iter);

        rc = 0;
    } /* END while (pjob != NULL) */

    delete iter;

    if (pa != NULL)
    {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

    reply_send_svr(preq);

    if (LOGLEVEL >= 7)
    {
        log_event(PBSEVENT_SYSTEM,
                  PBS_EVENTCLASS_JOB,
                  "req_statjob",
                  "Successfully returned the status of queued jobs\n");
    }

    return;
} /* END req_stat_job_step2() */
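/*
 * Illustrative sketch (editor addition): this version of
 * req_stat_job_step2() (and the variants below) parses the request's
 * rq_extend string for the "DELTA:<EPOCHTIME>" token with
 * strstr()/strtol().  The standalone helper below shows that parse in
 * isolation; parse_delta() is a hypothetical name, not a TORQUE function.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the epoch time following "DELTA:", or 0 if the token is absent. */
static long parse_delta(const char *extend)
{
    const char *ptr;

    if (extend == NULL)
        return 0;

    ptr = strstr(extend, "DELTA:");
    if (ptr == NULL)
        return 0;

    return strtol(ptr + strlen("DELTA:"), NULL, 10);
}

int main(void)
{
    printf("%ld\n", parse_delta("EXECQONLY DELTA:1700000000")); /* 1700000000 */
    return 0;
}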
static void
req_stat_job_step2(
    struct stat_cntl *cntl) /* I/O (free'd on return) */
{
    svrattrl              *pal;
    job                   *pjob = NULL;
    struct batch_request  *preq;
    struct batch_reply    *preply;
    int                    rc = 0;
    enum TJobStatTypeEnum  type;
    pbs_queue             *pque = NULL;
    int                    exec_only = 0;
    int                    bad = 0;
    long                   DTime; /* delta time - only report full pbs_attribute list if J->MTime > DTime */
    static svrattrl       *dpal = NULL;
    int                    job_array_index = 0;
    job_array             *pa = NULL;
    char                   log_buf[LOCAL_LOG_BUF_SIZE];
    int                    iter;
    time_t                 time_now = time(NULL);
    long                   poll_jobs = 0;
    char                   job_id[PBS_MAXSVRJOBID + 1];
    int                    job_substate = -1;
    time_t                 job_momstattime = -1;

    preq   = cntl->sc_origrq;
    type   = (enum TJobStatTypeEnum)cntl->sc_type;
    preply = &preq->rq_reply;

    /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

    if (dpal == NULL)
    {
        /* build 'delta' pbs_attribute list */
        svrattrl   *tpal;
        tlist_head  dalist;
        int         aindex;

        int atrlist[] =
        {
            JOB_ATR_jobname,
            JOB_ATR_resc_used,
            JOB_ATR_LAST
        };

        CLEAR_LINK(dalist);

        for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
        {
            if ((tpal = attrlist_create("", "", 23)) == NULL)
            {
                return;
            }

            tpal->al_valln = atrlist[aindex];

            if (dpal == NULL)
                dpal = tpal;

            append_link(&dalist, &tpal->al_link, tpal);
        }
    } /* END if (dpal == NULL) */

    if (type == tjstArray)
    {
        pa = get_array(preq->rq_ind.rq_status.rq_id);

        if (pa == NULL)
        {
            req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
            return;
        }
    }

    iter = -1;

    get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

    if (!poll_jobs)
    {
        /* polljobs not set - indicates we may need to obtain fresh data from MOM */

        if (cntl->sc_jobid[0] == '\0')
            pjob = NULL;
        else
            pjob = svr_find_job(cntl->sc_jobid, FALSE);

        while (1)
        {
            if (pjob == NULL)
            {
                /* start from the first job */

                if (type == tjstJob)
                {
                    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
                }
                else if (type == tjstQueue)
                {
                    pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
                }
                else if (type == tjstArray)
                {
                    job_array_index = 0;

                    /* increment job_array_index until we find a non-null pointer or hit the end */
                    while (job_array_index < pa->ai_qs.array_size)
                    {
                        if (pa->job_ids[job_array_index] != NULL)
                        {
                            if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                            {
                                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                                break;
                            }
                        }

                        job_array_index++;
                    }
                }
                else
                {
                    pjob = next_job(&alljobs, &iter);
                }
            } /* END if (pjob == NULL) */
            else
            {
                strcpy(job_id, pjob->ji_qs.ji_jobid);
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

                if (type == tjstJob)
                    break;

                if (type == tjstQueue)
                    pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
                else if (type == tjstArray)
                {
                    pjob = NULL;

                    /* increment job_array_index until we find a non-null pointer or hit the end */
                    while (++job_array_index < pa->ai_qs.array_size)
                    {
                        if (pa->job_ids[job_array_index] != NULL)
                        {
                            if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                            {
                                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                                break;
                            }
                        }
                    }
                }
                else
                    pjob = next_job(&alljobs, &iter);
            }

            if (pjob == NULL)
                break;

            strcpy(job_id, pjob->ji_qs.ji_jobid);
            job_substate = pjob->ji_qs.ji_substate;
            job_momstattime = pjob->ji_momstat;
            strcpy(cntl->sc_jobid, job_id);
            unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
            pjob = NULL;

            /* PBS_RESTAT_JOB defaults to 30 seconds */
            if ((job_substate == JOB_SUBSTATE_RUNNING) &&
                ((time_now - job_momstattime) > JobStatRate))
            {
                /* go to MOM for status */
                if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
                    break;

                if (rc != 0)
                {
                    pjob = svr_find_job(job_id, FALSE);

                    rc = 0;

                    continue;
                }

                if (pa != NULL)
                    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

                return; /* will pick up after mom replies */
            }
        } /* END while(1) */

        if (rc != 0)
        {
            if (pa != NULL)
                unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

            reply_free(preply);

            req_reject(rc, 0, preq, NULL, "cannot get update from mom");

            return;
        }
    } /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

    /*
     * now ready for part 3, building the status reply,
     * loop through again
     */

    if ((type == tjstSummarizeArraysQueue) ||
        (type == tjstSummarizeArraysServer))
    {
        /* No array can be owned for these options */
        update_array_statuses();
    }

    if (type == tjstJob)
        pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
    else if (type == tjstQueue)
        pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
    else if (type == tjstSummarizeArraysQueue)
        pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
    else if (type == tjstSummarizeArraysServer)
        pjob = next_job(&array_summary, &iter);
    else if (type == tjstArray)
    {
        job_array_index = -1;
        pjob = NULL;

        /* increment job_array_index until we find a non-null pointer or hit the end */
        while (++job_array_index < pa->ai_qs.array_size)
        {
            if (pa->job_ids[job_array_index] != NULL)
            {
                if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                    break;
                }
            }
        }
    }
    else
        pjob = next_job(&alljobs, &iter);

    DTime = 0;

    if (preq->rq_extend != NULL)
    {
        char *ptr;

        /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

        if (strstr(preq->rq_extend, EXECQUEONLY))
            exec_only = 1;

        ptr = strstr(preq->rq_extend, "DELTA:");

        if (ptr != NULL)
        {
            ptr += strlen("delta:");

            DTime = strtol(ptr, NULL, 10);
        }
    }

    if ((type == tjstTruncatedServer) ||
        (type == tjstTruncatedQueue))
    {
        long sentJobCounter;
        long qjcounter;
        long qmaxreport;
        int  iter = -1;

        /* loop through all queues */
        while ((pque = next_queue(&svr_queues, &iter)) != NULL)
        {
            qjcounter = 0;

            if ((exec_only == 1) &&
                (pque->qu_qs.qu_type != QTYPE_Execution))
            {
                /* ignore routing queues */
                unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
                continue;
            }

            if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
                (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
            {
                qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
            }
            else
            {
                qmaxreport = TMAX_JOB;
            }

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
                        qmaxreport,
                        pque->qu_qs.qu_name);

                log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
            }

            sentJobCounter = 0;

            /* loop through jobs in queue */
            if (pjob != NULL)
                unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

            iter = -1;

            while ((pjob = next_job(pque->qu_jobs, &iter)) != NULL)
            {
                if ((qjcounter >= qmaxreport) &&
                    (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
                {
                    /* max_report of queued jobs reached for queue */
                    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
                    continue;
                }

                pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

                rc = status_job(
                         pjob,
                         preq,
                         (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
                         &preply->brp_un.brp_status,
                         &bad);

                if ((rc != 0) && (rc != PBSE_PERM))
                {
                    req_reject(rc, bad, preq, NULL, NULL);

                    if (pa != NULL)
                    {
                        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
                    }

                    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
                    unlock_queue(pque, __func__, "perm", LOGLEVEL);

                    return;
                }

                sentJobCounter++;

                if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
                    qjcounter++;

                unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
            } /* END foreach (pjob from pque) */

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
                        sentJobCounter,
                        pque->qu_qs.qu_name);

                log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
            }

            unlock_queue(pque, __func__, "end while", LOGLEVEL);
        } /* END for (pque) */

        if (pa != NULL)
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        reply_send_svr(preq);

        return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

    while (pjob != NULL)
    {
        /* go ahead and build the status reply for this job */

        if (exec_only)
        {
            if (cntl->sc_pque != NULL)
            {
                if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
                    goto nextjob;
            }
            else
            {
                if (pa != NULL)
                    pthread_mutex_unlock(pa->ai_mutex);

                pque = get_jobs_queue(&pjob);

                if (pa != NULL)
                    pthread_mutex_lock(pa->ai_mutex);

                if ((pjob == NULL) ||
                    (pque == NULL))
                    goto nextjob;

                if (pque->qu_qs.qu_type != QTYPE_Execution)
                {
                    unlock_queue(pque, __func__, "not exec", LOGLEVEL);
                    goto nextjob;
                }

                unlock_queue(pque, __func__, "exec", LOGLEVEL);
            }
        }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
                 pjob,
                 preq,
                 pal,
                 &preply->brp_un.brp_status,
                 &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
        {
            if (pa != NULL)
            {
                unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }

            unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

            req_reject(rc, bad, preq, NULL, NULL);

            return;
        }

        /* get next job */

nextjob:

        if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

        if (type == tjstJob)
            break;

        if (type == tjstQueue)
            pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
        else if (type == tjstSummarizeArraysQueue)
            pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
        else if (type == tjstSummarizeArraysServer)
            pjob = next_job(&array_summary, &iter);
        else if (type == tjstArray)
        {
            pjob = NULL;

            /* increment job_array_index until we find a non-null pointer or hit the end */
            while (++job_array_index < pa->ai_qs.array_size)
            {
                if (pa->job_ids[job_array_index] != NULL)
                {
                    if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                    {
                        break;
                    }
                }
            }
        }
        else
            pjob = next_job(&alljobs, &iter);

        rc = 0;
    } /* END while (pjob != NULL) */

    if (pa != NULL)
    {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

    reply_send_svr(preq);

    if (LOGLEVEL >= 7)
    {
        log_event(PBSEVENT_SYSTEM,
                  PBS_EVENTCLASS_JOB,
                  "req_statjob",
                  "Successfully returned the status of queued jobs\n");
    }

    return;
} /* END req_stat_job_step2() */
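/*
 * Illustrative sketch (editor addition): when poll_jobs is off, the version
 * above refreshes a running job's status from its MOM only if the cached
 * data is older than JobStatRate (PBS_RESTAT_JOB, 30 seconds by default).
 * The predicate below isolates that decision; needs_mom_refresh() and
 * TOY_JOB_STAT_RATE are hypothetical names.
 */
#include <time.h>

#define TOY_JOB_STAT_RATE 30 /* seconds, mirrors the PBS_RESTAT_JOB default */

static int needs_mom_refresh(int running, time_t now, time_t last_momstat)
{
    /* only running jobs with stale MOM data are worth a round trip */
    return running && (now - last_momstat) > TOY_JOB_STAT_RATE;
}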
/**
 * @brief
 *	status_subjob - status a single subjob (of an Array Job)
 *	Works by statusing the parent unless the subjob is actually running.
 *
 * @param[in,out]	pjob	- pointer to the parent Array job
 * @param[in]		preq	- request structure
 * @param[in]		pal	- specific attributes to status
 * @param[in]		subj	- index [n] of the subjob to status
 * @param[in,out]	pstathd	- RETURN: head of list to append status to
 * @param[out]		bad	- RETURN: index of first bad attribute
 *
 * @return	int
 * @retval	0	: success
 * @retval	PBSE_PERM	: client is not authorized to status the job
 * @retval	PBSE_SYSTEM	: memory allocation error
 * @retval	PBSE_IVALREQ	: something wrong with the flags
 */
int
status_subjob(job *pjob, struct batch_request *preq, svrattrl *pal, int subj,
              pbs_list_head *pstathd, int *bad)
{
    int limit = (int)JOB_ATR_LAST;
    struct brp_status *pstat;
    job *psubjob; /* ptr to job to status */
    char realstate;
    int rc = 0;
    int oldeligflags = 0;
    int oldatypflags = 0;
    int subjob_state = -1;
    char *old_subjob_comment = NULL;

    /* see if the client is authorized to status this job */
    if (!server.sv_attr[(int)SRV_ATR_query_others].at_val.at_long)
        if (svr_authorize_jobreq(preq, pjob))
            return (PBSE_PERM);

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) == 0)
        return PBSE_IVALREQ;

    /* if subjob is running, use real job structure */
    if (get_subjob_state(pjob, subj) == JOB_STATE_RUNNING) {
        psubjob = find_job(mk_subjob_id(pjob, subj));
        if (psubjob)
            status_job(psubjob, preq, pal, pstathd, bad);
        return 0;
    }

    /* otherwise we fake it with info from the parent */
    /* allocate reply structure and fill in header portion */

    /*
     * for the general case, we don't want to include the parent's
     * array-related attributes as they belong only to the Array
     */
    if (pal == NULL)
        limit = JOB_ATR_array;

    pstat = (struct brp_status *)malloc(sizeof(struct brp_status));
    if (pstat == (struct brp_status *)0)
        return (PBSE_SYSTEM);

    CLEAR_LINK(pstat->brp_stlink);
    pstat->brp_objtype = MGR_OBJ_JOB;
    (void)strcpy(pstat->brp_objname, mk_subjob_id(pjob, subj));
    CLEAR_HEAD(pstat->brp_attr);
    append_link(pstathd, &pstat->brp_stlink, pstat);

    /* add attributes to the status reply */
    *bad = 0;

    /*
     * fake the job state and comment by setting the parent job's state
     * and comment to that of the subjob
     */
    subjob_state = get_subjob_state(pjob, subj);
    realstate = pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char;
    pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = statechars[subjob_state];
    pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

    if (subjob_state == JOB_STATE_EXPIRED || subjob_state == JOB_STATE_FINISHED) {
        if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FINISHED) {
            if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
                old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
                if (old_subjob_comment == (char *)0)
                    return (PBSE_SYSTEM);
            }
            if (job_attr_def[(int)JOB_ATR_Comment].at_decode(
                    &pjob->ji_wattr[(int)JOB_ATR_Comment],
                    (char *)0, (char *)0, "Subjob finished") == PBSE_SYSTEM) {
                free(old_subjob_comment);
                return (PBSE_SYSTEM);
            }
        } else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FAILED) {
            if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
                old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
                if (old_subjob_comment == (char *)0)
                    return (PBSE_SYSTEM);
            }
            if (job_attr_def[(int)JOB_ATR_Comment].at_decode(
                    &pjob->ji_wattr[(int)JOB_ATR_Comment],
                    (char *)0, (char *)0, "Subjob failed") == PBSE_SYSTEM) {
                free(old_subjob_comment);
                return (PBSE_SYSTEM);
            }
        } else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_TERMINATED) {
            if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
                old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
                if (old_subjob_comment == (char *)0)
                    return (PBSE_SYSTEM);
            }
            if (job_attr_def[(int)JOB_ATR_Comment].at_decode(
                    &pjob->ji_wattr[(int)JOB_ATR_Comment],
                    (char *)0, (char *)0, "Subjob terminated") == PBSE_SYSTEM) {
                free(old_subjob_comment);
                return (PBSE_SYSTEM);
            }
        }
    }

    /*
     * when eligible_time_enable is off,
     * clear the set flag so that eligible_time and accrue_type don't show
     */
    if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0) {
        oldeligflags = pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags;
        pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags &= ~ATR_VFLAG_SET;
        pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;

        oldatypflags = pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags;
        pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags &= ~ATR_VFLAG_SET;
        pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags |= ATR_VFLAG_MODCACHE;

        /*
         * Note: ATR_VFLAG_MODCACHE must be set because svr_cached() does
         * not correctly check ATR_VFLAG_SET
         */
    }

    if (status_attrib(pal, job_attr_def, pjob->ji_wattr, limit,
                      preq->rq_perm, &pstat->brp_attr, bad))
        rc = PBSE_NOATTR;

    /* Set the parent state back to what it really is */
    pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = realstate;
    pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

    /* Set the parent comment back to what it really is */
    if (old_subjob_comment != NULL) {
        if (job_attr_def[(int)JOB_ATR_Comment].at_decode(
                &pjob->ji_wattr[(int)JOB_ATR_Comment],
                (char *)0, (char *)0, old_subjob_comment) == PBSE_SYSTEM) {
            free(old_subjob_comment);
            return (PBSE_SYSTEM);
        }
        free(old_subjob_comment);
    }

    /* reset the flags */
    if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0) {
        pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags = oldeligflags;
        pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags = oldatypflags;
    }

    return (rc);
}
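/*
 * Illustrative sketch (editor addition): status_subjob() "fakes" the
 * parent's state/comment attributes to the subjob's values, runs the
 * generic attribute encoder, then restores the originals before returning.
 * The toy below shows that save/override/restore discipline on a plain
 * struct; all names are hypothetical.
 */
#include <stdio.h>

struct toy_attr { char state; };

static void encode(const struct toy_attr *a) { printf("state=%c\n", a->state); }

static void status_with_faked_state(struct toy_attr *parent, char sub_state)
{
    char saved = parent->state;   /* save the real value           */

    parent->state = sub_state;    /* present the subjob's state    */
    encode(parent);               /* encoder only sees the fake    */
    parent->state = saved;        /* always restore before return  */
}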
static void
chkpntEnd(struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    int savePid, saveStatus;

    if (IS_SUSP(jobCard->jobSpecs.jStatus) &&
        !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG))
        jobsig(jobCard, SIGSTOP, TRUE);

    saveStatus = jobCard->jobSpecs.jStatus;

    if (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) {
        if (w_status == 0) {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            } else if (jobCard->notReported == 0)
                return;

            if (jobCard->cleanupPid == 0) {
                if ((jobCard->cleanupPid = rmJobBufFilesPid(jobCard)) > 0)
                    return;
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }
            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        } else {
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        }
    }

    savePid = jobCard->jobSpecs.actPid;

    if (status_job(BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                   w_status == 0 ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        jobCard->jobSpecs.actPid = savePid;
        jobCard->jobSpecs.jStatus = saveStatus;
    } else {
        jobCard->lastChkpntTime = now;
        jobCard->jobSpecs.actPid = 0;
        jobCard->actStatus = ACT_NO;
        jobCard->jobSpecs.actValue = SIG_NULL;

        if (w_status == 0) {
            jobCard->migCnt = 1;
        }

        if (saveStatus & JOB_STAT_MIG) {
            if (w_status == 0) {
                cleanupMigJob(jobCard);
                deallocJobCard(jobCard);
                *freed = TRUE;
            } else
                jobCard->migCnt *= 2;
        }
    }
}
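/*
 * Illustrative sketch (editor addition): chkpntEnd() resets migCnt to 1
 * after a successful migration checkpoint and doubles it on failure, giving
 * an exponential backoff between migration attempts.  The helper below
 * shows just that update rule; next_mig_cnt() is a hypothetical name.
 */
static int next_mig_cnt(int mig_cnt, int succeeded)
{
    return succeeded ? 1 : mig_cnt * 2; /* double the wait on failure */
}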
void
do_jobSetup(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char fname[] = "do_jobSetup()";
    struct jobSetup jsetup;
    struct jobCard *jp = NULL;
    char found = FALSE;
    struct jobCard savejp;

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG, "%s: Entering ...", fname);

    if (!xdr_jobSetup(xdrs, &jsetup, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSetup");
        return;
    }

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jsetup.jobId)
            continue;
        found = TRUE;
        break;
    }

    if (found == FALSE) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5838,
            "%s: Job <%s> is not found"), /* catgets 5838 */
            fname, lsb_jobid2str(jsetup.jobId));
        replyHdrWithRC(LSBE_NO_JOB, chfd, jsetup.jobId);
        return;
    }

    if (jp->jobSpecs.actPid)
        return;

    memcpy((char *)&savejp, (char *)jp, sizeof(savejp));

    jp->execJobFlag |= JOB_EXEC_QPRE_KNOWN;
    if (jsetup.execJobFlag & JOB_EXEC_QPRE_OK)
        jp->execJobFlag |= JOB_EXEC_QPRE_OK;

    jp->jobSpecs.jobPid = jsetup.jobPid;
    jp->jobSpecs.jobPGid = jsetup.jobPGid;
    jp->jobSpecs.execUid = jsetup.execUid;
    strcpy(jp->jobSpecs.execUsername, jsetup.execUsername);
    jp->execGid = jsetup.execGid;
    strcpy(jp->execUsername, jsetup.execUsername);
    strcpy(jp->jobSpecs.execCwd, jsetup.execCwd);
    strcpy(jp->jobSpecs.execHome, jsetup.execHome);

    if (jsetup.jStatus & JOB_STAT_RUN) {
        if (!(jsetup.jStatus & JOB_STAT_PRE_EXEC))
            jp->jobSpecs.jStatus &= ~JOB_STAT_PRE_EXEC;
        if (status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                       ERR_NO_ERROR) < 0) {
            memcpy((char *)jp, (char *)&savejp, sizeof(savejp));
            return;
        }
        jp->execJobFlag |= JOB_EXEC_STARTED;
    } else {
        jp->jobSpecs.reasons = jsetup.reason;
        jp->collectedChild = TRUE;
        jp->notReported = 0;
        jp->exitPid = -1;
        jp->needReportRU = FALSE;
        jp->jobSpecs.jStatus = jsetup.jStatus;
        jp->w_status = jsetup.w_status;
        jp->lsfRusage = jsetup.lsfRusage;
        jp->cpuTime = jsetup.cpuTime;
        if (job_finish(jp, TRUE) < 0) {
            memcpy((char *)jp, (char *)&savejp, sizeof(savejp));
            return;
        }
    }

    if (replyHdrWithRC(LSBE_NO_ERROR, chfd, jsetup.jobId) < 0) {
        ls_syslog(LOG_DEBUG, "%s: Reply header failed for job <%s>",
                  fname, lsb_jobid2str(jsetup.jobId));
    }

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG1,
            "%s: JobId %s jstatus %d reason %x jobPid %d jobPGid %d execUid %d execGid <%d> execUser <%s> execHome <%s> execCwd <%s> execJobFlag %x cpuTime %f w_status %d",
            fname, lsb_jobid2str(jsetup.jobId), jsetup.jStatus, jsetup.reason,
            jsetup.jobPid, jsetup.jobPGid, jsetup.execUid, jsetup.execGid,
            jsetup.execUsername, jsetup.execHome, jsetup.execCwd,
            jsetup.execJobFlag, jsetup.cpuTime, jsetup.w_status);
}
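/*
 * Illustrative sketch (editor addition): do_jobSetup() above (and
 * do_newjob() below) find a job by scanning the circular, doubly linked
 * job queue whose head sentinel is jobQueHead.  The toy below reproduces
 * that traversal over a self-contained list; all names are hypothetical.
 */
#include <stddef.h>

struct toy_card { long job_id; struct toy_card *forw; };

/* head is a sentinel: the scan stops when it comes back around to it. */
static struct toy_card *find_card(struct toy_card *head, long job_id)
{
    for (struct toy_card *jp = head->forw; jp != head; jp = jp->forw)
        if (jp->job_id == job_id)
            return jp;
    return NULL;
}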
void
do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char fname[] = "do_newjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSpecs jobSpecs;
    struct jobReply jobReply;
    struct jobCard *jp;
    sbdReplyType reply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
        goto sendReply;
    }

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId == jobSpecs.jobId) {
            jobReply.jobId = jp->jobSpecs.jobId;
            jobReply.jobPid = jp->jobSpecs.jobPid;
            jobReply.jobPGid = jp->jobSpecs.jobPGid;
            jobReply.jStatus = jp->jobSpecs.jStatus;
            reply = ERR_NO_ERROR;
            goto sendReply;
        }
    }

    jp = calloc(1, sizeof(struct jobCard));
    if (jp == NULL) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "calloc");
        reply = ERR_MEM;
        goto sendReply;
    }

    memcpy((char *)&jp->jobSpecs, (char *)&jobSpecs, sizeof(struct jobSpecs));

    jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    jp->jobSpecs.startTime = now;
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    /* Initialize the core number */
    jp->core_num = -1;

    if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
        if (lockHosts(jp) < 0) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId), "lockHosts");
            unlockHosts(jp, jp->jobSpecs.numToHosts);
            reply = ERR_LOCK_FAIL;
            freeWeek(jp->week);
            FREEUP(jp);
            goto sendReply;
        }
    }

    jp->runTime = 0;

    if (initJobCard(jp, &jobSpecs, (int *)&reply) < 0) {
        if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts(jp, jp->jobSpecs.numToHosts);
        }
        FREEUP(jp);
        goto sendReply;
    }

    jp->execJobFlag = 0;

    if (jp->runTime < 0) {
        jp->runTime = 0;
    }

    jp->execGid = 0;
    jp->execUsername[0] = '\0';
    jp->jobSpecs.execUid = -1;
    jp->jobSpecs.execUsername[0] = '\0';

    if (jp->jobSpecs.jobSpoolDir[0] != '\0') {
        char *tmp;

        if ((tmp = getUnixSpoolDir(jp->jobSpecs.jobSpoolDir)) == NULL) {
            jp->jobSpecs.jobSpoolDir[0] = '\0';
        }
    }

    if ((logclass & LC_TRACE) && jp->jobSpecs.jobSpoolDir[0] != 0) {
        ls_syslog(LOG_DEBUG, "%s: the SpoolDir for job <%s> is %s \n",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.jobSpoolDir);
    }

    if (jp->jobSpecs.options & SUB_PRE_EXEC)
        SBD_SET_STATE(jp, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC))
    else
        SBD_SET_STATE(jp, JOB_STAT_RUN);

    reply = job_exec(jp, chfd);
    if (reply != ERR_NO_ERROR) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "job_exec");
        if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts(jp, jp->jobSpecs.numToHosts);
        }
        deallocJobCard(jp);
    } else {
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
    }

sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    replyStruct = (reply == ERR_NO_ERROR) ? (char *)&jobReply : (char *)NULL;

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply");
        lsb_merr(_i18n_msg_get(ls_catd, NL_SETN, 5804,
            "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5805,
            "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */
            fname, XDR_GETPOS(&xdrs2));
    }

    xdr_destroy(&xdrs2);

    if (reply == ERR_NO_ERROR &&
        !daemonParams[LSB_BSUBI_OLD].paramValue &&
        PURE_INTERACTIVE(&jp->jobSpecs)) {
        if (status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                       ERR_NO_ERROR) < 0) {
            jp->notReported++;
        }
    }
}
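/*
 * Illustrative sketch (editor addition): the reply path above serializes
 * the header and jobReply into a stack buffer through a memory-backed XDR
 * stream before writing it to the channel.  The standalone example below
 * uses the classic SunRPC xdr(3) calls (xdrmem_create/xdr_int/xdr_getpos/
 * xdr_destroy) to encode one integer the same way; on modern Linux it may
 * need libtirpc rather than glibc's built-in RPC.
 */
#include <rpc/rpc.h>
#include <stdio.h>

int main(void)
{
    char buf[64];
    XDR  xdrs;
    int  opcode = 0; /* stand-in for replyHdr.opCode */

    xdrmem_create(&xdrs, buf, sizeof(buf), XDR_ENCODE);
    if (!xdr_int(&xdrs, &opcode))
        return 1; /* encode failed */
    printf("encoded %u bytes\n", xdr_getpos(&xdrs));
    xdr_destroy(&xdrs);
    return 0;
}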
void
req_stat_job_step2(
    struct stat_cntl *cntl) /* I/O (free'd on return) */
{
    batch_request         *preq = cntl->sc_origrq;
    svrattrl              *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
    job                   *pjob = NULL;
    struct batch_reply    *preply = &preq->rq_reply;
    int                    rc = 0;
    enum TJobStatTypeEnum  type = (enum TJobStatTypeEnum)cntl->sc_type;
    bool                   exec_only = false;
    int                    bad = 0;
    /* delta time - only report full pbs_attribute list if J->MTime > DTime */
    int                    job_array_index = -1;
    job_array             *pa = NULL;
    all_jobs_iterator     *iter;

    if (preq->rq_extend != NULL)
    {
        /* FORMAT: { EXECQONLY } */
        if (strstr(preq->rq_extend, EXECQUEONLY))
            exec_only = true;
    }

    if ((type == tjstTruncatedServer) ||
        (type == tjstTruncatedQueue))
    {
        handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

        return;
    } /* END if ((type == tjstTruncatedServer) || ...) */
    else if (type == tjstJob)
    {
        pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

        if (pjob != NULL)
        {
            if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status,
                                 cntl->sc_condensed, &bad)))
                req_reject(rc, bad, preq, NULL, NULL);
            else
                reply_send_svr(preq);

            unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
        else
        {
            req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
        }
    }
    else
    {
        if (type == tjstArray)
        {
            pa = get_array(preq->rq_ind.rq_status.rq_id);

            if (pa == NULL)
            {
                req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");

                return;
            }
        }
        else if ((type == tjstSummarizeArraysQueue) ||
                 (type == tjstSummarizeArraysServer))
            update_array_statuses();

        iter = get_correct_status_iterator(cntl);

        for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
             pjob != NULL;
             pjob = get_next_status_job(cntl, job_array_index, pa, iter))
        {
            mutex_mgr job_mutex(pjob->ji_mutex, true);

            /* go ahead and build the status reply for this job */

            if (pjob->ji_being_recycled == true)
                continue;

            if (exec_only)
            {
                if (cntl->sc_pque != NULL)
                {
                    if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
                        continue;
                }
                else if (in_execution_queue(pjob, pa) == false)
                    continue;
            }

            rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status,
                            cntl->sc_condensed, &bad);

            if ((rc != PBSE_NONE) && (rc != PBSE_PERM))
            {
                if (pa != NULL)
                    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

                req_reject(rc, bad, preq, NULL, NULL);

                delete iter;

                return;
            }
        } /* END for (pjob != NULL) */

        delete iter;

        if (pa != NULL)
        {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

        reply_send_svr(preq);
    }

    if (LOGLEVEL >= 7)
    {
        log_event(PBSEVENT_SYSTEM,
                  PBS_EVENTCLASS_JOB,
                  "req_statjob",
                  "Successfully returned the status of queued jobs\n");
    }

    return;
} /* END req_stat_job_step2() */
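/*
 * Illustrative sketch (editor addition): the refactored version above leans
 * on a mutex_mgr RAII wrapper so each job's mutex is released on every exit
 * path (continue, error return, fall-through), instead of the explicit
 * unlock_ji_mutex() calls in the older versions.  A minimal C++ equivalent
 * of that idea, with hypothetical names:
 */
#include <pthread.h>

class toy_mutex_mgr
{
public:
    /* already_locked = true mirrors mutex_mgr(m, true) in the code above */
    toy_mutex_mgr(pthread_mutex_t *m, bool already_locked) : m_(m)
    {
        if (!already_locked)
            pthread_mutex_lock(m_);
    }

    ~toy_mutex_mgr() { pthread_mutex_unlock(m_); } /* fires on any exit path */

private:
    pthread_mutex_t *m_;
};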
void
handle_truncated_qstat(
    bool           exec_only,
    bool           condensed,
    batch_request *preq)
{
    long                 sentJobCounter = 0;
    long                 qmaxreport;
    all_queues_iterator *queue_iter = NULL;
    pbs_queue           *pque;
    char                 log_buf[LOCAL_LOG_BUF_SIZE];
    job                 *pjob;
    svrattrl            *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
    batch_reply         *preply = &preq->rq_reply;
    int                  bad = 0;

    svr_queues.lock();
    queue_iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues, queue_iter)) != NULL)
    {
        long      qjcounter = 0;
        mutex_mgr queue_mutex(pque->qu_mutex, true);

        if ((exec_only == true) &&
            (pque->qu_qs.qu_type != QTYPE_Execution))
        {
            /* ignore routing queues */
            continue;
        }

        if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
            (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
            qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
        else
        {
            qmaxreport = TMAX_JOB;
        }

        if (LOGLEVEL >= 5)
        {
            snprintf(log_buf, sizeof(log_buf),
                     "Reporting up to %ld idle jobs in queue %s\n",
                     qmaxreport,
                     pque->qu_qs.qu_name);

            log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

        /* loop through jobs in queue */
        all_jobs_iterator *jobiter = NULL;
        pque->qu_jobs->lock();
        jobiter = pque->qu_jobs->get_iterator();
        pque->qu_jobs->unlock();

        while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
        {
            mutex_mgr job_mgr(pjob->ji_mutex, true);

            if ((qjcounter >= qmaxreport) &&
                (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
            {
                /* max_report of queued jobs reached for queue */
                continue;
            }

            int rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status,
                                condensed, &bad);

            if ((rc != 0) && (rc != PBSE_PERM))
            {
                req_reject(rc, bad, preq, NULL, NULL);

                delete queue_iter;

                return;
            }

            sentJobCounter++;

            if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
                qjcounter++;
        } /* END foreach (pjob from pque) */

        if (LOGLEVEL >= 5)
        {
            snprintf(log_buf, sizeof(log_buf),
                     "Reported %ld total jobs for queue %s\n",
                     sentJobCounter,
                     pque->qu_qs.qu_name);

            log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }
    } /* END for (pque) */

    reply_send_svr(preq);

    delete queue_iter;

    return;
} // END handle_truncated_qstat()
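/*
 * Illustrative sketch (editor addition): handle_truncated_qstat() caps how
 * many *queued* jobs each queue reports (max_report), while jobs in other
 * states are always reported.  The predicate below captures that filter;
 * should_report() and the toy states are hypothetical.
 */
enum toy_state { TOY_QUEUED, TOY_RUNNING };

static bool should_report(toy_state s, long queued_sent, long max_report)
{
    /* only queued jobs count against the per-queue cap */
    return (s != TOY_QUEUED) || (queued_sent < max_report);
}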
static void
req_stat_job_step2(
    struct stat_cntl *cntl) /* I/O (freed on return) */
{
    svrattrl              *pal;
    job                   *pjob = NULL;
    struct batch_request  *preq;
    struct batch_reply    *preply;
    int                    rc = 0;
    enum TJobStatTypeEnum  type;
    pbs_queue             *pque = NULL;
    int                    exec_only = 0;
    int                    IsTruncated = 0;
    int                    bad = 0;
    long                   DTime; /* delta time - only report full attribute list if J->MTime > DTime */
    static svrattrl       *dpal = NULL;
    int                    job_array_index = 0;
    job_array             *pa = NULL;

    preq   = cntl->sc_origrq;
    type   = (enum TJobStatTypeEnum)cntl->sc_type;
    preply = &preq->rq_reply;

    /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

    /* NOTE: If IsTruncated is true, should walk all queues and walk jobs in
       each queue until max_reported is reached (NYI) */

    if (dpal == NULL)
    {
        /* build 'delta' attribute list */
        svrattrl   *tpal;
        tlist_head  dalist;
        int         aindex;

        int atrlist[] =
        {
            JOB_ATR_jobname,
            JOB_ATR_resc_used,
            JOB_ATR_LAST
        };

        CLEAR_LINK(dalist);

        for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
        {
            if ((tpal = attrlist_create("", "", 23)) == NULL)
            {
                return;
            }

            tpal->al_valln = atrlist[aindex];

            if (dpal == NULL)
                dpal = tpal;

            append_link(&dalist, &tpal->al_link, tpal);
        }
    } /* END if (dpal == NULL) */

    if (type == tjstArray)
    {
        pa = get_array(preq->rq_ind.rq_status.rq_id);
    }

    if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long)
    {
        /* polljobs not set - indicates we may need to obtain fresh data from MOM */

        if (cntl->sc_jobid[0] == '\0')
            pjob = NULL;
        else
            pjob = find_job(cntl->sc_jobid);

        while (1)
        {
            if (pjob == NULL)
            {
                /* start from the first job */

                if (type == tjstJob)
                {
                    pjob = find_job(preq->rq_ind.rq_status.rq_id);
                }
                else if (type == tjstQueue)
                {
                    pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);
                }
                else if (type == tjstArray)
                {
                    job_array_index = 0;

                    /* increment job_array_index until we find a non-null pointer or hit the end */
                    while (job_array_index < pa->ai_qs.array_size &&
                           (pjob = pa->jobs[job_array_index]) == NULL)
                        job_array_index++;
                }
                else
                {
                    if ((type == tjstTruncatedServer) ||
                        (type == tjstTruncatedQueue))
                        IsTruncated = TRUE;

                    pjob = (job *)GET_NEXT(svr_alljobs);
                }
            } /* END if (pjob == NULL) */
            else
            {
                /* get next job */

                if (type == tjstJob)
                    break;

                if (type == tjstQueue)
                    pjob = (job *)GET_NEXT(pjob->ji_jobque);
                else
                    pjob = (job *)GET_NEXT(pjob->ji_alljobs);

                if (type == tjstArray)
                {
                    pjob = NULL;

                    /* increment job_array_index until we find a non-null pointer or hit the end */
                    while (++job_array_index < pa->ai_qs.array_size &&
                           (pjob = pa->jobs[job_array_index]) == NULL)
                        ;
                }
            }

            if (pjob == NULL)
                break;

            /* PBS_RESTAT_JOB defaults to 30 seconds */
            if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) &&
                ((time_now - pjob->ji_momstat) > JobStatRate))
            {
                /* go to MOM for status */

                strcpy(cntl->sc_jobid, pjob->ji_qs.ji_jobid);

                if ((rc = stat_to_mom(pjob, cntl)) == PBSE_SYSTEM)
                {
                    break;
                }

                if (rc != 0)
                {
                    rc = 0;

                    continue;
                }

                return; /* will pick up after mom replies */
            }
        } /* END while(1) */

        if (cntl->sc_conn >= 0)
            svr_disconnect(cntl->sc_conn); /* close connection to MOM */

        if (rc != 0)
        {
            free(cntl);

            reply_free(preply);

            req_reject(rc, 0, preq, NULL, "cannot get update from mom");

            return;
        }
    } /* END if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long) */

    /*
     * now ready for part 3, building the status reply,
     * loop through again
     */

    if (type == tjstSummarizeArraysQueue ||
        type == tjstSummarizeArraysServer)
    {
        update_array_statuses();
    }

    if (type == tjstJob)
        pjob = find_job(preq->rq_ind.rq_status.rq_id);
    else if (type == tjstQueue)
        pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);
    else if (type == tjstSummarizeArraysQueue)
        pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs_array_sum);
    else if (type == tjstSummarizeArraysServer)
        pjob = (job *)GET_NEXT(svr_jobs_array_sum);
    else if (type == tjstArray)
    {
        job_array_index = 0;
        pjob = NULL;

        /* increment job_array_index until we find a non-null pointer or hit the end */
        while (job_array_index < pa->ai_qs.array_size &&
               (pjob = pa->jobs[job_array_index]) == NULL)
            job_array_index++;
    }
    else
        pjob = (job *)GET_NEXT(svr_alljobs);

    DTime = 0;

    if (preq->rq_extend != NULL)
    {
        char *ptr;

        /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

        if (strstr(preq->rq_extend, EXECQUEONLY))
            exec_only = 1;

        ptr = strstr(preq->rq_extend, "DELTA:");

        if (ptr != NULL)
        {
            ptr += strlen("delta:");

            DTime = strtol(ptr, NULL, 10);
        }
    }

    free(cntl);

    if ((type == tjstTruncatedServer) ||
        (type == tjstTruncatedQueue))
    {
        long sentJobCounter;
        long qjcounter;
        long qmaxreport;

        /* loop through all queues */
        for (pque = (pbs_queue *)GET_NEXT(svr_queues);
             pque != NULL;
             pque = (pbs_queue *)GET_NEXT(pque->qu_link))
        {
            qjcounter = 0;

            if ((exec_only == 1) &&
                (pque->qu_qs.qu_type != QTYPE_Execution))
            {
                /* ignore routing queues */
                continue;
            }

            if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
                (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
            {
                qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
            }
            else
            {
                qmaxreport = TMAX_JOB;
            }

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buffer, "giving scheduler up to %ld idle jobs in queue %s\n",
                        qmaxreport,
                        pque->qu_qs.qu_name);

                log_event(
                    PBSEVENT_SYSTEM,
                    PBS_EVENTCLASS_QUEUE,
                    pque->qu_qs.qu_name,
                    log_buffer);
            }

            sentJobCounter = 0;

            /* loop through jobs in queue */
            for (pjob = (job *)GET_NEXT(pque->qu_jobs);
                 pjob != NULL;
                 pjob = (job *)GET_NEXT(pjob->ji_jobque))
            {
                if ((qjcounter >= qmaxreport) &&
                    (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
                {
                    /* max_report of queued jobs reached for queue */
                    continue;
                }

                pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

                rc = status_job(
                         pjob,
                         preq,
                         (pjob->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
                         &preply->brp_un.brp_status,
                         &bad);

                if ((rc != 0) && (rc != PBSE_PERM))
                {
                    req_reject(rc, bad, preq, NULL, NULL);

                    return;
                }

                sentJobCounter++;

                if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
                    qjcounter++;
            } /* END for (pjob) */

            if (LOGLEVEL >= 5)
            {
                sprintf(log_buffer, "sent scheduler %ld total jobs for queue %s\n",
                        sentJobCounter,
                        pque->qu_qs.qu_name);

                log_event(
                    PBSEVENT_SYSTEM,
                    PBS_EVENTCLASS_QUEUE,
                    pque->qu_qs.qu_name,
                    log_buffer);
            }
        } /* END for (pque) */

        reply_send(preq);

        return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

    while (pjob != NULL)
    {
        /* go ahead and build the status reply for this job */

        if (exec_only)
        {
            pque = find_queuebyname(pjob->ji_qs.ji_queue);

            if (pque->qu_qs.qu_type != QTYPE_Execution)
                goto nextjob;
        }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
                 pjob,
                 preq,
                 pal,
                 &preply->brp_un.brp_status,
                 &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
        {
            req_reject(rc, bad, preq, NULL, NULL);

            return;
        }

        /* get next job */

nextjob:

        if (type == tjstJob)
            break;

        if (type == tjstQueue)
            pjob = (job *)GET_NEXT(pjob->ji_jobque);
        else if (type == tjstSummarizeArraysQueue)
            pjob = (job *)GET_NEXT(pjob->ji_jobque_array_sum);
        else if (type == tjstSummarizeArraysServer)
            pjob = (job *)GET_NEXT(pjob->ji_jobs_array_sum);
        else if (type == tjstArray)
        {
            pjob = NULL;

            /* increment job_array_index until we find a non-null pointer or hit the end */
            while (++job_array_index < pa->ai_qs.array_size &&
                   (pjob = pa->jobs[job_array_index]) == NULL)
                ;
        }
        else
            pjob = (job *)GET_NEXT(pjob->ji_alljobs);

        rc = 0;
    } /* END while (pjob != NULL) */

    reply_send(preq);

    if (LOGLEVEL >= 7)
    {
        log_event(PBSEVENT_SYSTEM,
                  PBS_EVENTCLASS_JOB,
                  "req_statjob",
                  "Successfully returned the status of queued jobs\n");
    }

    return;
} /* END req_stat_job_step2() */