void update_array_statuses(void)

  {
  job_array    *pa;
  job          *pjob;
  int           iter = -1;
  unsigned int  running;
  unsigned int  queued;
  unsigned int  complete;
  char          log_buf[LOCAL_LOG_BUF_SIZE];
  char          jobid[PBS_MAXSVRJOBID + 1];

  while ((pa = next_array(&iter)) != NULL)
    {
    running  = pa->ai_qs.jobs_running;
    complete = pa->ai_qs.num_failed + pa->ai_qs.num_successful;
    queued   = pa->ai_qs.num_jobs - running - complete;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    /* copy the parent id while the array is still locked */
    strcpy(jobid, pa->ai_qs.parent_id);

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((complete > 0) &&
               (queued == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }
      }
    } /* END for each array */
  } /* END update_array_statuses() */
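/*
 * Illustrative sketch (standalone, not part of the server source): the
 * rollup rule update_array_statuses() applies when it folds per-subjob
 * counts into a single state for the array's parent job. The names below
 * are hypothetical, introduced only for this example.
 */
#include <stdio.h>

enum rollup_state { ROLLUP_RUNNING, ROLLUP_COMPLETE, ROLLUP_QUEUED };

static enum rollup_state rollup(unsigned running, unsigned queued, unsigned complete)
  {
  if (running > 0)
    return(ROLLUP_RUNNING);      /* any running subjob makes the array running */

  if ((complete > 0) && (queued == 0))
    return(ROLLUP_COMPLETE);     /* everything finished, nothing waiting */

  return(ROLLUP_QUEUED);         /* default: report the array as queued */
  }

int main(void)
  {
  /* a 10-subjob array: 2 running, 5 queued, 3 complete -> running wins */
  printf("%d\n", rollup(2, 5, 3));    /* prints 0 (ROLLUP_RUNNING) */
  printf("%d\n", rollup(0, 0, 10));   /* prints 1 (ROLLUP_COMPLETE) */
  return(0);
  }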
/* return a server's array info struct corresponding to an array id */

job_array *get_array(

  char *id)

  {
  job_array *pa;
  int        iter = -1;

  while ((pa = next_array(&iter)) != NULL)
    {
    lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

    if (strcmp(pa->ai_qs.parent_id, id) == 0)
      {
      /* found it - return with ai_mutex still held; the caller must
       * release it with unlock_ai_mutex() */
      return(pa);
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  return(NULL);
  } /* END get_array() */
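/*
 * Illustrative usage sketch (a hypothetical caller, not in the source):
 * every successful get_array() must be paired with unlock_ai_mutex(),
 * because the array is returned with its mutex held.
 */
static int count_array_jobs(char *array_id)
  {
  job_array *pa = get_array(array_id);
  int        num;

  if (pa == NULL)
    return(-1);                /* no such array */

  num = pa->ai_qs.num_jobs;    /* safe: we still hold ai_mutex */

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(num);
  }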
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (freed on return) */

  {
  svrattrl             *pal;
  job                  *pjob = NULL;
  struct batch_request *preq;
  struct batch_reply   *preply;
  int                   rc = 0;
  enum TJobStatTypeEnum type;
  pbs_queue            *pque = NULL;
  int                   exec_only = 0;
  int                   bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  long                  DTime;
  static svrattrl      *dpal = NULL;
  int                   job_array_index = 0;
  job_array            *pa = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator    *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */
    svrattrl   *tpal;
    tlist_head  dalist;
    int         aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

    {
    all_jobs *ajptr = NULL;

    if (type == tjstQueue)
      ajptr = cntl->sc_pque->qu_jobs;
    else if (type == tjstSummarizeArraysQueue)
      ajptr = cntl->sc_pque->qu_jobs_array_sum;
    else if (type == tjstSummarizeArraysServer)
      ajptr = &array_summary;
    else
      ajptr = &alljobs;

    ajptr->lock();
    iter = ajptr->get_iterator();
    ajptr->unlock();
    }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) ||
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs, iter);
  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary, iter);
  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;

    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          break;
        }
      }
    }
  else
    pjob = next_job(&alljobs, iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("DELTA:");
      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    long                 sentJobCounter;
    long                 qjcounter;
    long                 qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues, iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete jobiter;  /* release the per-queue iterator as well */
          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }  /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      delete jobiter;  /* done with this queue's job walk */

      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }  /* END for (pque) */

    reply_send_svr(preq);

    delete iter;

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);

        pque = get_jobs_queue(&pjob);

        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;

        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs, iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary, iter);
    else if (type == tjstArray)
      {
      pjob = NULL;

      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            break;
          }
        }
      }
    else
      pjob = next_job(&alljobs, iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
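/*
 * Illustrative sketch (standalone, not in the source): how the
 * "DELTA:<EPOCHTIME>" request extension above selects between the full and
 * the delta attribute lists. Jobs modified at or after the given epoch get
 * the full list (pal); older jobs get the condensed delta list (dpal).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long parse_delta(const char *extend)
  {
  const char *ptr = (extend != NULL) ? strstr(extend, "DELTA:") : NULL;

  if (ptr == NULL)
    return(0);          /* no delta requested: report everything */

  return(strtol(ptr + strlen("DELTA:"), NULL, 10));
  }

int main(void)
  {
  long dtime = parse_delta("DELTA:1700000000");
  long mtime = 1700000500;   /* hypothetical job modification time */

  printf("%s\n", (mtime >= dtime) ? "full list" : "delta list");
  return(0);
  }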
int execute_job_delete(

  job                  *pjob,  /* M */
  char                 *Msg,   /* I */
  struct batch_request *preq)  /* I */

  {
  struct work_task *pwtnew;
  int               rc;
  char             *sigt = "SIGTERM";
  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob, preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3))
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it
     * going; retry in one second.
     * If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3, the job is being
     * requeued; wait until finished. */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */
        /* did the mom ever get it? delete it anyway... */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }  /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */
      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq, FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE: should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and the delete
     * has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        /* the job vanished while the array was acquired; release the array
         * if we got one (the original returned without this unlock) */
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return(-1);
        }

      if (pa != NULL)  /* guard added: the array struct may be gone */
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(tmp, &newstate, &newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }
          }

        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files; remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);

      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);

      pthread_mutex_unlock(server.sv_attr_mutex);

      /* release the queue only after its KeepCompleted attribute has been
       * read; the original unlocked first and then read pque->qu_attr */
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit_task,
        strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else (neither checkpointed nor staged-in) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
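/*
 * Illustrative sketch (a hypothetical re-implementation, not the server's
 * attr_ifelse_long): the queue/server fallback used for KeepCompleted
 * above, inferred from the call site. If the queue sets the attribute it
 * wins; otherwise the server attribute is used; otherwise the default.
 */
static long ifelse_long(int q_set, long q_val, int s_set, long s_val, long dflt)
  {
  if (q_set)
    return(q_val);   /* queue-level keep_completed */

  if (s_set)
    return(s_val);   /* server-level keep_completed */

  return(dflt);      /* neither set: default (0 seconds) */
  }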
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (freed on return) */

  {
  svrattrl             *pal;
  job                  *pjob = NULL;
  struct batch_request *preq;
  struct batch_reply   *preply;
  int                   rc = 0;
  enum TJobStatTypeEnum type;
  pbs_queue            *pque = NULL;
  int                   exec_only = 0;
  int                   bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  long                  DTime;
  static svrattrl      *dpal = NULL;
  int                   job_array_index = 0;
  job_array            *pa = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  int                   iter;
  time_t                time_now = time(NULL);
  long                  poll_jobs = 0;
  char                  job_id[PBS_MAXSVRJOBID + 1];
  int                   job_substate = -1;
  time_t                job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */
    svrattrl   *tpal;
    tlist_head  dalist;
    int         aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

  if (!poll_jobs)
    {
    /* poll_jobs not set - indicates we may need to obtain fresh data from
     * MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                /* note: the job stays locked here; the common code below
                 * reads it and unlocks it. The original unlocked it now and
                 * again below, a double unlock. */
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs, &iter);
          }
        }  /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
        else if (type == tjstArray)
          {
          pjob = NULL;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                /* keep the job locked for the common code below (see note
                 * above) */
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs, &iter);
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate    = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }

        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return;  /* will pick up after mom replies */
        }
      }  /* END while (1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }  /* END if (!poll_jobs) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) ||
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary, &iter);
  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;

    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          break;
        }
      }
    }
  else
    pjob = next_job(&alljobs, &iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("DELTA:");
      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues, &iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      int jobiter = -1;  /* separate iterator so the queue iterator above is
                          * not clobbered; the original reset and reused the
                          * same variable for both walks */

      while ((pjob = next_job(pque->qu_jobs, &jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }  /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }  /* END for (pque) */

    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);

        pque = get_jobs_queue(&pjob);

        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;

        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary, &iter);
    else if (type == tjstArray)
      {
      pjob = NULL;

      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            break;
          }
        }
      }
    else
      pjob = next_job(&alljobs, &iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
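/*
 * Illustrative sketch (not in the source): the staleness test the
 * poll_jobs-off path above applies before asking MOM for fresh status.
 * JOB_SUBSTATE_RUNNING and the stat-rate value are assumed to come from the
 * server headers; JobStatRate stands in for the PBS_RESTAT_JOB interval
 * (30 seconds by default).
 */
#include <time.h>

static int needs_mom_refresh(int substate, time_t momstat, time_t now, long stat_rate)
  {
  /* only running jobs are re-stat'ed, and only when the cached data from
   * MOM is older than the configured rate */
  return((substate == JOB_SUBSTATE_RUNNING) && ((now - momstat) > stat_rate));
  }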
void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (freed on return) */

  {
  batch_request        *preq = cntl->sc_origrq;
  svrattrl             *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                  *pjob = NULL;
  struct batch_reply   *preply = &preq->rq_reply;
  int                   rc = 0;
  enum TJobStatTypeEnum type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                  exec_only = false;
  int                   bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                   job_array_index = -1;
  job_array            *pa = NULL;
  all_jobs_iterator    *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT: { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status,
                           cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");

        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) ||
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */

      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status,
             cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) && (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }

    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
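/*
 * Illustrative sketch (simplified and hypothetical; the real mutex_mgr
 * interface may differ): the RAII idea behind the mutex_mgr used above.
 * Locking in the constructor and unlocking in the destructor makes the
 * early 'continue'/'return' paths in the status loop safe without explicit
 * unlock calls.
 */
#include <pthread.h>

class scoped_lock
  {
  pthread_mutex_t *mtx;

public:

  explicit scoped_lock(pthread_mutex_t *m, bool already_locked = false) : mtx(m)
    {
    if (!already_locked)
      pthread_mutex_lock(mtx);   /* acquire unless the caller already holds it */
    }

  ~scoped_lock()
    {
    pthread_mutex_unlock(mtx);   /* released on every exit path from the scope */
    }
  };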
int is_array(

  char *id)

  {
  job_array *pa;
  int        iter = -1;

  char *bracket_ptr;
  char *end_bracket_ptr;
  char  jobid[PBS_MAXSVRJOBID];
  char  temp_jobid[PBS_MAXSVRJOBID];

  snprintf(jobid, sizeof(jobid), "%s", id);

  /* Check to see if we have an array dependency.
   * If there is an array dependency count then we will have an id of
   * something like arrayid[][1]. We need to take off the [1] so we can
   * compare the array id with an existing array entry. */

  if ((bracket_ptr = strchr(jobid, '[')) != NULL)
    {
    /* Make sure the next character is ']' */
    if (*(++bracket_ptr) != ']')
      {
      /* If we do not have a ']' then we have bad syntax. */
      return(FALSE);
      }

    if (*(++bracket_ptr) == '[')
      {
      /* We made it to here, which means we have a count inside brackets.
       * Just truncate it for the name comparison. */
      end_bracket_ptr = strchr(bracket_ptr, ']');

      if (end_bracket_ptr == NULL)
        {
        /* If we do not have a ']' then we have bad syntax. */
        return(FALSE);
        }

      /* advance end_bracket_ptr by one; it should now point to either
       * '\0' or '.' */
      end_bracket_ptr++;

      /* truncate the string; this makes jobid just the arrayid name */
      *bracket_ptr = 0;

      /* append the rest of the job id (the original passed sizeof(jobid)
       * for temp_jobid here) */
      snprintf(temp_jobid, sizeof(temp_jobid), "%s%s", jobid, end_bracket_ptr);
      snprintf(jobid, sizeof(jobid), "%s", temp_jobid);
      }
    }
  else
    {
    /* No '[' means this is not an array */
    return(FALSE);
    }

  while ((pa = next_array(&iter)) != NULL)
    {
    lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

    if (strcmp(pa->ai_qs.parent_id, jobid) == 0)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

      return(TRUE);
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  return(FALSE);
  }  /* END is_array() */
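/*
 * Illustrative sketch (standalone, hypothetical helper): the id rewrite
 * is_array() performs. An array-dependency id such as "42[][3].server" is
 * reduced to "42[].server" before being compared against stored parent ids.
 */
#include <stdio.h>
#include <string.h>

static void strip_dep_count(char *jobid, size_t len)
  {
  char  tmp[256];
  char *p = strchr(jobid, '[');

  if ((p == NULL) || (p[1] != ']') || (p[2] != '['))
    return;                              /* nothing of the form [][n] */

  char *end = strchr(p + 2, ']');

  if (end == NULL)
    return;                              /* bad syntax, leave untouched */

  /* keep everything through "[]", then everything after the count's ']' */
  snprintf(tmp, sizeof(tmp), "%.*s%s", (int)(p + 2 - jobid), jobid, end + 1);
  snprintf(jobid, len, "%s", tmp);
  }

int main(void)
  {
  char id[256] = "42[][3].server";

  strip_dep_count(id, sizeof(id));
  printf("%s\n", id);                    /* prints: 42[].server */
  return(0);
  }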
int setup_array_struct(

  job *pjob)

  {
  job_array          *pa;
  array_request_node *rn;

  int   bad_token_count;
  int   array_size;
  int   rc;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  max_array_size;

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    return(PBSE_MEM_MALLOC);  /* guard added: calloc can fail */

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;

  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);

  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);

  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s",
    get_variable(pjob, pbs_o_host));

  pa->ai_qs.num_cloned = 0;

  CLEAR_HEAD(pa->request_tokens);

  pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
  pthread_mutex_init(pa->ai_mutex, NULL);
  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    /* the array is deleted in svr_job_purge */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    svr_job_purge(pjob);

    /* Does the job array need to be removed? */

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
        "cannot save job");
      }

    return(1);
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    long max_limit = 0;

    get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit);

    snprintf(log_buf, sizeof(log_buf),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      max_limit);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);

    /* log first, then delete: array_delete() frees pa, so the original
     * order read freed memory */
    array_delete(pa);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running   = 0;
  pa->ai_qs.num_started    = 0;
  pa->ai_qs.num_failed     = 0;
  pa->ai_qs.num_successful = 0;

  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);

  array_size = 0;
  pa->ai_qs.num_jobs = 0;

  while (rn != NULL)
    {
    if (rn->end > array_size)
      array_size = rn->end;

    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++;

  if (get_svr_attr_l(SRV_ATR_MaxArraySize, &max_array_size) == PBSE_NONE)
    {
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->job_ids = (char **)calloc(array_size, sizeof(char *));

  if (pa->job_ids == NULL)
    {
    sprintf(log_buf, "Failed to alloc job_ids: job %s", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    return(PBSE_MEM_MALLOC);
    }

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);
    return(2);
    }

  pjob->ji_arraystruct = pa;

  insert_array(pa);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END setup_array_struct() */
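/*
 * Illustrative sketch (standalone): why num_jobs and array_size differ for
 * a sparse request. For "-t 1-5,10,20-25" the job count is 12, but the id
 * table must span indices 0..25, so array_size is 26 (largest index + 1).
 */
#include <stdio.h>

struct range { int start; int end; };

int main(void)
  {
  struct range req[] = { {1, 5}, {10, 10}, {20, 25} };
  int num_jobs = 0;
  int array_size = 0;

  for (int i = 0; i < 3; i++)
    {
    if (req[i].end > array_size)
      array_size = req[i].end;

    num_jobs += req[i].end - req[i].start + 1;
    }

  array_size++;  /* biggest index + 1 */

  printf("num_jobs=%d array_size=%d\n", num_jobs, array_size);  /* 12 and 26 */
  return(0);
  }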
/* delete a job array struct from memory and disk. This is used when the
 * number of jobs that belong to the array becomes zero.
 * returns zero if there are no errors, non-zero otherwise */

int array_delete(

  job_array *pa)

  {
  int                      i;
  char                     path[MAXPATHLEN + 1];
  char                     log_buf[LOCAL_LOG_BUF_SIZE];
  array_request_node      *rn;
  struct array_depend     *pdep;
  struct array_depend_job *pdj;

  /* first thing to do is take this out of the server's list of all arrays */
  remove_array(pa);

  /* unlock the mutex and free it */
  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
  free(pa->ai_mutex);

  /* delete the on-disk copy of the struct */
  snprintf(path, sizeof(path), "%s%s%s",
    path_arrays, pa->ai_qs.fileprefix, ARRAY_FILE_SUFFIX);

  if (unlink(path))
    {
    sprintf(log_buf, "unable to delete %s", path);
    log_err(errno, "array_delete", log_buf);
    }

  /* clear the array request linked list */
  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens))
    {
    delete_link(&rn->request_tokens_link);
    free(rn);
    }

  /* free the memory for the job pointers */
  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] != NULL)
      free(pa->job_ids[i]);
    }

  free(pa->job_ids);

  /* free the dependencies, if any */
  for (pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps);
       pdep != NULL;
       pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps))
    {
    delete_link(&pdep->dp_link);

    for (pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs);
         pdj != NULL;
         pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs))
      {
      delete_link(&pdj->dc_link);
      free(pdj);
      }

    free(pdep);
    }

  /* purge the "template" job; this also deletes the shared script file for
   * the array */
  if (pa->ai_qs.parent_id[0] != '\0')
    {
    job *pjob;

    if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL)
      svr_job_purge(pjob);
    }

  /* free the memory allocated for the struct */
  free(pa);

  return 0;
  }  /* END array_delete() */
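/*
 * Illustrative sketch (hypothetical, plain C): the drain pattern used above
 * for request tokens and dependencies - repeatedly take the head, unlink it,
 * then free it, until the list is empty. Shown with a minimal singly linked
 * list instead of the server's tlist_head/GET_NEXT/delete_link machinery.
 */
#include <stdlib.h>

struct node { struct node *next; };

static void drain(struct node **head)
  {
  struct node *n;

  while ((n = *head) != NULL)
    {
    *head = n->next;  /* unlink the head (the GET_NEXT/delete_link step) */
    free(n);          /* then release it */
    }
  }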
void update_array_statuses(

  job_array *owned)

  {
  job_array    *pa;
  job          *pj;
  job          *pjob;
  int           i;
  int           iter = -1;
  unsigned int  running;
  unsigned int  queued;
  unsigned int  held;
  unsigned int  complete;
  char          log_buf[LOCAL_LOG_BUF_SIZE];
  char          parent_id[PBS_MAXSVRJOBID + 1];

  while ((pa = next_array_check(&iter, owned)) != NULL)
    {
    running  = 0;
    queued   = 0;
    held     = 0;
    complete = 0;

    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->job_ids[i] != NULL)
        {
        if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (pj->ji_qs.ji_state == JOB_STATE_RUNNING)
            running++;
          else if (pj->ji_qs.ji_state == JOB_STATE_QUEUED)
            queued++;
          else if (pj->ji_qs.ji_state == JOB_STATE_HELD)
            held++;
          else if (pj->ji_qs.ji_state == JOB_STATE_COMPLETE)
            complete++;

          unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
          }
        }
      }

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    /* copy the parent id before dropping ai_mutex; the original read
     * pa->ai_qs.parent_id after the unlock */
    strcpy(parent_id, pa->ai_qs.parent_id);

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    if ((pjob = svr_find_job(parent_id, TRUE)) != NULL)
      {
      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((held > 0) && (queued == 0) && (complete == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_HELD, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((complete > 0) && (queued == 0) && (held == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }

      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      }

    if (pa == owned)
      {
      /* the caller expects its own array to come back locked */
      lock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
  }  /* END update_array_statuses() */
job *job_recov(

  char *filename)  /* I */   /* pathname to job save file */

  {
  int   fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc();  /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */
    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);
    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */
    return(NULL);
    }

  /* read in job quick save sub-structure */
  /* as written, a short read is only fatal when the version field matches
   * the current version; an older, smaller ji_qs falls through to the
   * upgrade path below */
  if ((read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs)) &&
      (pj->ji_qs.qs_version == PBS_QS_VERSION))
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);
    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is an older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);
    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);
      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does the file name match the internal name? */
  /* This detects ghost files */

  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);
    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "unable to recover %s (file is likely corrupted)",
      namebuf);
    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif

    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for its node */
/*
    char *ms  = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }
*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
      namebuf);
    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array. We need to put a link back to the server
     * job array struct for this array. We also have to link this job into
     * the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);

      return(NULL);
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] =
        strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kludge, but for some reason if an array job was
       * on hold when the server went down, the
       * ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even
       * though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is
       * JOB_SUBSTATE_HELD */

      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags       = ATR_VFLAG_SET;
        }
      }

    /* pa is guaranteed non-NULL here; the NULL case returned above */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
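/*
 * Illustrative sketch (standalone; paths and prefixes are hypothetical):
 * the ghost-file check job_recov() performs above. A job save file is only
 * trusted when its on-disk name starts with the fileprefix recorded inside
 * the file itself.
 */
#include <stdio.h>
#include <string.h>

static int is_ghost(const char *path, const char *fileprefix)
  {
  const char *pn = strrchr(path, '/');

  pn = (pn != NULL) ? pn + 1 : path;   /* basename of the save file */

  return(strncmp(pn, fileprefix, strlen(fileprefix)) != 0);
  }

int main(void)
  {
  printf("%d\n", is_ghost("/var/spool/jobs/123.sv.JB", "123.sv"));  /* 0: ok */
  printf("%d\n", is_ghost("/var/spool/jobs/999.sv.JB", "123.sv"));  /* 1: ghost */
  return(0);
  }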
int req_releasearray(

  void *vp)  /* I */

  {
  job       *pjob;
  job_array *pa;
  char      *range;
  int        rc;
  int        index;

  struct batch_request *preq = (struct batch_request *)vp;

  pa = get_array(preq->rq_ind.rq_release.rq_objname);

  if (pa == NULL)
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, "Cannot find array");
    return(PBSE_NONE);
    }

  while (TRUE)
    {
    if (((index = first_job_index(pa)) == -1) ||
        (pa->job_ids[index] == NULL))
      {
      unlock_ai_mutex(pa, __func__, (char *)"1", LOGLEVEL);

      return(PBSE_NONE);
      }

    if ((pjob = svr_find_job(pa->job_ids[index], FALSE)) == NULL)
      {
      free(pa->job_ids[index]);
      pa->job_ids[index] = NULL;
      }
    else
      break;
    }

  if (svr_authorize_jobreq(preq, pjob) == -1)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);

    unlock_ai_mutex(pa, __func__, (char *)"2", LOGLEVEL);
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

  range = preq->rq_extend;

  if ((range != NULL) &&
      (strstr(range, ARRAY_RANGE) != NULL))
    {
    /* parse the array range */
    if ((rc = release_array_range(pa, preq, range)) != 0)
      {
      unlock_ai_mutex(pa, __func__, (char *)"3", LOGLEVEL);

      req_reject(rc, 0, preq, NULL, NULL);

      return(PBSE_NONE);
      }
    }
  else if ((rc = release_whole_array(pa, preq)) != 0)
    {
    unlock_ai_mutex(pa, __func__, (char *)"4", LOGLEVEL);

    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  unlock_ai_mutex(pa, __func__, (char *)"5", LOGLEVEL);

  reply_ack(preq);

  return(PBSE_NONE);
  }  /* END req_releasearray() */
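/*
 * Illustrative sketch (a hypothetical factoring of the loop above, using
 * the server's own helpers; treat the signatures as assumptions): finding
 * the first live subjob, pruning stale ids whose jobs have vanished as a
 * side effect of the search.
 */
static job *first_live_subjob(job_array *pa)
  {
  int  index;
  job *pjob;

  while (TRUE)
    {
    if (((index = first_job_index(pa)) == -1) ||
        (pa->job_ids[index] == NULL))
      return(NULL);                       /* array is empty */

    if ((pjob = svr_find_job(pa->job_ids[index], FALSE)) != NULL)
      return(pjob);                       /* returned with ji_mutex held */

    /* job no longer exists: drop the stale id and keep looking */
    free(pa->job_ids[index]);
    pa->job_ids[index] = NULL;
    }
  }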
int set_array_job_ids(

  job    **pjob,    /* M */
  char    *log_buf, /* error buffer */
  size_t   buflen)  /* error buffer length */

  {
  int rc = PBSE_NONE;

#ifndef PBS_MOM
  job       *pj = *pjob;
  job_array *pa;
  char       parent_id[PBS_MAXSVRJOBID + 1];

  // If this variable isn't set, this job isn't actually an array subjob.
  if ((pj->ji_wattr[JOB_ATR_job_array_id].at_flags & ATR_VFLAG_SET) == 0)
    {
    // Check and set if this is the array template job
    char *open_bracket = strchr(pj->ji_qs.ji_jobid, '[');

    if (open_bracket != NULL)
      {
      if (*(open_bracket + 1) == ']')
        pj->ji_is_array_template = true;
      }

    return(rc);
    }

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array. We need to put a link back to the server
     * job array struct for this array. We also have to link this job into
     * the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      snprintf(log_buf, buflen, "Array job missing array struct %s", __func__);

      return(-1);
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = true;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] =
        strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kludge, but for some reason if an array job was
       * on hold when the server went down, the
       * ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even
       * though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is
       * JOB_SUBSTATE_HELD */

      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags       = ATR_VFLAG_SET;
        }
      }

    /* pa is guaranteed non-NULL here; the NULL case returned above */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

#endif /* !PBS_MOM */

  return(rc);
  }  /* END set_array_job_ids() */
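/*
 * Illustrative sketch (standalone; array_get_parent_id's real logic lives
 * in the server source, so this is an assumption about its behavior):
 * deriving the parent array id "42[].server" from a subjob id such as
 * "42[7].server".
 */
#include <stdio.h>
#include <string.h>

static void parent_of(const char *subjob, char *out, size_t len)
  {
  const char *open_b  = strchr(subjob, '[');
  const char *close_b = (open_b != NULL) ? strchr(open_b, ']') : NULL;

  if ((open_b == NULL) || (close_b == NULL))
    {
    snprintf(out, len, "%s", subjob);   /* not an array subjob id */
    return;
    }

  /* keep everything up to '[', skip the index, keep from ']' on */
  snprintf(out, len, "%.*s[%s", (int)(open_b - subjob), subjob, close_b);
  }

int main(void)
  {
  char parent[256];

  parent_of("42[7].server", parent, sizeof(parent));
  printf("%s\n", parent);   /* prints: 42[].server */
  return(0);
  }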