/* * release_job - releases the hold on job j * @param j - the job to modify * @return 0 if successful, a PBS error on failure */ int release_job( struct batch_request *preq, /* I */ void *j) /* I/O */ { long old_hold; int rc = 0; int newstate; int newsub; char *pset; job *pjob = (job *)j; char log_buf[LOCAL_LOG_BUF_SIZE]; pbs_attribute temphold; /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { return(rc); } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { return(rc); } /* unset the hold */ old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR))) { return(rc); } /* everything went well, if holds changed, update the job state */ if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long) { pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ } sprintf(log_buf, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); return(rc); } /* END release_job() */
void process_hold_reply( batch_request *preq) { job *pjob; pbs_attribute temphold; int newstate; int newsub; int rc; char *pset; char log_buf[LOCAL_LOG_BUF_SIZE]; /* preq was handled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_hold.rq_orig.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else { mutex_mgr job_mutex(pjob->ji_mutex, true); if (preq->rq_reply.brp_code != 0) { rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold); if (rc == 0) { rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR); } pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */ pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ if (preq->rq_reply.brp_code != PBSE_NOSUP) { sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf); } else { reply_ack(preq); } } else { /* record that MOM has a checkpoint file */ /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire. * And if these flags are not set, start_exec will not try to run the job from * the checkpoint image file. */ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */ reply_ack(preq); } } } /* END process_hold_reply() */
/* * release_job - releases the hold on job j * @param j - the job to modify * @param pa - a pointer to an array whose mutex we hold - always this job's array * @return 0 if successful, a PBS error on failure */ int release_job( struct batch_request *preq, /* I */ void *j, /* I/O */ job_array *pa) /* I */ { long old_hold; int rc = PBSE_NONE; int newstate; int newsub; char *pset; job *pjob = (job *)j; char log_buf[LOCAL_LOG_BUF_SIZE]; pbs_attribute temphold; // this function is meaningless for jobs in exiting or completed if (pjob->ji_qs.ji_state > JOB_STATE_RUNNING) return(PBSE_NONE); /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0) { return(rc); } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { return(rc); } /* unset the hold */ old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR))) { return(rc); } if (pjob->ji_arraystructid[0] != '\0') { // Make sure our slot limit counts are correct check_array_slot_limits(pjob, pa); } /* everything went well, if holds changed, update the job state */ if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long) { pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ } sprintf(log_buf, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); return(rc); } /* END release_job() */
int req_holdjob( batch_request *vp) /* I */ { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; pbs_attribute temphold; pbs_attribute *pattr; batch_request *preq = (struct batch_request *)vp; char log_buf[LOCAL_LOG_BUF_SIZE]; batch_request *dup_req = NULL; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return(PBSE_NONE); } mutex_mgr job_mutex(pjob->ji_mutex, true); /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ /* ** The jobid in the request always have the server suffix attached ** which is dropped when the server attribute ** 'display_job_server_suffix' is FALSE and so will in the MOM's. ** Therefore, it must be passed as the server to the MOM so she can ** find it to hold. */ if (strncmp(pjob->ji_qs.ji_jobid, preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID)) snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", pjob->ji_qs.ji_jobid); if ((dup_req = duplicate_request(preq)) == NULL) { req_reject(rc, 0, preq, NULL, "memory allocation failure"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { free_br(dup_req); *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, "relay to mom failed"); if (pjob == NULL) job_mutex.set_unlock_on_exit(false); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK, 0); /* fill in log_buf again, since relay_to_mom changed it */ sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); pjob = NULL; reply_ack(preq); } else job_mutex.set_unlock_on_exit(false); process_hold_reply(dup_req); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); } reply_ack(preq); } return(PBSE_NONE); } /* END req_holdjob() */
void req_holdarray(struct batch_request *preq) { int i; char *pset; char *range_str; int rc; attribute temphold; char owner[PBS_MAXUSER + 1]; job_array *pa; /* batch_request *preq_tmp; */ pa = get_array(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (pa == NULL) { /* this shouldn't happen since we verify that this is a valid array just prior to calling this function */ req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array"); } get_jobowner(pa->ai_qs.owner, owner); if (svr_authorize_req(preq, owner, pa->ai_qs.submit_host) == -1) { sprintf(log_buffer, msg_permlog, preq->rq_type, "Array", preq->rq_ind.rq_delete.rq_objname, preq->rq_user, preq->rq_host); log_event( PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_delete.rq_objname, log_buffer); req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted"); return; } if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* get the range of jobs to iterate over */ range_str = preq->rq_extend; if ((range_str != NULL) && (strstr(range_str,ARRAY_RANGE) != NULL)) { if ((rc = hold_array_range(pa,range_str,&temphold)) != 0) { req_reject(rc,0,preq,NULL, "Error in specified array range"); } } else { /* do the entire array */ for (i = 0;i < pa->ai_qs.array_size;i++) { if (pa->jobs[i] == NULL) continue; hold_job(&temphold,pa->jobs[i]); } } reply_ack(preq); }
int req_holdarray( void *vp) /* I */ { int i; struct batch_request *preq = (struct batch_request *)vp; char *pset; char *range_str; int rc; pbs_attribute temphold; char owner[PBS_MAXUSER + 1]; job_array *pa; job *pjob; char log_buf[LOCAL_LOG_BUF_SIZE]; pa = get_array(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (pa == NULL) { /* this shouldn't happen since we verify that this is a valid array just prior to calling this function */ req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array"); return(PBSE_NONE); } get_jobowner(pa->ai_qs.owner, owner); if (svr_authorize_req(preq, owner, pa->ai_qs.submit_host) == -1) { sprintf(log_buf, msg_permlog, preq->rq_type, "Array", preq->rq_ind.rq_delete.rq_objname, preq->rq_user, preq->rq_host); log_event(PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_delete.rq_objname, log_buf); if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: unlocking ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf); } pthread_mutex_unlock(pa->ai_mutex); req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted"); return(PBSE_NONE); } if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: unlocking ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf); } pthread_mutex_unlock(pa->ai_mutex); req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: unlocking ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf); } pthread_mutex_unlock(pa->ai_mutex); req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } /* get the range of jobs to iterate over */ range_str = preq->rq_extend; if ((range_str != NULL) && (strstr(range_str,ARRAY_RANGE) != NULL)) { if ((rc = hold_array_range(pa,range_str,&temphold)) != 0) { pthread_mutex_unlock(pa->ai_mutex); req_reject(rc,0,preq,NULL, "Error in specified array range"); return(PBSE_NONE); } } else { /* do the entire array */ for (i = 0;i < pa->ai_qs.array_size;i++) { if (pa->job_ids[i] == NULL) continue; if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL) { free(pa->job_ids[i]); pa->job_ids[i] = NULL; } else { hold_job(&temphold,pjob); if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: unlocking ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf); } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } } if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: unlocking ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf); } pthread_mutex_unlock(pa->ai_mutex); reply_ack(preq); return(PBSE_NONE); } /* END req_holdarray() */
void req_releasejob(struct batch_request *preq) { int jt; /* job type */ int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq, &jt); if (pjob == (job *)0) return; if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) { req_reject(PBSE_IVALREQ, 0, preq); return; } /* cannot do anything until we decode the holds to be set */ if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0) { req_reject(rc, 0, preq); return; } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq); return; } /* all ok so far, unset the hold */ old_hold = pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; rc = job_attr_def[(int)JOB_ATR_hold]. at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR); if (rc) { req_reject(rc, 0, preq); return; } /* every thing went well, if holds changed, update the job state */ #ifndef NAS /* localmod 105 Always reset etime on release */ if (old_hold != pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long) { #endif /* localmod 105 */ #ifdef NAS /* localmod 105 */ { attribute *etime = &pjob->ji_wattr[(int)JOB_ATR_etime]; etime->at_val.at_long = time_now; etime->at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE; #endif /* localmod 105 */ pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); (void)svr_setjobstate(pjob, newstate, newsub); /* saves job */ } if (pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long == 0) job_attr_def[(int)JOB_ATR_Comment].at_free(&pjob->ji_wattr[(int)JOB_ATR_Comment]); (void)sprintf(log_buffer, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); reply_ack(preq); } /** * @brief * get_hold - search a list of attributes (svrattrl) for the hold-types * attribute. This is used by the Hold Job and Release Job request, * therefore it is an error if the hold-types attribute is not present, * or there is more than one. * * Decode the hold attribute into temphold. * * @param[in] phead - pbs list head. * @param[out] phead - RETURN - ptr to hold value * * @return error code */ static int get_hold(pbs_list_head *phead, char **pset) { int have_one = 0; struct svrattrl *holdattr = (struct svrattrl*)0; struct svrattrl *pal; pal = (struct svrattrl *)GET_NEXT((*phead)); while (pal) { if (!strcasecmp(pal->al_name, job_attr_def[(int)JOB_ATR_hold].at_name)) { holdattr = pal; *pset = pal->al_value; have_one++; } else { return (PBSE_IVALREQ); } pal = (struct svrattrl *)GET_NEXT(pal->al_link); } if (have_one != 1) return (PBSE_IVALREQ); /* decode into temporary attribute structure */ clear_attr(&temphold, &job_attr_def[(int)JOB_ATR_hold]); return (job_attr_def[(int)JOB_ATR_hold].at_decode( &temphold, holdattr->al_name, (char *)0, holdattr->al_value)); }
void req_holdjob(struct batch_request *preq) { long *hold_val; int jt; /* job type */ int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; char date[32]; time_t now; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq, &jt); if (pjob == (job *)0) return; if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) { req_reject(PBSE_IVALREQ, 0, preq); return; } if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) { req_reject(PBSE_BADSTATE, 0, preq); return; } /* cannot do anything until we decode the holds to be set */ if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0) { req_reject(rc, 0, preq); return; } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq); return; } /* HOLD_bad_password can only be done by root or admin */ #ifdef WIN32 if ( (temphold.at_val.at_long & HOLD_bad_password) && \ !isAdminPrivilege(preq->rq_user) ) #else if ( (temphold.at_val.at_long & HOLD_bad_password) && \ strcasecmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0 ) #endif { req_reject(PBSE_PERM, 0, preq); return; } hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE; /* Note the hold time in the job comment. */ now = time(NULL); (void)strncpy(date, (const char *)ctime(&now), 24); date[24] = '\0'; (void)sprintf(log_buffer, "Job held by %s on %s", preq->rq_user, date); job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment], (char *)0, (char *)0, log_buffer); (void)sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user, preq->rq_host); if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN) && (pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str) && (*pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str != 'n')) { /* have MOM attempt checkpointing */ if ((rc = relay_to_mom(pjob, preq, post_hold)) != 0) { *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq); } else { pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT; (void)job_save(pjob, SAVEJOB_QUICK); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); } } else { /* every thing went well, may need to update the job state */ log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(pjob, &newstate, &newsub, 0); (void)svr_setjobstate(pjob, newstate, newsub); } reply_ack(preq); } }
static void process_hold_reply( struct work_task *pwt) { job *pjob; struct batch_request *preq; int newstate; int newsub; attribute temphold; char *pset; int rc; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname)) == (job *)0) { LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_hold.rq_orig.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else if (preq->rq_reply.brp_code != 0) { rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold); if (rc == 0) { rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR); } pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */ pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ if (preq->rq_reply.brp_code != PBSE_NOSUP) { sprintf(log_buffer, msg_mombadhold, preq->rq_reply.brp_code); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer); } else { reply_ack(preq); } } else { /* record that MOM has a checkpoint file */ /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire. * And if these flags are not set, start_exec will not try to run the job from * the checkpoint image file. */ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */ reply_ack(preq); } }
void req_releasejob( struct batch_request *preq) /* ptr to the decoded request */ { int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; attribute temphold; pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq); if (pjob == NULL) { return; } /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* all ok so far, unset the hold */ old_hold = pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; if ((rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR))) { req_reject(rc, 0, preq, NULL, NULL); return; } /* everything went well, if holds changed, update the job state */ if (old_hold != pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long) { pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ } sprintf(log_buffer, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); reply_ack(preq); return; } /* END req_releasejob() */
void req_holdjob( struct batch_request *preq) { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; attribute temphold; attribute *pattr; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return; } if (is_cloud_job(pjob)) { req_reject(PBSE_CLOUD_REQUEST,0,preq,NULL,NULL); } /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, preq, process_hold_reply)) != 0) { *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, NULL); } else { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK); /* fill in log_buffer again, since relay_to_mom changed it */ sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user, preq->rq_host); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); } reply_ack(preq); } } /* END req_holdjob() */