/*
 * req_messagejob()
 *
 * Services a "message job" batch request.  The job named in the request is
 * looked up with chk_job_request(); when that returns NULL this function
 * simply returns.  The job must be in JOB_STATE_RUNNING, otherwise the
 * request is rejected with PBSE_BADSTATE.
 *
 * A copy of the request is relayed to the MOM.  On relay failure the
 * original request is rejected with the relay error and the copy is freed
 * here; on success the copy is handed to post_message_req() and the
 * original request is freed.
 *
 * The job mutex is managed by a mutex_mgr constructed with a second
 * argument of true; if relay_to_mom() clears the job pointer the manager is
 * told not to touch the lock when it goes out of scope.
 *
 * Always returns NULL.
 */
void *req_messagejob(

  batch_request *preq) /* I */

  {
  batch_request *relay_copy = NULL;
  int            status;
  job           *target;

  target = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq);

  if (target == NULL)
    return(NULL);

  mutex_mgr job_mutex(target->ji_mutex, true);

  /* only running jobs can be sent a message */
  if (target->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(NULL);
    }

  status = copy_batchrequest(&relay_copy, preq, 0, -1);

  if (status != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  else
    {
    /* forward the duplicated request to MOM */
    status = relay_to_mom(&target, relay_copy, NULL);

    if (status == PBSE_NONE)
      {
      /* MOM accepted the request: finish up in post_message_req() */
      post_message_req(relay_copy);
      free_br(preq);
      }
    else
      {
      /* unable to get to MOM */
      req_reject(status, 0, preq, NULL, NULL);
      free_br(relay_copy);
      }
    }

  /* relay_to_mom() may have cleared the job pointer; in that case the
   * mutex manager must not act on the lock at scope exit */
  if (target == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
void *req_messagejob( void *vp) { struct batch_request *preq = (struct batch_request *)vp; job *pjob; int rc; struct batch_request *dup_req = NULL; if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL) return(NULL); /* the job must be running */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); return(NULL); } if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL); } /* pass the request on to MOM */ /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, post_message_req)) != 0) req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */ else free_br(preq); /* After MOM acts and replies to us, we pick up in post_message_req() */ if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return(NULL); } /* END req_messagejob() */
/*
 * modify_whole_array()
 *
 * Applies a modify request to every job of a job array.
 *
 * Slots whose job id is NULL are skipped.  A job id that svr_find_job()
 * cannot resolve is freed and its slot cleared.  Each remaining job is
 * modified through modify_job() with NO_MOM_RELAY, so that any relay to the
 * MOM is issued from here with a per-job copy of the request and
 * post_modify_arrayreq as the callback.
 *
 * preq->rq_refcount is incremented once per relayed job plus once more for
 * the first relay; that extra count is dropped after the loop and the
 * request is freed if the count reaches zero.
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param plist          (I)   attribute list forwarded to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint request code forwarded to modify_job()
 *
 * @return PBSE_RELAYED_TO_MOM when at least one job was relayed, the error
 *         code of a failed copy/relay (loop aborted at that job), otherwise
 *         the result of the last modify_job() call (0 if none was made).
 *
 * @SEE req_modify_array PARENT
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   relayed = 0;
  int   rc = 0;
  int   idx;

  for (idx = 0; idx < pa->ai_qs.array_size; idx++)
    {
    struct batch_request *array_req = NULL;
    job                  *pjob;

    if (pa->job_ids[idx] == NULL)
      continue;

    pjob = svr_find_job(pa->job_ids[idx], FALSE);

    if (pjob == NULL)
      {
      /* the job no longer exists: drop its id from the array */
      free(pa->job_ids[idx]);
      pa->job_ids[idx] = NULL;

      continue;
      }

    /* NO_MOM_RELAY keeps modify_job from calling relay_to_mom itself */
    rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

    if (rc == PBSE_RELAYED_TO_MOM)
      {
      /* modify_job wants the change sent to MOM; do it with our own copy */
      rc = copy_batchrequest(&array_req, preq, 0, idx);

      if (rc != 0)
        {
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(rc);
        }

      /* one reference per relayed job, plus one extra taken on the
       * first relay that is released after the loop */
      preq->rq_refcount += (relayed == 0) ? 2 : 1;
      relayed++;

      rc = relay_to_mom(&pjob, array_req, post_modify_arrayreq);

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);

          unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    } /* END foreach job in array */

  if (relayed == 0)
    return(rc);

  /* release the extra reference taken on the first relay */
  preq->rq_refcount--;

  if (preq->rq_refcount == 0)
    {
    free_br(preq);
    }

  return(PBSE_RELAYED_TO_MOM);
  } /* END modify_whole_array() */
/*
 * modify_job()
 *
 * Applies the attribute changes carried by a modify request to one job and,
 * where required, relays the request (or a checkpoint-file copy request)
 * to the MOM.
 *
 * Steps, in order:
 *  1. Reject a NULL job (PBSE_IVALREQ) or a job in JOB_STATE_TRANSIT
 *     (PBSE_BADSTATE).
 *  2. For CHK_HOLD / CHK_CONT on a job whose substate is
 *     JOB_SUBSTATE_RUNNING, remember that checkpoint files may need to be
 *     copied; for CHK_HOLD additionally move the job to JOB_SUBSTATE_RERUN,
 *     save it, and remove a restart file if one is recorded.
 *  3. For a running job, verify that every attribute in plist (and every
 *     resource under JOB_ATR_resource) carries ATR_DFLAG_ALTRUN.  A changed
 *     resource marks the request as needing to go to the MOM.
 *  4. Apply the attributes from preq with modify_job_attr(), reset default
 *     resource limits, then re-evaluate the job state (non-running job) or
 *     save the job (running job).
 *  5. Relay to the MOM when a resource changed, unless flag is
 *     NO_MOM_RELAY, in which case the caller performs the relay.
 *  6. Otherwise, when step 2 asked for it, relay a checkpoint copy request.
 *
 * @param j              (I/O) address of the job pointer.  Set to NULL when
 *                             modify_job_attr() reports PBSE_JOBNOTFOUND or
 *                             when relay_to_mom() clears the job pointer.
 * @param plist          (I)   attributes to validate for a running job
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   CHK_HOLD, CHK_CONT, or another value
 * @param flag           (I)   NO_MOM_RELAY suppresses the relay in step 5
 *
 * @return PBSE_RELAYED_TO_MOM when the request must be / has been relayed,
 *         PBSE_NONE on plain success, otherwise an error code.
 */
int modify_job(

  void                **j,              /* O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req, /* I */
  int                   flag)           /* I */

  {
  int                   bad = 0;
  int                   i;
  int                   newstate;
  int                   newsubstate;
  resource_def         *prsd;
  int                   rc;
  int                   sendmom = 0;
  int                   copy_checkpoint_files = FALSE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;
  job                  *pjob = (job *)*j;

  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);

    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */
  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf, sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */
    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {
      /* bounded write: the job id is formatted into a fixed-size buffer */
      snprintf(log_buf, sizeof(log_buf),
        "setting jobsubstate for %s to RERUN\n",
        pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }
      }
    }

  /* if job is running, special checks must be made */

  /* NOTE: must determine if job exists down at MOM - this will occur if
     job is running, job is held, or job was held and just barely
     released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
      ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
  */

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */
      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf, sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);

        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE: only explicitly specified job attributes are routed down to MOM */
      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */
        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf, sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf, sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);

          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }

      /*
      else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
        {
        sendmom = 1;
        }
      */

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */
  bad = 0;

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf, sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);

    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */
  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  /* bounded write: user and host come from the request */
  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* if a resource limit changed for a running job, send to MOM */
  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        /* Nothing was sent to MOM, so do not claim the request was
         * relayed: report the failure to the caller instead. */
        snprintf(log_buf, sizeof(log_buf),
          "Unable to copy request to relay to mom for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(rc, __func__, log_buf);

        return(rc);
        }

      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      rc = relay_to_mom(&pjob, dup_req, post_modify_req);

      /* relay_to_mom may clear the job pointer; pass that on so the
       * caller does not keep using (or unlocking) a stale job */
      if (pjob == NULL)
        *j = NULL;

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;

    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      /* see above: keep the caller's pointer in step with ours */
      if (pjob == NULL)
        *j = NULL;

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        /* NOTE(review): a failed relay is logged but still reported as
         * PBSE_NONE, exactly as before - confirm this is intended. */
        return(PBSE_NONE); /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
int req_signaljob( void *vp) /* I */ { struct batch_request *preq = (struct batch_request *)vp; job *pjob; int rc; char log_buf[LOCAL_LOG_BUF_SIZE]; struct batch_request *dup_req = NULL; /* preq free'd in error cases */ if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0) { return(PBSE_NONE); } /* the job must be running */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); return(PBSE_NONE); } /* Special pseudo signals for suspend and resume require op/mgr */ if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) || !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND)) { if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0) { /* for suspend/resume, must be mgr/op */ req_reject(PBSE_PERM, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL); return(PBSE_NONE); } } /* save job ptr for post_signal_req() */ preq->rq_extra = strdup(pjob->ji_qs.ji_jobid); /* FIXME: need a race-free check for available free subnodes before * resuming a suspended job */ #ifdef DONOTSUSPINTJOB /* interactive jobs don't resume correctly so don't allow a suspend */ if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) && (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) && (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0)) { req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL); return(PBSE_NONE); } #endif if (LOGLEVEL >= 6) { sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr); log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf); } /* send reply for asynchronous suspend */ if (preq->rq_type == PBS_BATCH_AsySignalJob) { reply_ack(preq); preq->rq_noreply = TRUE; } /* pass the request on to MOM */ if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { 
req_reject(rc, 0, preq, NULL, "can not allocate memory"); unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else { rc = relay_to_mom(&pjob, dup_req, NULL); if (pjob != NULL) unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL); if (rc != PBSE_NONE) { free_br(dup_req); req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */ } else { post_signal_req(dup_req); free_br(preq); } } /* If successful we ack after mom replies to us, we pick up in post_signal_req() */ return(PBSE_NONE); } /* END req_signaljob() */
/*
 * modify_array_range()
 *
 * Applies a modify request to the jobs of an array selected by a range
 * string.
 *
 * The range is parsed into a list of array_request_node entries.  For every
 * index in each node that is inside the array and has a job, modify_job()
 * is called with NO_MOM_RELAY; when it answers PBSE_RELAYED_TO_MOM a
 * per-job copy of the request is relayed to the MOM from here with
 * post_modify_arrayreq as the callback.
 *
 * preq->rq_refcount is incremented once per relayed job plus once more for
 * the first relay; that extra count is dropped after the loop and the
 * request is freed if the count reaches zero.
 *
 * Each parsed node is freed once processed.  When processing stops early
 * because of an error, the current node and all nodes after it are freed
 * as well, so the parsed list is never leaked.
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param range          (I)   range specification to parse
 * @param plist          (I)   attribute list forwarded to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint request code forwarded to modify_job()
 *
 * @return FAILURE when the range cannot be parsed, the error code of a
 *         failed copy/relay, PBSE_RELAYED_TO_MOM when at least one job was
 *         relayed, otherwise PBSE_NONE.
 */
int modify_array_range(

  job_array            *pa,             /* I/O */
  char                 *range,          /* I */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  char                id[] = "modify_array_range";
  tlist_head          tl;
  int                 i;
  int                 rc = PBSE_NONE;
  int                 mom_relay = 0;
  array_request_node *rn;
  array_request_node *to_free;

  CLEAR_HEAD(tl);

  if (parse_array_request(range, &tl) > 0)
    {
    /* don't hold the jobs if range error */
    return(FAILURE);
    }

  /* hold just that range from the array */
  rn = (array_request_node *)GET_NEXT(tl);

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      if ((i >= pa->ai_qs.array_size) || (pa->jobs[i] == NULL))
        continue;

      rc = modify_job(pa->jobs[i], plist, preq, checkpoint_req, NO_MOM_RELAY);

      if (rc == PBSE_RELAYED_TO_MOM)
        {
        struct batch_request *array_req = NULL;

        /* We told modify_job not to call relay_to_mom so we need to
           contact the mom */
        rc = copy_batchrequest(&array_req, preq, 0, i);

        if (rc != 0)
          {
          goto release_and_fail;
          }

        preq->rq_refcount++;

        if (mom_relay == 0)
          {
          preq->rq_refcount++;
          }

        mom_relay++;

        if ((rc = relay_to_mom(pa->jobs[i], array_req, post_modify_arrayreq)))
          {
          snprintf(log_buffer, sizeof(log_buffer),
            "Unable to relay information to mom for job '%s'\n",
            pa->jobs[i]->ji_qs.ji_jobid);

          log_err(rc, id, log_buffer);

          goto release_and_fail; /* unable to get to MOM */
          }
        }
      }

    /* release mem */
    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  if (mom_relay)
    {
    /* drop the extra reference taken on the first relay */
    preq->rq_refcount--;

    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);

release_and_fail:

  /* free the node being processed and every node still queued behind it;
   * the list is local to this call so nothing else can reach them */
  while (rn != NULL)
    {
    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  return(rc);
  } /* END modify_array_range() */
/*
 * modify_array_range()
 *
 * Applies a modify request to the jobs of an array selected by a range
 * string.
 *
 * The range is parsed into a list of array_request_node entries.  For every
 * index in each node that is inside the array and has a job id:
 *   - a job id that svr_find_job() cannot resolve is freed and cleared;
 *   - otherwise the array mutex is released, modify_job() is called with
 *     NO_MOM_RELAY, and the array is re-acquired through get_jobs_array();
 *   - when modify_job() answers PBSE_RELAYED_TO_MOM, a per-job copy of the
 *     request is relayed to the MOM and, on success, handed to
 *     post_modify_arrayreq().
 *
 * preq->rq_refcount is incremented once per relayed job plus once more for
 * the first relay; that extra count is dropped after the loop and the
 * request is freed if the count reaches zero.
 *
 * On every path that leaves the loop early the job mutex is released (when
 * the job pointer is still valid) and the unprocessed range nodes are
 * freed.
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param range          (I)   range specification to parse
 * @param plist          (I)   attribute list forwarded to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint request code forwarded to modify_job()
 *
 * @return FAILURE when the range cannot be parsed, the error code of a
 *         failed copy/relay, PBSE_RELAYED_TO_MOM when at least one job was
 *         relayed, otherwise PBSE_NONE.
 */
int modify_array_range(

  job_array            *pa,             /* I/O */
  char                 *range,          /* I */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc = PBSE_NONE;
  int                 mom_relay = 0;
  job                *pjob;
  array_request_node *rn;
  array_request_node *to_free;

  CLEAR_HEAD(tl);

  if (parse_array_request(range, &tl) > 0)
    {
    /* don't hold the jobs if range error */
    return(FAILURE);
    }

  /* hold just that range from the array */
  rn = (array_request_node *)GET_NEXT(tl);

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      if ((i >= pa->ai_qs.array_size) || (pa->job_ids[i] == NULL))
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        /* the job no longer exists: drop its id from the array */
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;

        continue;
        }

      /* the array mutex is released while the job is being modified */
      pthread_mutex_unlock(pa->ai_mutex);

      rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

      /* NOTE(review): pa is used below and on the next iteration without
       * checking the value returned here - confirm it cannot be NULL. */
      pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        pa->job_ids[i] = NULL;

        continue;
        }

      if (rc != PBSE_RELAYED_TO_MOM)
        {
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

        continue;
        }

      {
      struct batch_request *array_req = NULL;

      /* We told modify_job not to call relay_to_mom so we need to
         contact the mom */
      if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
        {
        /* do not leave the job locked when bailing out */
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        goto release_and_fail;
        }

      preq->rq_refcount++;

      if (mom_relay == 0)
        {
        preq->rq_refcount++;
        }

      mom_relay++;

      /* The array_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if ((rc = relay_to_mom(&pjob, array_req, NULL)))
        {
        /* relay_to_mom may clear the job pointer, so only touch the job
         * when it is still ours */
        if (pjob != NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);

          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }

        goto release_and_fail; /* unable to get to MOM */
        }

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

      post_modify_arrayreq(array_req);
      }
      }

    /* release mem */
    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  if (mom_relay)
    {
    /* drop the extra reference taken on the first relay */
    preq->rq_refcount--;

    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);

release_and_fail:

  /* free the node being processed and every node still queued behind it;
   * the list is local to this call so nothing else can reach them */
  while (rn != NULL)
    {
    to_free = rn;
    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  return(rc);
  } /* END modify_array_range() */
void *req_checkpointjob( void *vp) { struct batch_request *preq = (struct batch_request *)vp; job *pjob; int rc; pbs_attribute *pattr; char log_buf[LOCAL_LOG_BUF_SIZE]; struct batch_request *dup_req = NULL; if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL) { return(NULL); } pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { req_reject(rc, 0, preq, NULL, "failure to allocate memory"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { req_reject(rc, 0, preq, NULL, NULL); free_br(dup_req); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK, 0); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); pjob = NULL; } process_checkpoint_reply(dup_req); } } else { /* Job does not have checkpointing enabled, so reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable"); } if (pjob != NULL) unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); return(NULL); } /* END req_checkpointjob() */
int req_holdjob( void *vp) /* I */ { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; pbs_attribute temphold; pbs_attribute *pattr; struct batch_request *preq = (struct batch_request *)vp; char log_buf[LOCAL_LOG_BUF_SIZE]; struct batch_request *dup_req = NULL; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return(PBSE_NONE); } /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); return(PBSE_NONE); } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL); return(PBSE_NONE); } hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { req_reject(rc, 0, preq, NULL, "memory allocation failure"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { free_br(dup_req); *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, NULL); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; 
job_save(pjob, SAVEJOB_QUICK, 0); /* fill in log_buf again, since relay_to_mom changed it */ sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL); pjob = NULL; req_reject(rc, 0, preq, NULL, "relay to mom failed"); } process_hold_reply(dup_req); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); } reply_ack(preq); } if (pjob != NULL) unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL); return(PBSE_NONE); } /* END req_holdjob() */
/*
 * modify_whole_array()
 *
 * Applies a modify request to every job of a job array.
 *
 * Empty job slots are skipped.  Each job is modified through modify_job()
 * with NO_MOM_RELAY, so that any relay to the MOM is issued from here with
 * a per-job copy of the request, the job's recorded MOM address, and
 * post_modify_arrayreq as the callback.
 *
 * preq->rq_refcount is incremented once per relayed job plus once more for
 * the first relay; that extra count is dropped after the loop and the
 * request is freed if the count reaches zero.
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param plist          (I)   attribute list forwarded to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint request code forwarded to modify_job()
 *
 * @return PBSE_RELAYED_TO_MOM when at least one job was relayed, the error
 *         code of a failed copy/relay (loop aborted at that job), otherwise
 *         the result of the last modify_job() call (0 if none was made).
 *
 * @SEE req_modify_array PARENT
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  char routine[] = "modify_whole_array";
  int  relayed = 0;
  int  rc = 0;
  int  idx;

  for (idx = 0; idx < pa->ai_qs.array_size; idx++)
    {
    struct batch_request *array_req = NULL;

    if (pa->jobs[idx] == NULL)
      continue;

    /* NO_MOM_RELAY keeps modify_job from calling relay_to_mom itself */
    rc = modify_job(pa->jobs[idx], plist, preq, checkpoint_req, NO_MOM_RELAY);

    if (rc != PBSE_RELAYED_TO_MOM)
      continue;

    /* modify_job wants the change sent to MOM; do it with our own copy */
    rc = copy_batchrequest(&array_req, preq, 0, idx);

    if (rc != 0)
      {
      return(rc);
      }

    /* one reference per relayed job, plus one extra taken on the first
     * relay that is released after the loop */
    preq->rq_refcount += (relayed == 0) ? 2 : 1;
    relayed++;

    rc = relay_to_mom(
           pa->jobs[idx]->ji_qs.ji_un.ji_exect.ji_momaddr,
           array_req,
           post_modify_arrayreq);

    if (rc != 0)
      {
      snprintf(log_buffer, sizeof(log_buffer),
        "Unable to relay information to mom for job '%s'\n",
        pa->jobs[idx]->ji_qs.ji_jobid);

      log_err(rc, routine, log_buffer);

      return(rc); /* unable to get to MOM */
      }
    }

  if (relayed == 0)
    return(rc);

  /* release the extra reference taken on the first relay */
  preq->rq_refcount--;

  if (preq->rq_refcount == 0)
    {
    free_br(preq);
    }

  return(PBSE_RELAYED_TO_MOM);
  } /* END modify_whole_array() */