void stat_update( struct batch_request *preq, struct stat_cntl *cntl) { job *pjob; struct batch_reply *preply; struct brp_status *pstatus; svrattrl *sattrl; int oldsid; int bad = 0; time_t time_now = time(NULL); char *msg_ptr = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; preply = &preq->rq_reply; if (preply->brp_un.brp_txt.brp_str != NULL) { msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL); if (msg_ptr != NULL) msg_ptr += strlen(PBS_MSG_EQUAL); } if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) { pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status); while (pstatus != NULL) { if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr); oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long; modify_job_attr( pjob, sattrl, ATR_DFLAG_MGWR | ATR_DFLAG_SvWR, &bad); if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long) { /* first save since running job (or the sid has changed), */ /* must save session id */ job_save(pjob, SAVEJOB_FULL, 0); } #ifdef USESAVEDRESOURCES else { /* save so we can recover resources used */ job_save(pjob, SAVEJOB_FULL, 0); } #endif /* USESAVEDRESOURCES */ pjob->ji_momstat = time_now; } pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink); } /* END while (pstatus != NULL) */ } /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */ else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) && (preply->brp_code == PBSE_UNKJOBID) && (msg_ptr != NULL) && (!strcmp(msg_ptr, preq->rq_ind.rq_status.rq_id))) { /* we sent a stat request, but mom says it doesn't know anything about the job */ if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL) { /* job really isn't running any more - mom doesn't know anything about it this can happen if a diskless node reboots and the mom_priv/jobs directory is cleared, set its state to queued so job_abt doesn't think it is still running */ mutex_mgr job_mutex(pjob->ji_mutex, true); snprintf(log_buf, sizeof(log_buf), "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld", preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE); rel_resc(pjob); job_mutex.set_unlock_on_exit(false); job_abt(&pjob, "Job does not exist on node"); /* TODO, if the job is rerunnable we should set its state back to queued */ } } else { snprintf(log_buf, sizeof(log_buf), "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id); log_err(preply->brp_code, __func__, log_buf); } cntl->sc_conn = -1; if (cntl->sc_post) cntl->sc_post(cntl); /* continue where we left off */ /* If sc_post has a value it is: * req_stat_job_step2 * if so, it expects cntl to be free'd after the call */ free(cntl); /* a bit of a kludge but its saves an extra func */ return; } /* END stat_update() */
int modify_job( void **j, /* O */ svrattrl *plist, /* I */ struct batch_request *preq, /* I */ int checkpoint_req, /* I */ int flag) /* I */ { int bad = 0; int i; int newstate; int newsubstate; resource_def *prsd; int rc; int sendmom = 0; int copy_checkpoint_files = FALSE; char log_buf[LOCAL_LOG_BUF_SIZE]; struct batch_request *dup_req = NULL; job *pjob = (job *)*j; if (pjob == NULL) { sprintf(log_buf, "job structure is NULL"); log_err(PBSE_IVALREQ, __func__, log_buf); return(PBSE_IVALREQ); } /* cannot be in exiting or transit, exiting has already been checked */ if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify job '%s' in transit\n", pjob->ji_qs.ji_jobid); log_err(PBSE_BADSTATE, __func__, log_buf); return(PBSE_BADSTATE); } if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) && (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)) { /* May need to request copy of the checkpoint file from mom */ copy_checkpoint_files = TRUE; if (checkpoint_req == CHK_HOLD) { sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid); pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; job_save(pjob, SAVEJOB_QUICK, 0); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); /* remove checkpoint restart file if there is one */ if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET) { cleanup_restart_file(pjob); } } } /* if job is running, special checks must be made */ /* NOTE: must determine if job exists down at MOM - this will occur if job is running, job is held, or job was held and just barely released (ie qhold/qrls) */ /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS * --FIX and uncomment once we know what is really going on. * * We now know that ji_destin gets set on a qmove and that the mom does not * have the job at that point. * if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) || ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) || ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0'))) */ if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { while (plist != NULL) { /* is the pbs_attribute modifiable in RUN state ? */ i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST); if ((i < 0) || ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify attribute '%s' while running\n", plist->al_name); log_err(PBSE_MODATRRUN, __func__, log_buf); return PBSE_MODATRRUN; } /* NOTE: only explicitly specified job attributes are routed down to MOM */ if (i == JOB_ATR_resource) { /* is the specified resource modifiable while */ /* the job is running */ prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size); if (prsd == NULL) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Unknown attribute '%s'\n", plist->al_name); log_err(PBSE_UNKRESC, __func__, log_buf); return(PBSE_UNKRESC); } if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify attribute '%s' while running\n", plist->al_name); log_err(PBSE_MODATRRUN, __func__, log_buf); return(PBSE_MODATRRUN); } sendmom = 1; } /* else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables)) { sendmom = 1; } */ plist = (svrattrl *)GET_NEXT(plist->al_link); } } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */ /* modify the job's attributes */ bad = 0; plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad); if (rc) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot set attributes for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); if (rc == PBSE_JOBNOTFOUND) *j = NULL; return(rc); } /* Reset any defaults resource limit which might have been unset */ set_resc_deflt(pjob, NULL, FALSE); /* if job is not running, may need to change its state */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { svr_evaljobstate(pjob, &newstate, &newsubstate, 0); svr_setjobstate(pjob, newstate, newsubstate, FALSE); } else { job_save(pjob, SAVEJOB_FULL, 0); } sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); /* if a resource limit changed for a running job, send to MOM */ if (sendmom) { /* if the NO_MOM_RELAY flag is set the calling function will call relay_to_mom so we do not need to do it here */ if (flag != NO_MOM_RELAY) { /* The last number is unused unless this is an array */ if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req))) { if (pjob != NULL) { snprintf(log_buf,sizeof(log_buf), "Unable to relay information to mom for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); } return(rc); /* unable to get to MOM */ } } return(PBSE_RELAYED_TO_MOM); } if (copy_checkpoint_files) { struct batch_request *momreq = 0; momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT); if (momreq != NULL) { /* have files to copy */ momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid); /* The momreq is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ if (checkpoint_req == CHK_HOLD) { rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold); } else { rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done); } if (rc != 0) { if (pjob != NULL) { snprintf(log_buf,sizeof(log_buf), "Unable to relay information to mom for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); } return(PBSE_NONE); /* come back when mom replies */ } } else { log_err(-1, __func__, "Failed to get batch request"); } } return(PBSE_NONE); } /* END modify_job() */
void stat_update( struct batch_request *preq, struct stat_cntl *cntl) { job *pjob; struct batch_reply *preply; struct brp_status *pstatus; svrattrl *sattrl; int oldsid; int bad = 0; time_t time_now = time(NULL); preply = &preq->rq_reply; if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) { pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status); while (pstatus != NULL) { if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL) { sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr); oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long; modify_job_attr( pjob, sattrl, ATR_DFLAG_MGWR | ATR_DFLAG_SvWR, &bad); if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long) { /* first save since running job (or the sid has changed), */ /* must save session id */ job_save(pjob, SAVEJOB_FULL, 0); } #ifdef USESAVEDRESOURCES else { /* save so we can recover resources used */ job_save(pjob, SAVEJOB_FULL, 0); } #endif /* USESAVEDRESOURCES */ pjob->ji_momstat = time_now; unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink); } /* END while (pstatus != NULL) */ } /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */ else { if (preply->brp_code == PBSE_UNKJOBID) { /* we sent a stat request, but mom says it doesn't know anything about the job */ if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL) { /* job really isn't running any more - mom doesn't know anything about it this can happen if a diskless node reboots and the mom_priv/jobs directory is cleared, set its state to queued so job_abt doesn't think it is still running */ svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE); rel_resc(pjob); job_abt(&pjob, "Job does not exist on node"); /* TODO, if the job is rerunnable we should set its state back to queued */ } } } cntl->sc_conn = -1; /* MUTSU - Unlock job here? */ if (cntl->sc_post) cntl->sc_post(cntl); /* continue where we left off */ /* If sc_post has a value it is: * req_stat_job_step2 * if so, it expects cntl to be free'd after the call */ free(cntl); /* a bit of a kludge but its saves an extra func */ return; } /* END stat_update() */
void req_modifyjob(struct batch_request *preq) { int add_to_am_list = 0; /* if altered during sched cycle */ int bad = 0; int jt; /* job type */ int newstate; int newsubstate; resource_def *outsideselect = NULL; job *pjob; svrattrl *plist; resource *presc; resource_def *prsd; int rc; int running = 0; int sendmom = 0; char hook_msg[HOOK_MSG_SIZE]; int mod_project = 0; pbs_sched *psched; switch (process_hooks(preq, hook_msg, sizeof(hook_msg), pbs_python_set_interrupt)) { case 0: /* explicit reject */ reply_text(preq, PBSE_HOOKERROR, hook_msg); return; case 1: /* explicit accept */ if (recreate_request(preq) == -1) { /* error */ /* we have to reject the request, as 'preq' */ /* may have been partly modified */ strcpy(hook_msg, "modifyjob event: rejected request"); log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK, LOG_ERR, "", hook_msg); reply_text(preq, PBSE_HOOKERROR, hook_msg); return; } break; case 2: /* no hook script executed - go ahead and accept event*/ break; default: log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK, LOG_INFO, "", "modifyjob event: accept req by default"); } if (pseldef == NULL) /* do one time to keep handy */ pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size); pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt); if (pjob == NULL) return; if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) { req_reject(PBSE_IVALREQ, 0, preq); return; } psched = find_sched_from_sock(preq->rq_conn); /* allow scheduler to modify job */ if (psched == NULL) { /* provisioning job is not allowed to be modified */ if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) { req_reject(PBSE_BADSTATE, 0, preq); return; } } /* cannot be in exiting or transit, exiting has already be checked */ if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { req_reject(PBSE_BADSTATE, 0, preq); return; } plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); if (plist == NULL) { /* nothing to do */ reply_ack(preq); return; } /* * Special checks must be made: * if during a scheduling cycle and certain attributes are altered, * make a note of the job to prevent it from being run now; * if job is running, only certain attributes/resources can be * altered. */ if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { running = 1; } while (plist) { int i; i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST); /* * Is the attribute being altered one which could change * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling * cycle is in progress, then set flag to add the job to list * of jobs which cannot be run in this cycle. * If the scheduler itself sends a modify job request, * no need to delay the job until next cycle. */ if ((psched == NULL) && (scheduler_jobs_stat) && (job_attr_def[i].at_flags & ATR_DFLAG_SCGALT)) add_to_am_list = 1; /* Is the attribute modifiable in RUN state ? */ if (i < 0) { reply_badattr(PBSE_NOATTR, 1, plist, preq); return; } if ((running == 1) && ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) { reply_badattr(PBSE_MODATRRUN, 1, plist, preq); return; } if (i == (int)JOB_ATR_resource) { prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size); if (prsd == 0) { reply_badattr(PBSE_UNKRESC, 1, plist, preq); return; } /* is the specified resource modifiable while */ /* the job is running */ if (running) { if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) { reply_badattr(PBSE_MODATRRUN, 1, plist, preq); return; } sendmom = 1; } /* should the resource be only in a select spec */ if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect && plist->al_atopl.value && plist->al_atopl.value[0]) { /* if "-lresource" is set and has non-NULL value, ** remember as potential bad resource ** if this appears along "select". */ outsideselect = prsd; } } if (strcmp(plist->al_name, ATTR_project) == 0) { mod_project = 1; } else if ((strcmp(plist->al_name, ATTR_runcount) == 0) && ((plist->al_flags & ATR_VFLAG_HOOK) == 0) && (plist->al_value != NULL) && (plist->al_value[0] != '\0') && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) && (atol(plist->al_value) < \ pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) { sprintf(log_buffer, "regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld", preq->rq_user, preq->rq_host, ATTR_runcount, pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long, atol(plist->al_value)); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_PERM, 0, preq); return; } plist = (svrattrl *)GET_NEXT(plist->al_link); } if (outsideselect) { presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource], pseldef); if (presc && ((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) { /* select is not a default, so reject qalter */ resc_in_err = strdup(outsideselect->rs_name); req_reject(PBSE_INVALJOBRESC, 0, preq); return; } } /* modify the jobs attributes */ bad = 0; plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad); if (rc) { if (pjob->ji_clterrmsg) reply_text(preq, rc, pjob->ji_clterrmsg); else reply_badattr(rc, bad, plist, preq); return; } /* If certain attributes modified and if in scheduling cycle */ /* then add to list of jobs which cannot be run in this cycle */ if (add_to_am_list) am_jobs_add(pjob); /* see req_runjob() */ /* check if project attribute was requested to be modified to */ /* be the default project value */ if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \ ATR_VFLAG_SET)) { if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str, PBS_DEFAULT_PROJECT) == 0) { sprintf(log_buffer, msg_defproject, ATTR_project, PBS_DEFAULT_PROJECT); #ifdef NAS /* localmod 107 */ log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); #else log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); #endif /* localmod 107 */ } } if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) { presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource], pseldef); if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) { /* changing Resource_List and select is a default */ /* clear "select" so it is rebuilt inset_resc_deflt */ pseldef->rs_free(&presc->rs_value); } } /* Reset any defaults resource limit which might have been unset */ if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) { req_reject(rc, 0, preq); return; } /* if job is not running, may need to change its state */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { svr_evaljobstate(pjob, &newstate, &newsubstate, 0); (void)svr_setjobstate(pjob, newstate, newsubstate); } else { (void)job_save(pjob, SAVEJOB_FULL); } (void)sprintf(log_buffer, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); /* if a resource limit changed for a running job, send to MOM */ if (sendmom) { rc = relay_to_mom(pjob, preq, post_modify_req); if (rc) req_reject(rc, 0, preq); /* unable to get to MOM */ return; } reply_ack(preq); }
void update_job_data( struct pbsnode *np, /* I */ const char *jobstring_in) /* I (changed attributes sent by mom) */ { char *jobdata; char *jobdata_ptr; char *jobidstr; char *attr_name; char *attr_value; char log_buf[LOCAL_LOG_BUF_SIZE]; job *pjob = NULL; int on_node = FALSE; if ((jobstring_in == NULL) || (!isdigit(*jobstring_in))) { /* NO-OP */ return; } /* FORMAT <JOBID>:<atrtributename=value>,<atrtributename=value>... */ jobdata = strdup(jobstring_in); jobdata_ptr = jobdata; jobidstr = threadsafe_tokenizer(&jobdata_ptr, ":"); if ((jobidstr != NULL) && isdigit(*jobidstr)) { if (strstr(jobidstr, server_name) != NULL) { on_node = is_job_on_node(np, job_mapper.get_id(jobidstr)); pjob = svr_find_job(jobidstr, TRUE); if (pjob != NULL) { int bad; svrattrl tA; mutex_mgr job_mutex(pjob->ji_mutex, true); /* job exists, so get the attributes and update them */ attr_name = threadsafe_tokenizer(&jobdata_ptr, "="); while (attr_name != NULL) { attr_value = threadsafe_tokenizer(&jobdata_ptr, ","); if (LOGLEVEL >= 9) { sprintf(log_buf, "Mom sent changed attribute %s value %s for job %s", attr_name, attr_value, pjob->ji_qs.ji_jobid); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); } memset(&tA, 0, sizeof(tA)); tA.al_name = attr_name; tA.al_resc = (char *)""; tA.al_value = attr_value; tA.al_op = SET; modify_job_attr( pjob, &tA, /* I: ATTR_sched_hint - svrattrl */ ATR_DFLAG_MGWR | ATR_DFLAG_SvWR, &bad); attr_name = threadsafe_tokenizer(&jobdata_ptr, "="); } } if (on_node == FALSE) { /* job is reported by mom but server has no record of job */ sprintf(log_buf, "stray job %s reported on %s", jobidstr, np->nd_name); log_err(-1, __func__, log_buf); } } } free(jobdata); } /* END update_job_data() */
static void stat_update( struct work_task *pwt) { struct stat_cntl *cntl; job *pjob; struct batch_request *preq; struct batch_reply *preply; struct brp_status *pstatus; svrattrl *sattrl; int oldsid; preq = pwt->wt_parm1; preply = &preq->rq_reply; cntl = preq->rq_extra; if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) { pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status); while (pstatus != NULL) { if ((pjob = find_job(pstatus->brp_objname))) { sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr); oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long; modify_job_attr( pjob, sattrl, ATR_DFLAG_MGWR | ATR_DFLAG_SvWR, &bad); if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long) { /* first save since running job (or the sid has changed), */ /* must save session id */ job_save(pjob, SAVEJOB_FULL); svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL); } #ifdef USESAVEDRESOURCES else { /* save so we can recover resources used */ job_save(pjob, SAVEJOB_FULL); } #endif /* USESAVEDRESOURCES */ pjob->ji_momstat = time_now; } pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink); } /* END while (pstatus != NULL) */ } /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */ else { if (preply->brp_code == PBSE_UNKJOBID) { /* we sent a stat request, but mom says it doesn't know anything about the job */ if ((pjob = find_job(preq->rq_ind.rq_status.rq_id))) { /* job really isn't running any more - mom doesn't know anything about it this can happen if a diskless node reboots and the mom_priv/jobs directory is cleared, set its state to queued so job_abt doesn't think it is still running */ svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT); rel_resc(pjob); job_abt(&pjob, "Job does not exist on node"); /* TODO, if the job is rerunnable we should set its state back to queued */ } } } release_req(pwt); cntl->sc_conn = -1; if (cntl->sc_post) cntl->sc_post(cntl); /* continue where we left off */ else free(cntl); /* a bit of a kludge but its saves an extra func */ return; } /* END stat_update() */