static void post_rerun(struct work_task *pwt) { job *pjob; struct batch_request *preq; preq = (struct batch_request *)pwt->wt_parm1; if (preq->rq_reply.brp_code != 0) { if ((pjob = find_job(preq->rq_ind.rq_signal.rq_jid)) != NULL) { (void)sprintf(log_buffer, "rerun signal reject by mom: %d", preq->rq_reply.brp_code); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, preq->rq_ind.rq_signal.rq_jid, log_buffer); if ((preq->rq_reply.brp_code == PBSE_UNKJOBID) && (preq->rq_extra == 0)) { pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3; discard_job(pjob, "Force rerun", 1); force_reque(pjob); } } } release_req(pwt); return; }
static void post_chkpt(struct work_task *ptask) { job *pjob; struct batch_request *preq; preq = (struct batch_request *)ptask->wt_parm1; pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (!preq || !pjob) return; if (preq->rq_reply.brp_code == 0) { /* checkpointed ok */ if (preq->rq_reply.brp_auxcode) { /* chkpt can be moved */ pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_ChkptMig; pjob->ji_modified = 1; (void)job_save(pjob, SAVEJOB_QUICK); } account_record(PBS_ACCT_CHKPNT, pjob, (char *)0); } else { /* need to try rerun if possible or just abort the job */ if (preq->rq_reply.brp_code != PBSE_CKPBSY) { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT; pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; pjob->ji_modified = 1; (void)job_save(pjob, SAVEJOB_QUICK); if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) rerun_or_kill(pjob, msg_on_shutdown); } } release_req(ptask); }
void chkpt_xfr_hold( struct work_task *ptask) { job *pjob; struct work_task *ptasknew; struct batch_request *preq; preq = (struct batch_request *)ptask->wt_parm1; pjob = (job *)preq->rq_extra; if (LOGLEVEL >= 7) { sprintf(log_buffer, "BLCR copy completed (state is %s-%s)", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate]); LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } release_req(ptask); ptasknew = set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, (void*)pjob); return; } /* END chkpt_xfr_hold() */
/*
 * chkpt_xfr_done - work-task callback that only releases the request
 * attached to the task.
 *
 * No job or request pointer is fetched because nothing is done with either.
 * If real work is added here later, thread protection must be added too.
 */
void chkpt_xfr_done(struct work_task *ptask)
{
  release_req(ptask);
}  /* END chkpt_xfr_done() */
/*
 * chkpt_xfr_done - work-task callback that only releases the request
 * attached to the task.
 *
 * The job and request pointers that used to be fetched here were never used,
 * so they are no longer read; this also avoids dereferencing the request for
 * no purpose before it is released.
 */
void chkpt_xfr_done(struct work_task *ptask)
{
  release_req(ptask);
}  /* END chkpt_xfr_done() */
static void job_delete_nanny( struct work_task *pwt) { job *pjob; char *sigk = "SIGKILL"; struct batch_request *newreq; /* short-circuit if nanny isn't enabled */ if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long) { release_req(pwt); return; } pjob = (job *)pwt->wt_parm1; sprintf(log_buffer, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid); log_err(-1, "job nanny", log_buffer); /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL) { strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); strncpy(newreq->rq_ind.rq_signal.rq_signame, sigk, PBS_SIGNAMESZ); } issue_signal(pjob, sigk, post_job_delete_nanny, newreq); apply_job_delete_nanny(pjob, time_now + 60); return; } /* END job_delete_nanny() */
static void post_checkpoint( struct work_task *ptask) { job *pjob; struct batch_request *preq; preq = (struct batch_request *)ptask->wt_parm1; pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (preq->rq_reply.brp_code == 0) { /* checkpointed ok */ if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) | JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } } else { /* need to try rerun if possible or just abort the job */ if (pjob) { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) rerun_or_kill(pjob, msg_on_shutdown); } } release_req(ptask); } /* END post_checkpoint() */
static void post_doq(struct work_task *pwt) { struct batch_request *preq = (struct batch_request *)pwt->wt_parm1; char *jobid = preq->rq_ind.rq_register.rq_child; char *msg; job *pjob; job *ppjob; struct depend_job pparent; int rc; if (preq->rq_reply.brp_code) { /* request was rejected */ (void)strcpy(log_buffer, msg_regrej); (void)strcat(log_buffer, preq->rq_ind.rq_register.rq_parent); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_buffer); pjob = find_job(jobid); if ((msg = pbse_to_txt(preq->rq_reply.brp_code)) != NULL) { (void)strcat(log_buffer, " "); (void)strcat(log_buffer, msg); } if (pjob) { if (preq->rq_reply.brp_code == PBSE_JOB_MOVED) { /* Creating a separate log buffer because if we end up aborting the submitted job * we don't want to change what goes into accounting log via job_abt */ char log_msg[LOG_BUF_SIZE]; snprintf(log_msg, sizeof(log_msg), "%s, %s", msg_job_moved, "sending dependency request to remote server"); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_msg); ppjob = find_job(preq->rq_ind.rq_register.rq_parent); if(ppjob && (ppjob->ji_qs.ji_state == JOB_STATE_MOVED) && (ppjob->ji_qs.ji_substate == JOB_SUBSTATE_MOVED)) { char *destin; /* job destination should be <remote queue>@<remote server> */ destin = strchr(ppjob->ji_qs.ji_destin, (int)'@'); if (destin != NULL) { strncpy(pparent.dc_child, ppjob->ji_qs.ji_jobid, sizeof(pparent.dc_child)); strncpy(pparent.dc_svr, destin+1, sizeof(pparent.dc_svr)); rc = send_depend_req(pjob, &pparent, preq->rq_ind.rq_register.rq_dependtype, JOB_DEPEND_OP_REGISTER, SYNC_SCHED_HINT_NULL, post_doq); if (rc) { snprintf(log_msg, sizeof(log_msg), "%s", "Failed to send dependency request to remote server, aborting job"); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, jobid, log_msg); check_block(pjob, log_buffer); job_abt(pjob, log_buffer); } } else { /* Ideally if a job is moved, destination can not be empty */ /* If we come across an empty destination, abort the job */ 
check_block(pjob, log_buffer); job_abt(pjob, log_buffer); } } else { check_block(pjob, log_buffer); job_abt(pjob, log_buffer); } } else { check_block(pjob, log_buffer); job_abt(pjob, log_buffer); } } } release_req(pwt); }
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; struct work_task *pwtnew; pbs_queue *pque; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt; /* original client request */ int rc; preq_sig = pwt->wt_parm1; rc = preq_sig->rq_reply.brp_code; preq_clt = preq_sig->rq_extra; release_req(pwt); pjob = find_job(preq_clt->rq_ind.rq_delete.rq_objname); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. */ if (delay == 0) { pque = pjob->ji_qhdr; delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); } pwtnew = set_task(WORK_Timed, delay + time_now, post_delete_mom2, pjob); if (pwtnew) { /* insure that work task will be removed if job goes away */ append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew); } /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); return; } /* END post_delete_mom1() */
static void post_job_delete_nanny( struct work_task *pwt) { struct batch_request *preq_sig; /* signal request to MOM */ int rc; job *pjob; preq_sig = pwt->wt_parm1; rc = preq_sig->rq_reply.brp_code; if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long) { /* the admin disabled nanny within the last minute or so */ release_req(pwt); return; } /* extract job id from task */ pjob = find_job(preq_sig->rq_ind.rq_signal.rq_jid); if (pjob == NULL) { sprintf(log_buffer, "job delete nanny: the job disappeared (this is a BUG!)"); LOG_EVENT( PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buffer); } else if (rc == PBSE_UNKJOBID) { sprintf(log_buffer, "job delete nanny returned, but does not exist on mom"); LOG_EVENT( PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buffer); free_nodes(pjob); set_resc_assigned(pjob, DECR); job_purge(pjob); } /* free task */ release_req(pwt); return; } /* END post_job_delete_nanny() */
static void post_stagein( struct work_task *pwt) { int code; int newstate; int newsub; job *pjob; struct batch_request *preq; attribute *pwait; preq = pwt->wt_parm1; code = preq->rq_reply.brp_code; pjob = find_job(preq->rq_extra); free(preq->rq_extra); if (pjob != NULL) { if (code != 0) { /* stage in failed - hold job */ free_nodes(pjob); pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime]; if ((pwait->at_flags & ATR_VFLAG_SET) == 0) { pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT; pwait->at_flags |= ATR_VFLAG_SET; job_set_wait(pwait, pjob, 0); } svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL); if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text) { /* set job comment */ /* NYI */ svr_mailowner( pjob, MAIL_STAGEIN, MAIL_FORCE, preq->rq_reply.brp_un.brp_txt.brp_str); } } else { /* stage in was successful */ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_StagedIn; if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO) { if (is_checkpoint_restart(pjob)) { /* need to copy checkpoint file to mom before running */ svr_send_checkpoint( pjob, preq, JOB_STATE_RUNNING, JOB_SUBSTATE_CHKPTGO); } else { /* continue to start job running */ svr_strtjob2(pjob, NULL); } } else { svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); } } } /* END if (pjob != NULL) */ release_req(pwt); /* close connection and release request */ return; } /* END post_stagein() */
static void post_checkpointsend( struct work_task *pwt) { int code; job *pjob; struct batch_request *preq; attribute *pwait; preq = pwt->wt_parm1; code = preq->rq_reply.brp_code; pjob = find_job(preq->rq_extra); free(preq->rq_extra); if (pjob != NULL) { if (code != 0) { /* copy failed - hold job */ free_nodes(pjob); pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime]; if ((pwait->at_flags & ATR_VFLAG_SET) == 0) { pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT; pwait->at_flags |= ATR_VFLAG_SET; job_set_wait(pwait, pjob, 0); } svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL); if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text) { sprintf(log_buffer, "Failed to copy checkpoint file to mom - %s", preq->rq_reply.brp_un.brp_txt.brp_str); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); /* NYI */ svr_mailowner( pjob, MAIL_CHKPTCOPY, MAIL_FORCE, preq->rq_reply.brp_un.brp_txt.brp_str); } } else { /* checkpoint copy was successful */ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_COPIED; /* set restart_name attribute to the checkpoint_name we just copied */ job_attr_def[(int)JOB_ATR_restart_name].at_set( &pjob->ji_wattr[(int)JOB_ATR_restart_name], &pjob->ji_wattr[(int)JOB_ATR_checkpoint_name], SET); pjob->ji_modified = 1; job_save(pjob, SAVEJOB_FULL); /* continue to start job running */ svr_strtjob2(pjob, NULL); } } /* END if (pjob != NULL) */ release_req(pwt); /* close connection and release request */ return; } /* END post_checkpointsend() */
static void stat_update( struct work_task *pwt) { struct stat_cntl *cntl; job *pjob; struct batch_request *preq; struct batch_reply *preply; struct brp_status *pstatus; svrattrl *sattrl; int oldsid; preq = pwt->wt_parm1; preply = &preq->rq_reply; cntl = preq->rq_extra; if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) { pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status); while (pstatus != NULL) { if ((pjob = find_job(pstatus->brp_objname))) { sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr); oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long; modify_job_attr( pjob, sattrl, ATR_DFLAG_MGWR | ATR_DFLAG_SvWR, &bad); if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long) { /* first save since running job (or the sid has changed), */ /* must save session id */ job_save(pjob, SAVEJOB_FULL); svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL); } #ifdef USESAVEDRESOURCES else { /* save so we can recover resources used */ job_save(pjob, SAVEJOB_FULL); } #endif /* USESAVEDRESOURCES */ pjob->ji_momstat = time_now; } pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink); } /* END while (pstatus != NULL) */ } /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */ else { if (preply->brp_code == PBSE_UNKJOBID) { /* we sent a stat request, but mom says it doesn't know anything about the job */ if ((pjob = find_job(preq->rq_ind.rq_status.rq_id))) { /* job really isn't running any more - mom doesn't know anything about it this can happen if a diskless node reboots and the mom_priv/jobs directory is cleared, set its state to queued so job_abt doesn't think it is still running */ svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT); rel_resc(pjob); job_abt(&pjob, "Job does not exist on node"); /* TODO, if the job is rerunnable we should set its state back to queued */ } } } release_req(pwt); cntl->sc_conn = -1; if (cntl->sc_post) cntl->sc_post(cntl); /* continue where we left off */ else free(cntl); /* a bit of a 
kludge but its saves an extra func */ return; } /* END stat_update() */