void queue_route(

  pbs_queue *pque)

  {
  job *nxjb;
  job *pjob;
  int  rc;

  pjob = (job *)GET_NEXT(pque->qu_jobs);

  while (pjob != NULL)
    {
    nxjb = (job *)GET_NEXT(pjob->ji_jobque);

    if (pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now)
      {
      if ((rc = job_route(pjob)) == PBSE_ROUTEREJ)
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
      else if (rc == PBSE_ROUTEEXPD)
        job_abt(&pjob, msg_routexceed);
      }

    pjob = nxjb;
    }

  return;
  }  /* END queue_route() */
int reroute_job(

  job *pjob)

  {
  int  rc = PBSE_NONE;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (LOGLEVEL >= 8)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  rc = job_route(pjob);

  if (rc == PBSE_ROUTEREJ)
    job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
  else if (rc == PBSE_ROUTEEXPD)
    job_abt(&pjob, msg_routexceed);
  else if (rc == PBSE_QUENOEN)
    job_abt(&pjob, msg_err_noqueue);

  return(rc);
  }  /* END reroute_job() */
int reroute_job(

  job       *pjob,
  pbs_queue *pque)

  {
  int  rc = PBSE_NONE;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  if ((pque != NULL) &&
      (pque->qu_qs.qu_type == QTYPE_RoutePush))
    {
    rc = job_route(pjob);

    if (rc == PBSE_ROUTEREJ)
      job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
    else if (rc == PBSE_ROUTEEXPD)
      job_abt(&pjob, msg_routexceed);
    else if (rc == PBSE_QUENOEN)
      job_abt(&pjob, msg_err_noqueue);
    }

  return(rc);
  }  /* END reroute_job() */
END_TEST

START_TEST(job_abt_test)
  {
  int         result = 0;
  struct job *null_job = NULL;

  result = job_abt(NULL, NULL);
  fail_unless(result != 0, "NULL input check fail");

  result = job_abt(&null_job, NULL);
  fail_unless(result != 0, "NULL input check fail");
  }
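/*
 * Illustrative stand-alone sketch (not from the TORQUE sources): the
 * job_abt(&pjob, ...) call pattern used throughout this file relies on the
 * callee clearing the caller's pointer once the job is purged, which is why
 * callers guard with "if (pjob != NULL)" afterwards. mock_job and
 * mock_job_abt are hypothetical names for demonstration only.
 */

#include <stdio.h>
#include <stdlib.h>

typedef struct mock_job { int id; } mock_job;

/* Mock abort: frees the job and clears the caller's pointer, mirroring how
 * the real job_abt() leaves *pjob == NULL after a purge. */
static int mock_job_abt(mock_job **pjob, const char *text)
  {
  if ((pjob == NULL) || (*pjob == NULL))
    return(1);              /* reject NULL input, as the unit test above expects */

  if (text != NULL)
    fprintf(stderr, "abort: %s\n", text);

  free(*pjob);
  *pjob = NULL;             /* caller must not touch the job afterwards */

  return(0);
  }

int main(void)
  {
  mock_job *pjob = malloc(sizeof(mock_job));

  if (pjob == NULL)
    return(1);

  mock_job_abt(&pjob, "example abort");

  /* pjob is now NULL, so a caller guards every later access exactly the way
   * the server functions in this file do after calling job_abt() */
  return(pjob == NULL ? 0 : 1);
  }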
int set_array_job_ids(

  job   **pjob,    /* M */
  char   *log_buf, /* error buffer */
  size_t  buflen)  /* error buffer length */

  {
  int rc = PBSE_NONE;

#ifndef PBS_MOM
  job       *pj = *pjob;
  job_array *pa;
  char       parent_id[PBS_MAXSVRJOBID + 1];

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array. We need to put a link back to the server
       job array struct for this array. We also have to link this job
       into the linked list of jobs belonging to the array. */
    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      snprintf(log_buf, buflen, "Array job missing array struct %s", __func__);
      return -1;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was
         on hold when the server went down the
         ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even
         though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is
         JOB_SUBSTATE_HELD */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
#endif /* !PBS_MOM */

  return rc;
  }  /* END set_array_job_ids() */
static void close_quejob(

  int sfds)

  {
  job *pjob;
  job *npjob;

  pjob = (job *)GET_NEXT(svr_newjobs);

  while (pjob != NULL)
    {
    npjob = GET_NEXT(pjob->ji_alljobs);

    if (pjob->ji_qs.ji_un.ji_newt.ji_fromsock == sfds)
      {
      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_TRANSICM)
        {
#ifndef PBS_MOM

        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
          {
          /*
           * the job was being created here for the first time
           * go ahead and enqueue it as QUEUED; otherwise, hold
           * it here as TRANSICM until we hear from the sending
           * server again to commit.
           */

          delete_link(&pjob->ji_alljobs);

          pjob->ji_qs.ji_state    = JOB_STATE_QUEUED;
          pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

          if (svr_enquejob(pjob))
            job_abt(&pjob, msg_err_noqueue);
          }

#endif /* PBS_MOM */
        }
      else
        {
        /* else delete the job */

        delete_link(&pjob->ji_alljobs);

        job_purge(pjob);
        }

      break;
      }  /* END if (..) */

    pjob = npjob;
    }

  return;
  }  /* END close_quejob() */
static void rerun_or_kill(

  job  *pjob,  /* I (modified/freed) */
  char *text)  /* I */

  {
  long server_state = server.sv_attr[(int)SRV_ATR_State].at_val.at_long;

  if (pjob->ji_wattr[(int)JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(pjob, "SIGKILL", release_req, 0);

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    strcpy(log_buffer, msg_init_queued);
    strcat(log_buffer, pjob->ji_qhdr->qu_qs.qu_name);
    strcat(log_buffer, text);
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */

    strcpy(log_buffer, msg_job_abort);
    strcat(log_buffer, text);

    /* need to record log message before purging job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    job_abt(&pjob, log_buffer);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */

    strcpy(log_buffer, msg_leftrunning);
    strcat(log_buffer, text);
    }

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  return;
  }  /* END rerun_or_kill() */
int close_quejob_by_jobid(

  char *job_id)

  {
  int  rc = PBSE_NONE;
  job *pjob = NULL;

  if (LOGLEVEL >= 10)
    {
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, job_id);
    }

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    {
    rc = PBSE_JOBNOTFOUND;
    return(rc);
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    remove_job(&newjobs, pjob);
    svr_job_purge(pjob);
    pjob = NULL;
    }
  else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
    {
    remove_job(&newjobs, pjob);

    pjob->ji_qs.ji_state    = JOB_STATE_QUEUED;
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

    rc = svr_enquejob(pjob, FALSE, -1, false);

    if ((rc == PBSE_JOBNOTFOUND) ||
        (rc == PBSE_JOB_RECYCLED))
      {
      pjob = NULL;
      }
    else if (rc != PBSE_NONE)
      {
      job_abt(&pjob, msg_err_noqueue);
      pjob = NULL;
      }
    }

  if (pjob == NULL)
    pjob_mutex.set_lock_on_exit(false);

  return(rc);
  }  /* close_quejob_by_jobid() */
void stat_update(

  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                *pjob;
  struct batch_reply *preply;
  struct brp_status  *pstatus;
  svrattrl           *sattrl;
  int                 oldsid;
  int                 bad = 0;
  time_t              time_now = time(NULL);
  char               *msg_ptr = NULL;
  char                log_buf[LOCAL_LOG_BUF_SIZE];

  preply = &preq->rq_reply;

  if (preply->brp_un.brp_txt.brp_str != NULL)
    {
    msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL);

    if (msg_ptr != NULL)
      msg_ptr += strlen(PBS_MSG_EQUAL);
    }

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id */
          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }  /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
           (preply->brp_code == PBSE_UNKJOBID) &&
           (msg_ptr != NULL) &&
           (!strcmp(msg_ptr, preq->rq_ind.rq_status.rq_id)))
    {
    /* we sent a stat request, but mom says it doesn't know anything about
       the job */
    if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
      {
      /* job really isn't running any more - mom doesn't know anything
         about it. this can happen if a diskless node reboots and the
         mom_priv/jobs directory is cleared; set its state to queued so
         job_abt doesn't think it is still running */
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      snprintf(log_buf, sizeof(log_buf),
        "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld",
        preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
      rel_resc(pjob);

      job_mutex.set_unlock_on_exit(false);

      job_abt(&pjob, "Job does not exist on node");

      /* TODO, if the job is rerunnable we should set its state back to queued */
      }
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id);
    log_err(preply->brp_code, __func__, log_buf);
    }

  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl);  /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call */
  free(cntl);  /* a bit of a kludge but it saves an extra func */

  return;
  }  /* END stat_update() */
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param pjob - a pointer to the job being handled
 */

int attempt_delete(

  void *j)  /* I */

  {
  int     skipped = FALSE;
  int     release_mutex = TRUE;
  job    *pjob;
  time_t  time_now = time(NULL);
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* job considered deleted if null */

  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* I'm not sure if this is still possible since the thread
     * waits on the job to finish transmitting, but I'll leave
     * this part here --dbeer */
    skipped = TRUE;

    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) */
  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */

    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */

    if (pjob->ji_has_delete_nanny == FALSE)
      {
      apply_job_delete_nanny(pjob, time_now + 60);

      /* need to issue a signal to the mom, but we don't want to send an
       * ack to the client when the mom replies */
      issue_signal(&pjob, "SIGTERM", post_delete, NULL);
      }

    if (pjob != NULL)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
        {
        /* job has restart file at mom, change restart comment if failed */
        change_restart_comment_if_needed(pjob);
        }

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);

    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    pjob->ji_momhandle = -1;  /* force new connection */

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, NULL);

    release_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      /* look up KeepSeconds while the queue is still held; the original
       * dereferenced pque after this block, which crashes when
       * get_jobs_queue() returns NULL (compare delete_inactive_job()) */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }

    if (pjob != NULL)
      {
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      release_mutex = FALSE;
    }

  if (release_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

  return(!skipped);
  }  /* END attempt_delete() */
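/*
 * Hypothetical, self-contained sketch of the caller contract documented
 * above: attempt_delete() returns TRUE when the job was handled (or was
 * NULL) and FALSE when it was skipped and must be revisited. The stub, the
 * id array, and the retry counter below are illustrative only, not TORQUE
 * code.
 */

#include <stdio.h>

#define TRUE  1
#define FALSE 0

/* stub with the same contract: pretend even-numbered jobs are in PRERUN
 * and therefore get skipped, just as the real function skips PRERUN jobs */
static int attempt_delete_stub(void *j)
  {
  if (j == NULL)
    return(TRUE);                 /* NULL counts as already deleted */

  return((*(int *)j % 2) ? TRUE : FALSE);
  }

int main(void)
  {
  int ids[] = { 1, 2, 3, 4 };
  int remaining = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
    if (attempt_delete_stub(&ids[i]) == FALSE)
      remaining++;                /* skipped: caller must retry this job later */
    }

  printf("%d job(s) left for a later retry pass\n", remaining);

  return(0);
  }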
void array_delete_wt(

  struct work_task *ptask)

  {
  struct batch_request *preq;
  job_array            *pa;
  int                   i;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  int                   num_jobs = 0;
  int                   num_prerun = 0;
  job                  *pjob;

  preq = get_remove_batch_request((char *)ptask->wt_parm1);

  free(ptask->wt_mutex);
  free(ptask);

  if (preq == NULL)
    return;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */
    reply_ack(preq);

    return;
    }

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      num_jobs++;

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        num_prerun++;

        /* mom still hasn't gotten job?? delete anyway */

        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
          {
          /* job has restart file at mom, do end job processing */
          change_restart_comment_if_needed(pjob);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

          pjob->ji_momhandle = -1;  /* force new connection */

          if (LOGLEVEL >= 7)
            {
            sprintf(log_buf, "calling on_job_exit from %s", __func__);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
            }

          set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);

          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
        {
        /* job has staged-in files, should remove them */
        remove_stagein(&pjob);

        if (pjob != NULL)
          {
          /* job_abt() calls svr_job_purge which will try to lock the array again */
          pthread_mutex_unlock(pa->ai_mutex);
          job_abt(&pjob, NULL);
          pthread_mutex_lock(pa->ai_mutex);
          }
        }
      else
        {
        /* job_abt() calls svr_job_purge which will try to lock the array again */
        pthread_mutex_unlock(pa->ai_mutex);
        job_abt(&pjob, NULL);
        pthread_mutex_lock(pa->ai_mutex);
        }
      }  /* END else (job found) */
    }  /* END for each job in array */

  pthread_mutex_unlock(pa->ai_mutex);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  if (num_jobs == num_prerun)
    {
    reply_ack(preq);
    }
  else
    {
    req_deletearray(preq);
    }
  }  /* END array_delete_wt() */
/**
 * @brief
 * post_routejob - clean up action for child started in net_move/send_job
 * to "route" a job to another server
 * @par
 * If route was successful, delete job.
 * @par
 * If route didn't work, mark destination not to be tried again for this
 * job and call route again.
 *
 * @param[in] pwt - work task structure
 *
 * @return none.
 */

static void post_routejob(

  struct work_task *pwt)

  {
  int  newstate;
  int  newsub;
  int  r;
  int  stat = pwt->wt_aux;
  job *jobp = (job *)pwt->wt_parm2;

  if (jobp == NULL)
    {
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, "",
      "post_routejob failed, jobp NULL");

    return;
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    r = SEND_JOB_FATAL;
    (void)sprintf(log_buffer, msg_badexit, stat);
    (void)strcat(log_buffer, __func__);
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE,
      jobp->ji_qs.ji_jobid, log_buffer);
    }

  switch (r)
    {
    case SEND_JOB_OK:  /* normal return, job was routed */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      /*
       * If the server is configured to keep job history and the job
       * is created here, do not purge the job structure but save
       * it for history purpose. No need to check for sub-jobs as
       * sub-jobs can not be routed.
       */
      if (svr_chk_history_conf())
        svr_setjob_histinfo(jobp, T_MOV_JOB);
      else
        job_purge(jobp);  /* need to remove server job struct */

      return;

    case SEND_JOB_FATAL:  /* permanent rejection (or signal) */

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* Job Delete in progress, just set to queued status */
        (void)svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

        return;
        }

      add_dest(jobp);  /* else mark destination as bad */

      /* fall through */

    default:  /* try routing again */

      /* force re-eval of job state out of Transit */
      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      (void)svr_setjobstate(jobp, newstate, newsub);

      jobp->ji_retryok = 1;

      if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
        (void)job_abt(jobp, msg_routebad);
      else if (r != 0)
        (void)job_abt(jobp, msg_routexceed);

      break;
    }  /* END switch (r) */

  return;
  }  /* END post_routejob() */
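/*
 * Stand-alone illustration of the wait-status decoding post_routejob()
 * performs on pwt->wt_aux: a normal child exit yields the routing verdict,
 * while any abnormal exit (e.g. a signal) is folded into the fatal case.
 * The SEND_JOB_OK/SEND_JOB_FATAL values here are assumptions for the demo,
 * not taken from the PBS headers.
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define SEND_JOB_OK    0  /* assumed value: job routed */
#define SEND_JOB_FATAL 2  /* assumed value: permanent rejection */

static int route_verdict(int stat)
  {
  if (WIFEXITED(stat))
    return(WEXITSTATUS(stat));  /* normal return: use the child's exit code */

  return(SEND_JOB_FATAL);       /* signal/abnormal exit: treat as fatal */
  }

int main(void)
  {
  pid_t pid = fork();
  int   stat;

  if (pid < 0)
    return(1);

  if (pid == 0)
    _exit(SEND_JOB_OK);         /* child: pretend the route succeeded */

  waitpid(pid, &stat, 0);

  printf("verdict = %d\n", route_verdict(stat));

  return(0);
  }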
void stat_update(

  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                *pjob;
  struct batch_reply *preply;
  struct brp_status  *pstatus;
  svrattrl           *sattrl;
  int                 oldsid;
  int                 bad = 0;
  time_t              time_now = time(NULL);

  preply = &preq->rq_reply;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id */
          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }  /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything
         about the job */
      if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
        {
        /* job really isn't running any more - mom doesn't know anything
           about it. this can happen if a diskless node reboots and the
           mom_priv/jobs directory is cleared; set its state to queued so
           job_abt doesn't think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */
        }
      }
    }

  cntl->sc_conn = -1;

  /* MUTSU - Unlock job here? */
  if (cntl->sc_post)
    cntl->sc_post(cntl);  /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call */
  free(cntl);  /* a bit of a kludge but it saves an extra func */

  return;
  }  /* END stat_update() */
static void post_doq(

  struct work_task *pwt)

  {
  struct batch_request *preq = (struct batch_request *)pwt->wt_parm1;
  char                 *jobid = preq->rq_ind.rq_register.rq_child;
  char                 *msg;
  job                  *pjob;
  job                  *ppjob;
  struct depend_job     pparent;
  int                   rc;

  if (preq->rq_reply.brp_code)
    {
    /* request was rejected */
    (void)strcpy(log_buffer, msg_regrej);
    (void)strcat(log_buffer, preq->rq_ind.rq_register.rq_parent);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_buffer);

    pjob = find_job(jobid);

    if ((msg = pbse_to_txt(preq->rq_reply.brp_code)) != NULL)
      {
      (void)strcat(log_buffer, " ");
      (void)strcat(log_buffer, msg);
      }

    if (pjob)
      {
      if (preq->rq_reply.brp_code == PBSE_JOB_MOVED)
        {
        /* Creating a separate log buffer because if we end up aborting the
         * submitted job we don't want to change what goes into the
         * accounting log via job_abt */
        char log_msg[LOG_BUF_SIZE];

        snprintf(log_msg, sizeof(log_msg), "%s, %s", msg_job_moved,
          "sending dependency request to remote server");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_msg);

        ppjob = find_job(preq->rq_ind.rq_register.rq_parent);

        if (ppjob &&
            (ppjob->ji_qs.ji_state == JOB_STATE_MOVED) &&
            (ppjob->ji_qs.ji_substate == JOB_SUBSTATE_MOVED))
          {
          char *destin;

          /* job destination should be <remote queue>@<remote server> */
          destin = strchr(ppjob->ji_qs.ji_destin, (int)'@');

          if (destin != NULL)
            {
            strncpy(pparent.dc_child, ppjob->ji_qs.ji_jobid, sizeof(pparent.dc_child));
            strncpy(pparent.dc_svr, destin + 1, sizeof(pparent.dc_svr));

            rc = send_depend_req(pjob, &pparent,
                   preq->rq_ind.rq_register.rq_dependtype,
                   JOB_DEPEND_OP_REGISTER, SYNC_SCHED_HINT_NULL, post_doq);

            if (rc)
              {
              snprintf(log_msg, sizeof(log_msg), "%s",
                "Failed to send dependency request to remote server, aborting job");
              log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, jobid, log_msg);

              check_block(pjob, log_buffer);
              job_abt(pjob, log_buffer);
              }
            }
          else
            {
            /* Ideally if a job is moved, destination can not be empty. */
            /* If we come across an empty destination, abort the job. */
            check_block(pjob, log_buffer);
            job_abt(pjob, log_buffer);
            }
          }
        else
          {
          check_block(pjob, log_buffer);
          job_abt(pjob, log_buffer);
          }
        }
      else
        {
        check_block(pjob, log_buffer);
        job_abt(pjob, log_buffer);
        }
      }
    }

  release_req(pwt);
  }  /* END post_doq() */
void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int newstate;
  int newsub;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  switch (status)
    {
    case LOCUTION_SUCCESS:  /* normal return, job was routed */

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob);

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);

        if (pjob != NULL)
          svr_job_purge(pjob);  /* need to remove server job struct */
        }

      break;

    case LOCUTION_FAIL:  /* permanent rejection (or signal) */

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

        svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return;
        }

      add_dest(pjob);  /* else mark destination as bad */

      /* fall through */

    default:  /* try routing again */

      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      /* force re-eval of job state out of Transit */
      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);

      if ((status = job_route(pjob)) == PBSE_ROUTEREJ)
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
      else if (status != 0)
        job_abt(&pjob, msg_routexceed);
      else
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

      break;
    }  /* END switch (status) */

  return;
  }  /* END finish_routing_processing() */
int delete_inactive_job(

  job        **pjob_ptr,
  const char  *Msg)

  {
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob_ptr == NULL)
    return(PBSE_BAD_PARAMETER);

  pjob = *pjob_ptr;

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      set_task(WORK_Timed, time(NULL) + KeepSeconds, on_job_exit_task,
        strdup(pjob->ji_qs.ji_jobid), FALSE);
    }  /* END else (not checkpointed, not staged-in) */

  if (pjob == NULL)
    *pjob_ptr = NULL;

  return(PBSE_NONE);
  }  /* END delete_inactive_job() */
void rerun_or_kill(

  job  **pjob_ptr,  /* I (modified/freed) */
  char  *text)      /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  get_svr_attr_l(SRV_ATR_State, &server_state);

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */
    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s",
          msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  if (pjob != NULL)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }

  return;
  }  /* END rerun_or_kill() */
static void post_routejob(

  struct work_task *pwt)

  {
  int   newstate;
  int   newsub;
  int   r;
  int   stat = pwt->wt_aux;
  char *id = "post_routejob";
  job  *jobp = (job *)pwt->wt_parm1;

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    r = 2;

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {
    case 0:  /* normal return, job was routed */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
        remove_checkpoint(jobp);

      job_purge(jobp);  /* need to remove server job struct */

      return;

      /*NOTREACHED*/

      break;

    case 1:  /* permanent rejection (or signal) */

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */
        svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

        return;
        }

      add_dest(jobp);  /* else mark destination as bad */

      /* fall through */

    default:  /* try routing again */

      /* force re-eval of job state out of Transit */
      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);

      if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
        job_abt(&jobp, pbse_to_txt(PBSE_ROUTEREJ));
      else if (r != 0)
        job_abt(&jobp, msg_routexceed);

      break;
    }  /* END switch (r) */

  return;
  }  /* END post_routejob() */
job *job_recov(

  char *filename)  /* I */   /* pathname to job save file */

  {
  int   fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc();  /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */
    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename);  /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);
    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */
    return(NULL);
    }

  /* read in job quick save sub-structure */
  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);
    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);
    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);
    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);
      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);
      close(fds);

      return(NULL);
      }
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */
  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);
    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);
    close(fds);

    return(NULL);
    }

  /* read in working attributes */
  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "unable to recover %s (file is likely corrupted)", namebuf);
    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif

    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for its node */
  /*  char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
      namebuf);
    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array. We need to put a link back to the server
       job array struct for this array. We also have to link this job into
       the linked list of jobs belonging to the array. */
    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was
         on hold when the server went down the
         ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even
         though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is
         JOB_SUBSTATE_HELD */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */
  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
job *job_recov(

  char *filename)  /* I */   /* pathname to job save file */

  {
  int   fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int   qs_upgrade;

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc();  /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */
    return(NULL);
    }

  strcpy(namebuf, path_jobs);  /* job directory path */
  strcat(namebuf, filename);

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s", namebuf);
    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */
    return(NULL);
    }

  /* read in job quick save sub-structure */
  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s", namebuf);
    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);
    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer, "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);
      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);
      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */
  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */
    sprintf(log_buffer, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);
    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);
    close(fds);

    return(NULL);
    }

  /* read in working attributes */
  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0)
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
      namebuf);
    log_err(-1, "job_recov", log_buffer);

    job_free(pj);
    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
      namebuf);
    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array. We need to put a link back to the server
       job array struct for this array. We also have to link this job into
       the linked list of jobs belonging to the array. */
    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);

    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      pj->ji_arraystruct = pa;
      }
    else
      {
      /* XXX should we move this up after pa = get_array... */
      if (pa == NULL)
        {
        job_abt(&pj, "Array job missing array struct, aborting job");
        close(fds);
        return NULL;
        }
      else
        {
        pa->jobs[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = (void *)pj;
        pj->ji_arraystruct = pa;
        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */
  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
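/*
 * Minimal stand-alone sketch of the quick-save recovery pattern both
 * job_recov() variants above use: read a fixed-size record back from disk,
 * then gate on the embedded version field before trusting the remaining
 * bytes. The demo_qs layout and QS_VERSION_DEMO value are illustrative
 * assumptions, not the real ji_qs structure.
 */

#include <stdio.h>

#define QS_VERSION_DEMO 1

typedef struct demo_qs
  {
  int  qs_version;  /* checked before the rest of the record is believed */
  char jobid[32];
  } demo_qs;

int main(void)
  {
  demo_qs out = { QS_VERSION_DEMO, "42.server" };
  demo_qs in;
  FILE   *fp = tmpfile();

  if (fp == NULL)
    return(1);

  fwrite(&out, sizeof(out), 1, fp);
  rewind(fp);

  /* mirror job_recov(): a short read or a version mismatch aborts recovery */
  if ((fread(&in, sizeof(in), 1, fp) != 1) ||
      (in.qs_version != QS_VERSION_DEMO))
    {
    fprintf(stderr, "unable to recover quick-save record\n");
    fclose(fp);
    return(1);
    }

  printf("recovered job %s\n", in.jobid);
  fclose(fp);

  return(0);
  }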
void req_deletejob(

  struct batch_request *preq)  /* I */

  {
  job              *pjob;
  struct work_task *pwtold;
  struct work_task *pwtnew;
  struct work_task *pwtcheck;
  int               rc;
  char             *sigt = "SIGTERM";
  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */
  if ((preq->rq_extend != NULL) &&
      !strncmp(preq->rq_extend, PURGECOMP, strlen(PURGECOMP)))
    {
    /*
     * purge_completed_jobs will respond with either an ack or reject
     */
    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command
   * "qdel -p <jobid>", they can then delete jobs other than their own
   * since the authorization checks are made below in chk_job_request.
   * This should probably be fixed. */
  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE: should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE: chk_job_request() will issue req_reject() */
    return;
    }

  if (preq->rq_extend != NULL)
    {
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */
      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
       */
      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
          "must have operator or manager privilege to use -m parameter");

        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(
                   pwtold->wt_type,
                   pwtold->wt_event,
                   post_delete_route,
                   preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task; it makes
           * sure that next_task() will find the
           * new entry.
           */
          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return;  /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ... */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3)
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it
       going - retry in one second. If JOB_SUBSTATE_RERUN, RERUN1, RERUN2
       or RERUN3, the job is being requeued. Wait until finished. */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyway... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }  /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pwtnew = set_task(
               WORK_Timed,
               time_now + 1,
               post_delete_route,
               preq);

    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* NOTE: should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) && !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete the job and the job delete
       has not been previously attempted */
    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */
    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */
    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting an asynchronous delete */
    if ((preq->rq_extend != NULL) &&
        !strncmp(preq->rq_extend, DELASYNC, strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;

      /*
       * Respond with an ack now instead of after MOM processing.
       * Create a new batch request and fill it in. It will be freed
       * by reply_ack.
       */
      snprintf(log_buffer, sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);
      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;

      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
        preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);

      preq->rq_noreply = TRUE;  /* set for no more replies */
      }

    /* make a cleanup task if set */
    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(
                   WORK_Timed,
                   time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
                   ensure_deleted,
                   preq);

      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* can't send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    sprintf(log_buffer, msg_delrunjobsig, sigt);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(
                 WORK_Timed,
                 time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
                 ensure_deleted,
                 preq);

    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }

          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;  /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct work_task *ptask;
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) != NULL)
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);

    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else (not checkpointed, not staged-in) */

  reply_ack(preq);

  return;
  }  /* END req_deletejob() */
void array_delete_wt(

  struct work_task *ptask)

  {
  struct batch_request *preq;
  job_array            *pa;

  /* struct work_task *pnew_task; */
  struct work_task     *pwtnew;

  int                   i;

  static int   last_check = 0;
  static char *last_id = NULL;

  preq = ptask->wt_parm1;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */
    reply_ack(preq);

    last_check = 0;
    free(last_id);
    last_id = NULL;

    return;
    }

  if (last_id == NULL)
    {
    last_id = strdup(preq->rq_ind.rq_delete.rq_objname);
    last_check = time_now;
    }
  else if (strcmp(last_id, preq->rq_ind.rq_delete.rq_objname) != 0)
    {
    last_check = time_now;
    free(last_id);
    last_id = strdup(preq->rq_ind.rq_delete.rq_objname);
    }
  else if (time_now - last_check > 10)
    {
    int  num_jobs;
    int  num_prerun;
    job *pjob;

    num_jobs = 0;
    num_prerun = 0;

    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->jobs[i] == NULL)
        continue;

      pjob = (job *)pa->jobs[i];

      num_jobs++;

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        num_prerun++;

        /* mom still hasn't gotten job?? delete anyway */

        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
          {
          /* job has restart file at mom, do end job processing */
          change_restart_comment_if_needed(pjob);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

          pjob->ji_momhandle = -1;  /* force new connection */

          pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

          if (pwtnew)
            {
            append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
            }
          }
        else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
          {
          /* job has staged-in files, should remove them */
          remove_stagein(pjob);

          job_abt(&pjob, NULL);
          }
        else
          {
          job_abt(&pjob, NULL);
          }
        }
      }

    if (num_jobs == num_prerun)
      {
      reply_ack(preq);

      free(last_id);
      last_id = NULL;

      return;
      }
    }

  req_deletearray(preq);
  }  /* END array_delete_wt() */
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param pjob - a pointer to the job being handled
 */

int attempt_delete(

  void *j)  /* I */

  {
  int               skipped = FALSE;
  struct work_task *pwtold;
  struct work_task *pwtnew;
  job              *pjob;

  /* job considered deleted if null */

  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        kill((pid_t)pwtold->wt_event, SIGTERM);

        pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    skipped = TRUE;

    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) */
  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */

    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */

    if (!has_job_delete_nanny(pjob))
      {
      apply_job_delete_nanny(pjob, time_now + 60);

      /* need to issue a signal to the mom, but we don't want to send an
       * ack to the client when the mom replies */
      issue_signal(pjob, "SIGTERM", post_delete, NULL);
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
      {
      /* job has restart file at mom, change restart comment if failed */
      change_restart_comment_if_needed(pjob);
      }

    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);

    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;  /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(pjob);

    job_abt(&pjob, NULL);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct work_task *ptask;
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) != NULL)
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[(int)QE_ATR_KeepCompleted],
                    &server.sv_attr[(int)SRV_ATR_KeepCompleted],
                    0);

    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }

  return(!skipped);
  }  /* END attempt_delete() */
void req_register(

  struct batch_request *preq)

  {
  int                made;
  attribute         *pattr;
  struct depend     *pdep;
  struct depend_job *pdj;
  job               *pjob;
  char              *ps;
  struct work_task  *ptask;
  int                rc = 0;
  int                revtype;
  int                type;
  int                savetype = SAVEJOB_FULL;

  /* make sure request is from a server */

  if (!preq->rq_fromsvr)
    {
#ifdef NAS /* localmod 109 */
    sprintf(log_buffer, "Dependency request not from server");
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
      preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
    req_reject(PBSE_IVALREQ, 0, preq);

    return;
    }

  /* find the "parent" job specified in the request */

  if ((pjob = find_job(preq->rq_ind.rq_register.rq_parent)) == NULL)
    {
    /*
     * job not found... if server is initializing, it may not yet be
     * recovered, that is not an error.
     */

    if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long != SV_STATE_INIT)
      {
      log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
        preq->rq_ind.rq_register.rq_parent, msg_unkjobid);
      req_reject(PBSE_UNKJOBID, 0, preq);
      }
    else
      {
      reply_ack(preq);
      }

    return;
    }

  pattr = &pjob->ji_wattr[(int)JOB_ATR_depend];

  type = preq->rq_ind.rq_register.rq_dependtype;

  pjob->ji_modified = 1;

  /* more of the server:port fix kludge */

  ps = strchr(preq->rq_ind.rq_register.rq_child, (int)'@');

  if (ps != NULL)
    {
    (void)strcpy(preq->rq_ind.rq_register.rq_svr, ps + 1);
    *ps = '\0';
    }
  else
    {
    (void)strcpy(preq->rq_ind.rq_register.rq_svr, preq->rq_host);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_MOVED)
    {
    snprintf(log_buffer, sizeof(log_buffer), "Parent %s%s",
      msg_movejob, pjob->ji_qs.ji_destin);
    log_event(PBSEVENT_DEBUG | PBSEVENT_SYSTEM | PBSEVENT_ERROR,
      PBS_EVENTCLASS_REQUEST, LOG_INFO,
      preq->rq_ind.rq_register.rq_child, log_buffer);
    req_reject(PBSE_JOB_MOVED, 0, preq);

    return;
    }

  switch (preq->rq_ind.rq_register.rq_op)
    {
    /*
     * Register a dependency
     */

    case JOB_DEPEND_OP_REGISTER:

      switch (type)
        {
        case JOB_DEPEND_TYPE_AFTERSTART:

          if (pjob->ji_qs.ji_substate >= JOB_SUBSTATE_RUNNING)
            {
            /* job already running, setup task to send */
            /* release back to child and continue with */
            /* registration process                    */

            ptask = set_task(WORK_Immed, 0, post_run_depend, (void *)pjob);

            if (ptask)
              append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
            }

          /* fall through to complete registration */

        case JOB_DEPEND_TYPE_AFTERANY:
        case JOB_DEPEND_TYPE_AFTEROK:
        case JOB_DEPEND_TYPE_AFTERNOTOK:

          rc = register_dep(pattr, preq, type, &made);

          break;

        case JOB_DEPEND_TYPE_BEFORESTART:
        case JOB_DEPEND_TYPE_BEFOREANY:
        case JOB_DEPEND_TYPE_BEFOREOK:
        case JOB_DEPEND_TYPE_BEFORENOTOK:

          /*
           * Check job owner for permission, use the real
           * job owner, not the sending server's name.
           */

          (void)strcpy(preq->rq_user, preq->rq_ind.rq_register.rq_owner);

          if (svr_chk_owner(preq, pjob))
            {
            rc = PBSE_PERM;  /* not same user */
            }
          else
            {
            /* ok owner, see if job has "on" */

            pdep = find_depend(JOB_DEPEND_TYPE_ON, pattr);

            if (pdep == 0)
              {
              /* no "on", see if child already registered */

              revtype = type ^ (JOB_DEPEND_TYPE_BEFORESTART - JOB_DEPEND_TYPE_AFTERSTART);

              pdep = find_depend(revtype, pattr);

              if (pdep == 0)
                {
                /* no "on" and no prior - return error */

                rc = PBSE_BADDEPEND;
                }
              else
                {
                pdj = find_dependjob(pdep, preq->rq_ind.rq_register.rq_child);

                if (pdj)
                  {
                  /* has prior register, update it */

                  (void)strcpy(pdj->dc_svr, preq->rq_ind.rq_register.rq_svr);
                  }
                }
              }
            else if ((rc = register_dep(pattr, preq, type, &made)) == 0)
              {
              if (made)
                {
                /* first time registered */

                if (--pdep->dp_numexp <= 0)
                  del_depend(pdep);
                }
              }
            }

          break;

        default:

#ifdef NAS /* localmod 109 */
          sprintf(log_buffer, "Unknown dep. op: %d",
            preq->rq_ind.rq_register.rq_op);
          log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
            preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
          rc = PBSE_IVALREQ;

          break;
        }

      break;

    /*
     * Release a dependency so job might run
     */

    case JOB_DEPEND_OP_RELEASE:

      switch (type)
        {
        case JOB_DEPEND_TYPE_BEFORESTART:
        case JOB_DEPEND_TYPE_BEFOREANY:
        case JOB_DEPEND_TYPE_BEFOREOK:
        case JOB_DEPEND_TYPE_BEFORENOTOK:

          /* predecessor sent release-reduce "on", */
          /* see if this job can now run           */

          type ^= (JOB_DEPEND_TYPE_BEFORESTART - JOB_DEPEND_TYPE_AFTERSTART);

          if ((pdep = find_depend(type, pattr)) != NULL)
            {
            pdj = find_dependjob(pdep, preq->rq_ind.rq_register.rq_child);

            if (pdj)
              {
              del_depend_job(pdj);
              pattr->at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;
              savetype = SAVEJOB_FULLFORCE;

              (void)sprintf(log_buffer, msg_registerrel, preq->rq_ind.rq_register.rq_child);
              log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
                pjob->ji_qs.ji_jobid, log_buffer);

              if (GET_NEXT(pdep->dp_jobs) == 0)
                {
                /* no more dependencies of this type */

                del_depend(pdep);

                set_depend_hold(pjob, pattr);
                }

              break;
              }
#ifdef NAS /* localmod 109 */
            sprintf(log_buffer, "Dep.rls. job not found: %d/%s",
              type, preq->rq_ind.rq_register.rq_child);
            }
          else
            {
            sprintf(log_buffer, "Dep.rls. type not found: %d", type);
#endif /* localmod 109 */
            }

#ifdef NAS /* localmod 109 */
          log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
            preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
          rc = PBSE_IVALREQ;

          break;
        }

      break;

    case JOB_DEPEND_OP_READY:

      rc = PBSE_NOSYNCMSTR;

      break;

    case JOB_DEPEND_OP_DELETE:

      (void)sprintf(log_buffer, msg_registerdel, preq->rq_ind.rq_register.rq_child);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
        pjob->ji_qs.ji_jobid, log_buffer);

      job_abt(pjob, log_buffer);

      break;

    case JOB_DEPEND_OP_UNREG:

      unregister_dep(pattr, preq);

      set_depend_hold(pjob, pattr);

      break;

    default:

      sprintf(log_buffer, msg_illregister, preq->rq_ind.rq_register.rq_parent);
      log_event(PBSEVENT_DEBUG | PBSEVENT_SYSTEM | PBSEVENT_ERROR,
        PBS_EVENTCLASS_REQUEST, LOG_INFO, preq->rq_host, log_buffer);

      rc = PBSE_IVALREQ;

      break;
    }

  if (rc)
    {
    pjob->ji_modified = 0;

    req_reject(rc, 0, preq);
    }
  else
    {
    /* If this is an array job, forcibly save it to ensure
     * dependencies are recorded. */

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob)
      savetype = SAVEJOB_FULLFORCE;

    if (pjob->ji_modified)
      (void)job_save(pjob, savetype);

    reply_ack(preq);
    }

  return;
  }  /* END req_register() */
int execute_job_delete( job *pjob, /* M */ char *Msg, /* I */ struct batch_request *preq) /* I */ { struct work_task *pwtnew; int rc; char *sigt = "SIGTERM"; int has_mutex = TRUE; char log_buf[LOCAL_LOG_BUF_SIZE]; time_t time_now = time(NULL); long force_cancel = FALSE; long array_compatible = FALSE; chk_job_req_permissions(&pjob,preq); if (pjob == NULL) { /* preq is rejected in chk_job_req_permissions here */ return(-1); } if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* see note in req_delete - not sure this is possible still, * but the deleted code is irrelevant now. I will leave this * part --dbeer */ unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); return(-1); } if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 ) { /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */ /* retry in one second */ /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the job is being requeued. Wait until finished */ static time_t cycle_check_when = 0; static char cycle_check_jid[PBS_MAXSVRJOBID + 1]; if (cycle_check_when != 0) { if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) && (time_now - cycle_check_when > 10)) { /* state not updated after 10 seconds */ /* did the mom ever get it? delete it anyways... */ cycle_check_jid[0] = '\0'; cycle_check_when = 0; goto jump; } if (time_now - cycle_check_when > 20) { /* give up after 20 seconds */ cycle_check_jid[0] = '\0'; cycle_check_when = 0; } } /* END if (cycle_check_when != 0) */ if (cycle_check_when == 0) { /* new PRERUN job located */ cycle_check_when = time_now; strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid); } sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request"); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); if (pwtnew == NULL) { req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL); return(-1); } else { return(ROUTE_DELETE); } } /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */ jump: /* * Log delete and if requesting client is not job owner, send mail. 
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE: should incorporate job delete message */

  if (Msg != NULL) {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
  }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE)) {

    /* only send email if the owner did not delete the job and the
     * delete has not previously been attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */
    if (Msg != NULL) {
      Msg = NULL;
    }
  }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
  }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
    /*
     * setup a nanny task to make sure the job is actually deleted
     * (see the comments at job_delete_nanny()).
     */
    if (pjob->ji_has_delete_nanny == TRUE) {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
    }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id)))) {
      /* can't send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
    }

    /* normally will ack reply when mom responds */
    if (pjob != NULL) {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    }

    return(-1);
  }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0) {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
  }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE)) {

    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE)) {

      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++) {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL) {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
        } else {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0) {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
          }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
        }
      }

      if (LOGLEVEL >= 7) {
        sprintf(log_buf, "%s: unlocking ai_mutex",
            __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

      pthread_mutex_unlock(pa->ai_mutex);
    }
  }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) {
    /* job has restart file at mom, do end-of-job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7) {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
  } else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0) {
    /* job has staged-in files; they should be removed */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
  } else {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL) {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7) {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
    } else {
      KeepSeconds = 0;
    }

    if (pjob != NULL) {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit,
               strdup(pjob->ji_qs.ji_jobid), FALSE);
    } else {
      has_mutex = FALSE;
    }
  }  /* END else (neither checkpoint file nor staged-in files) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
}  /* END execute_job_delete() */
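In the completed-job branch above, `attr_ifelse_long()` resolves KeepSeconds by precedence: the queue's keep_completed value when explicitly set, otherwise the server's, otherwise the supplied default. A minimal sketch of that precedence rule follows; the `fake_attr` type and `fake_attr_ifelse_long()` are simplified stand-ins for illustration, not the real pbs_attribute or attr_ifelse_long().

/* Sketch (assumed types, not the server's): queue-overrides-server
 * attribute resolution with an explicit default. */
#include <stdio.h>

#define FAKE_ATR_VFLAG_SET 0x01

typedef struct {
  int  at_flags;  /* FAKE_ATR_VFLAG_SET when a value was explicitly set */
  long at_val;
} fake_attr;

static long fake_attr_ifelse_long(
  const fake_attr *preferred,  /* e.g. the queue's keep_completed  */
  const fake_attr *fallback,   /* e.g. the server's keep_completed */
  long             deflt)      /* used when neither is set         */
{
  if (preferred->at_flags & FAKE_ATR_VFLAG_SET)
    return(preferred->at_val);

  if (fallback->at_flags & FAKE_ATR_VFLAG_SET)
    return(fallback->at_val);

  return(deflt);
}

int main(void)
{
  fake_attr queue_kc  = { 0, 0 };                     /* unset            */
  fake_attr server_kc = { FAKE_ATR_VFLAG_SET, 300 };  /* set to 5 minutes */

  /* queue unset -> server value wins (prints 300) */
  printf("KeepSeconds = %ld\n", fake_attr_ifelse_long(&queue_kc, &server_kc, 0));

  /* queue set -> queue value wins (prints 60) */
  queue_kc.at_flags = FAKE_ATR_VFLAG_SET;
  queue_kc.at_val   = 60;
  printf("KeepSeconds = %ld\n", fake_attr_ifelse_long(&queue_kc, &server_kc, 0));

  return 0;
}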
static void stat_update(

  struct work_task *pwt)

{
  struct stat_cntl     *cntl;
  job                  *pjob;
  struct batch_request *preq;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;  /* index of an erroneous attribute, set by modify_job_attr() */

  preq   = pwt->wt_parm1;
  preply = &preq->rq_reply;
  cntl   = preq->rq_extra;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL) {
      if ((pjob = find_job(pstatus->brp_objname))) {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long) {
          /* first save since running job (or the sid has changed), */
          /* must save session id */
          job_save(pjob, SAVEJOB_FULL);

          svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL);
        }

#ifdef USESAVEDRESOURCES
        else {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL);
        }
#endif /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
      }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
    }  /* END while (pstatus != NULL) */
  }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else {
    if (preply->brp_code == PBSE_UNKJOBID) {
      /* we sent a stat request, but mom says it doesn't know anything
       * about the job */
      if ((pjob = find_job(preq->rq_ind.rq_status.rq_id))) {
        /* job really isn't running any more - mom doesn't know anything
         * about it.  this can happen if a diskless node reboots and the
         * mom_priv/jobs directory is cleared.  set its state to queued
         * so job_abt doesn't think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

        rel_resc(pjob);

        job_abt(&pjob, "Job does not exist on node");

        /* TODO: if the job is rerunnable we should set its state back
         * to queued */
      }
    }
  }

  release_req(pwt);

  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl);  /* continue where we left off */
  else
    free(cntl);           /* a bit of a kludge but it saves an extra func */

  return;
}  /* END stat_update() */
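The tail of stat_update() uses a small continuation idiom: the stat controller either resumes collection through its sc_post hook or is freed on the spot. A toy sketch of the same pattern follows; the struct and field names here are hypothetical stand-ins, not the real stat_cntl.

/* Sketch (assumed types): the resume-or-free continuation idiom. */
#include <stdio.h>
#include <stdlib.h>

struct fake_cntl {
  int    conn;                       /* connection slot, -1 = closed */
  void (*post)(struct fake_cntl *);  /* continuation, may be NULL    */
};

static void resume_stat(struct fake_cntl *c)
{
  printf("resuming status collection, conn=%d\n", c->conn);
  free(c);  /* the continuation owns the controller from here */
}

static void finish_update(struct fake_cntl *c)
{
  c->conn = -1;

  if (c->post)
    c->post(c);  /* continue where we left off */
  else
    free(c);     /* nothing pending; reclaim the controller */
}

int main(void)
{
  struct fake_cntl *c = malloc(sizeof(*c));

  if (c == NULL)
    return 1;

  c->conn = 7;
  c->post = resume_stat;  /* swap for NULL to exercise the free path */

  finish_update(c);

  return 0;
}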