/**
 * @brief
 *	chk_array_doneness - check if all subjobs are expired and if so,
 *	purge the Array Job itself
 *
 *	The array is treated as finished when its tracking table counts no
 *	subjobs in the QUEUED, RUNNING or EXITING states.  In that case an
 *	exit status is derived from the per-subjob error fields, end-of-job
 *	processing is performed on the parent, and the parent is handed to
 *	svr_saveorpurge_finjobhist().  Otherwise the parent is simply saved.
 *
 *	Nothing is done when the parent has no tracking table or when the
 *	table carries the TKMFLG_NO_DELETE flag.
 *
 * @param[in,out]	parent - pointer to parent job.
 *
 * @return	void
 */
void
chk_array_doneness(job *parent)
{
	char acctbuf[40];
	int e;
	int i;
	struct ajtrkhd *ptbl = parent->ji_ajtrk;

	if (ptbl == NULL)
		return;

	if (ptbl->tkm_flags & TKMFLG_NO_DELETE)
		return; /* delete of subjobs in progress, don't touch the array */

	if (ptbl->tkm_subjsct[JOB_STATE_QUEUED] +
		ptbl->tkm_subjsct[JOB_STATE_RUNNING] +
		ptbl->tkm_subjsct[JOB_STATE_EXITING] == 0) {

		/* Array Job all done, do simple eoj processing */

		/*
		 * Derive the array's exit status from the subjob table:
		 *   0 - no subjob recorded an error,
		 *   1 - at least one subjob has a positive error value,
		 *   2 - a subjob has a negative error value (scan stops there).
		 */
		for (e = i = 0; i < ptbl->tkm_ct; ++i) {
			if (ptbl->tkm_tbl[i].trk_error > 0) {
				e = 1;
			} else if (ptbl->tkm_tbl[i].trk_error < 0) {
				e = 2;
				break;
			}
		}

		parent->ji_qs.ji_un_type = JOB_UNION_TYPE_EXEC;
		parent->ji_qs.ji_un.ji_exect.ji_momaddr = 0;
		parent->ji_qs.ji_un.ji_exect.ji_momport = 0;
		parent->ji_qs.ji_un.ji_exect.ji_exitstat = e;

		check_block(parent, "");

		if (parent->ji_qs.ji_state == JOB_STATE_BEGUN) {
			/* if BEGUN, issue 'E' account record */
			/* bounded write: the format string is defined outside this file */
			snprintf(acctbuf, sizeof(acctbuf), msg_job_end_stat, e);
			account_job_update(parent, PBS_ACCT_LAST);
			account_jobend(parent, acctbuf, PBS_ACCT_END);

			svr_mailowner(parent, MAIL_END, MAIL_NORMAL, acctbuf);
		}

		if (parent->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
			(void)depend_on_term(parent);

		/*
		 * Check if the history of the finished job can be saved or
		 * if it needs to be purged.
		 */
		svr_saveorpurge_finjobhist(parent);
	} else {
		/* subjobs are still active: just persist the parent */
		(void)job_save(parent, SAVEJOB_FULL);
	}
}
/*
 * force_purge_work()
 *
 * Purges a job from the server without checking with its MOM.
 *
 * Steps, in order:
 *   - log the purge and release the job's nodes via free_nodes();
 *   - if the job's queue is found and is an execution queue, release the
 *     queue mutex and decrement the assigned resources;
 *   - run dependency termination, mark the job COMPLETE/COMPLETE, and purge
 *     it with svr_job_purge().
 *
 * get_jobs_queue() receives the address of pjob; this function checks pjob
 * for NULL afterwards, so every later use of pjob is kept inside that check.
 *
 * @param pjob - the job to purge.
 */
void force_purge_work(

  job *pjob)

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;

  snprintf(log_buf, sizeof(log_buf), "purging job %s without checking MOM", pjob->ji_qs.ji_jobid);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  free_nodes(pjob);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* NOTE(review): presumably the 'true' means the queue mutex is already
     * held and should be released when pque_mutex leaves scope - confirm
     * against mutex_mgr */
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    if (pque->qu_qs.qu_type == QTYPE_Execution)
      {
      pque_mutex.unlock();
      set_resc_assigned(pjob, DECR);
      }
    }

  /* pjob may have been cleared by get_jobs_queue(); do not hand a NULL job
   * to depend_on_term() or svr_setjobstate() */
  if (pjob != NULL)
    {
    depend_on_term(pjob);

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if (is_ms_on_server(pjob))
      {
      if (LOGLEVEL >= 7)
        {
        snprintf(log_buf, sizeof(log_buf), "Mother Superior is on the server, not cleaning spool files in %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      svr_job_purge(pjob, 1);
      }
    else
      svr_job_purge(pjob);
    }
  } /* END force_purge_work() */
/*
 * execute_job_delete()
 *
 * Carries out a delete request against a single job.
 *
 * Flow:
 *   1. chk_job_req_permissions() validates the request; if it clears pjob
 *      the function returns immediately.
 *   2. Jobs in TRANSIT state are not handled here.
 *   3. Jobs in substate PRERUN/RERUN/RERUN1/RERUN2/RERUN3 have the delete
 *      re-queued one second later via a timed task, unless the same job has
 *      been seen in that condition for more than 10 seconds, in which case
 *      the delete proceeds anyway.
 *   4. The delete is recorded in accounting and the log; mail is sent when
 *      svr_chk_owner() returns non-zero and no delete nanny exists yet.
 *   5. RUNNING jobs get a delete nanny and a SIGTERM issued to the MOM; the
 *      function then returns -1 (the reply is completed later).
 *   6. Non-running jobs: optional force-cancel task, optional release of a
 *      slot-limit hold on a sibling array job, dependency termination, and
 *      final cleanup (on_job_exit task / stage-in removal + abort /
 *      delete_inactive_job()).
 *
 * @param pjob - (M) the job to delete
 * @param Msg  - (I) optional text from the request extension, may be NULL
 * @param preq - (I) the delete batch request
 *
 * @return -1           on every early exit: permission failure, TRANSIT
 *                      state, task creation failure, the running-job path,
 *                      or the job disappearing during array processing
 *         ROUTE_DELETE when the delete was re-queued for a later retry
 *         PBSE_NONE    when the non-running path ran to completion
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  const char       *sigt = "SIGTERM";
  const char       *del = "delete";

  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  /* NOTE(review): presumably the 'true' means the job mutex is already held
   * and will be released when job_mutex leaves scope - confirm against
   * mutex_mgr */
  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pjob->ji_qs.ji_jobid);

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* NOTE(review): function-static state shared by every call; no lock
     * protecting it is visible in this function */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    int len = strlen(log_buf);

    snprintf(log_buf + len, sizeof(log_buf) - len, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */
    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */
    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */
    if (pjob->ji_has_delete_nanny == TRUE)
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1,strdup(del), strdup(preq->rq_id))))
      {
      /* cant send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    else
      job_mutex.set_unlock_on_exit(false); /* issue_signal() cleared pjob */

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        job_mutex.set_unlock_on_exit(false);
        return(-1);
        }

      /* private copy of the id: pjob is unlocked and re-looked-up below */
      std::string dup_job_id(pjob->ji_qs.ji_jobid);

      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          job_mutex.unlock();

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            /* sibling no longer exists: drop its id from the array */
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              /* release the slot-limit hold on this sibling and stop */
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(*tmp, newstate, newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

              pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE); //Job might have disappeared.
              job_mutex.set_lock_state(true);

              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }

          if ((pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE)) == NULL) //Job disappeared.
            {
            break;
            }

          job_mutex.set_lock_state(true);
          }

        /* Both break paths above can leave pjob NULL when the re-lookup
         * fails, so it must be checked before it is dereferenced. */
        if ((pjob != NULL) &&
            (pjob->ji_qs.ji_state != JOB_STATE_RUNNING))
          {
          /* copy what update_array_values() needs before unlocking */
          long job_atr_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
          int  job_exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
          int  job_state = pjob->ji_qs.ji_state;

          job_mutex.unlock();

          update_array_values(pa,job_state,aeTerminate,
              (char*)dup_job_id.c_str(), job_atr_hold, job_exit_status);

          if((pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE)) != NULL)
            job_mutex.mark_as_locked();
          }

        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      }
    } /* END MoabArrayCompatible check */

  if (pjob == NULL)
    {
    /* job vanished while it was unlocked above */
    job_mutex.set_unlock_on_exit(false);
    return -1;
    }

  depend_on_term(pjob);

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    job_mutex.set_unlock_on_exit(false);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }

  /* NOTE(review): this runs after both branches above, including after
   * job_abt(); confirm delete_inactive_job() tolerates *pjob being NULL */
  delete_inactive_job(&pjob, Msg);

  if (pjob == NULL)
    job_mutex.set_unlock_on_exit(false);

  return(PBSE_NONE);
  } /* END execute_job_delete() */