void req_runjob(

  struct batch_request *preq)  /* I (modified) */

  {
  job  *pjob;
  int   rc;
  void *bp;
  int   setneednodes;

  char  failhost[1024];
  char  emsg[1024];

  /* chk_job_torun will extract job id and assign hostlist if specified */

  if (getenv("TORQUEAUTONN"))
    setneednodes = 1;
  else
    setneednodes = 0;

  if ((pjob = chk_job_torun(preq, setneednodes)) == NULL)
    {
    /* FAILURE - chk_job_torun performs req_reject internally */

    return;
    }

  /* we don't currently allow running of an entire job array */

  if (strstr(pjob->ji_qs.ji_jobid, "[]") != NULL)
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, "cannot run a job array");

    return;
    }

  if (preq->rq_conn == scheduler_sock)
    ++scheduler_jobct; /* see scheduler_close() */

  sprintf(log_buffer, msg_manager,
    msg_jobrun,
    preq->rq_user,
    preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* If async run, reply now; otherwise reply is handled in */
  /* post_sendmom or post_stagein                           */

  /* perhaps node assignment should be handled immediately in async run? */

  if ((preq != NULL) && (preq->rq_type == PBS_BATCH_AsyrunJob))
    {
    reply_ack(preq);

    preq = NULL;  /* cleared so we don't try to reuse */
    }

  /* if the job is part of an array, check the slot limit */

  if ((pjob->ji_arraystruct != NULL) &&
      (pjob->ji_is_array_template == FALSE))
    {
    job_array *pa = pjob->ji_arraystruct;

    if ((pa->ai_qs.slot_limit < 0) ||
        (pa->ai_qs.slot_limit > pa->ai_qs.jobs_running))
      {
      update_array_values(pa, pjob, pjob->ji_qs.ji_state, aeRun);
      }
    else
      {
      snprintf(log_buffer, sizeof(log_buffer),
        "Cannot run job. Array slot limit is %d and there are already %d jobs running\n",
        pa->ai_qs.slot_limit,
        pa->ai_qs.jobs_running);

      if (preq != NULL)
        req_reject(PBSE_IVALREQ, 0, preq, NULL, log_buffer);

      return;
      }
    }

  /* NOTE:  nodes assigned to job in svr_startjob() */

  rc = svr_startjob(pjob, preq, failhost, emsg);

  if ((rc != 0) && (preq != NULL))
    {
    free_nodes(pjob);

    /* if the job has a non-empty rejectdest list, pass the first host into req_reject() */

    if ((bp = GET_NEXT(pjob->ji_rejectdest)) != NULL)
      {
      req_reject(rc, 0, preq, ((struct badplace *)bp)->bp_dest, "could not contact host");
      }
    else
      {
      req_reject(rc, 0, preq, failhost, emsg);
      }
    }

  return;
  }  /* END req_runjob() */
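/*
 * Illustrative only -- not part of this file.  A minimal client-side sketch,
 * assuming the standard PBS IFL API (pbs_connect(), pbs_runjob(),
 * pbs_asyrunjob(), pbs_geterrmsg(), pbs_disconnect() from pbs_ifl.h), showing
 * how the run requests handled above might be issued.  The async path maps to
 * PBS_BATCH_AsyrunJob, for which req_runjob() replies immediately via
 * reply_ack(); the synchronous path is answered later from post_sendmom() or
 * post_stagein().  The helper name run_job_example is hypothetical; the sketch
 * would be built separately against the client library, not compiled here.
 */
#if 0
#include <stdio.h>
#include "pbs_ifl.h"

static int run_job_example(char *server, char *job_id, char *location, int async)
  {
  int   conn;
  int   rc;
  char *err;

  conn = pbs_connect(server);   /* NULL server means the default server */

  if (conn < 0)
    return(-1);

  if (async)
    rc = pbs_asyrunjob(conn, job_id, location, NULL);
  else
    rc = pbs_runjob(conn, job_id, location, NULL);

  if (rc != 0)
    {
    err = pbs_geterrmsg(conn);

    fprintf(stderr, "run failed: %s\n", (err != NULL) ? err : "unknown error");
    }

  pbs_disconnect(conn);

  return(rc);
  }
#endif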
int execute_job_delete(

  job                  *pjob,  /* M */
  char                 *Msg,   /* I */
  struct batch_request *preq)  /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  const char       *sigt = "SIGTERM";
  const char       *del = "delete";

  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob, preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pjob->ji_qs.ji_jobid);

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3)
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it going */
    /* retry in one second                                                      */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3, the job is being
       requeued.  Wait until finished */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyway... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq, FALSE);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log the delete and, if the requesting client is not the job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    int len = strlen(log_buf);

    snprintf(log_buf + len, sizeof(log_buf) - len, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send mail if the owner did not delete the job and a delete
     * has not previously been attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message,
     * reset Msg so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(del), strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }
    else
      job_mutex.set_unlock_on_exit(false);

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        job_mutex.set_unlock_on_exit(false);
        return(-1);
        }

      std::string dup_job_id(pjob->ji_qs.ji_jobid);

      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          job_mutex.unlock();

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(*tmp, newstate, newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

              pjob = svr_find_job((char *)dup_job_id.c_str(), FALSE); // Job might have disappeared.
              job_mutex.set_lock_state(true);

              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }

          if ((pjob = svr_find_job((char *)dup_job_id.c_str(), FALSE)) == NULL) // Job disappeared.
            {
            break;
            }

          job_mutex.set_lock_state(true);
          }

        /* pjob can be NULL here if the job disappeared while iterating */
        if ((pjob != NULL) &&
            (pjob->ji_qs.ji_state != JOB_STATE_RUNNING))
          {
          long job_atr_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
          int  job_exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
          int  job_state = pjob->ji_qs.ji_state;

          job_mutex.unlock();

          update_array_values(pa, job_state, aeTerminate,
            (char *)dup_job_id.c_str(), job_atr_hold, job_exit_status);

          if ((pjob = svr_find_job((char *)dup_job_id.c_str(), FALSE)) != NULL)
            job_mutex.mark_as_locked();
          }

        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      }
    } /* END MoabArrayCompatible check */

  if (pjob == NULL)
    {
    job_mutex.set_unlock_on_exit(false);
    return(-1);
    }

  depend_on_term(pjob);

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */

    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files; remove them */

    remove_stagein(&pjob);

    job_mutex.set_unlock_on_exit(false);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }

  delete_inactive_job(&pjob, Msg);

  if (pjob == NULL)
    job_mutex.set_unlock_on_exit(false);

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
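/*
 * Illustrative only -- not part of this file.  A minimal client-side sketch,
 * assuming the standard PBS IFL API (pbs_connect(), pbs_deljob(),
 * pbs_geterrmsg(), pbs_disconnect() from pbs_ifl.h), showing the kind of
 * delete request that ultimately reaches execute_job_delete() above.  Any
 * client-supplied text arrives via the request extension (cf. the "have text
 * message in request extension" handling above) and ends up as the Msg
 * argument.  The helper name delete_job_example is hypothetical; the sketch
 * would be built separately against the client library, not compiled here.
 */
#if 0
#include <stdio.h>
#include "pbs_ifl.h"

static int delete_job_example(char *server, char *job_id, char *message)
  {
  int   conn;
  int   rc;
  char *err;

  conn = pbs_connect(server);   /* NULL server means the default server */

  if (conn < 0)
    return(-1);

  /* message may be NULL; when set, the server appends it to the delete log
   * entry and to the mail sent to the job owner */
  rc = pbs_deljob(conn, job_id, message);

  if (rc != 0)
    {
    err = pbs_geterrmsg(conn);

    fprintf(stderr, "delete failed: %s\n", (err != NULL) ? err : "unknown error");
    }

  pbs_disconnect(conn);

  return(rc);
  }
#endif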