/*
 * ensure_deleted - work-task handler that purges a job without contacting
 * its MOM.
 *
 * ptask->wt_parm1 is the original delete batch_request; only its job name
 * (rq_ind.rq_delete.rq_objname) is read here.  If the job no longer exists
 * there is nothing to do.  Otherwise the job's nodes are released, assigned
 * resources are decremented for execution-queue jobs, and the job is purged.
 *
 * NOTE(review): neither ptask nor preq is freed or replied to here --
 * presumably the work-task framework owns them; confirm against the caller.
 */
void ensure_deleted(

  struct work_task *ptask)  /* I */

  {
  struct batch_request *preq;
  job                  *pjob;

  preq = ptask->wt_parm1;

  if ((pjob = find_job(preq->rq_ind.rq_delete.rq_objname)) == NULL)
    {
    /* job doesn't exist, we're done */

    return;
    }

  sprintf(log_buffer, "purging job without checking MOM");

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* release the nodes allocated to the job */

  free_nodes(pjob);

  /* only execution-queue jobs have resources counted in resources_assigned */

  if (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution)
    {
    set_resc_assigned(pjob, DECR);
    }

  job_purge(pjob);
  }  /* END ensure_deleted() */
/*
 * force_purge_work - purge a job from the server without contacting its MOM
 * (threaded variant: pjob's mutex is held by the caller on entry).
 *
 * Frees the job's nodes, decrements assigned resources for execution-queue
 * jobs, then purges the job.  get_jobs_queue() may NULL out pjob if the job
 * disappears while the queue is being located, hence the final NULL check.
 */
void force_purge_work(

  job *pjob)

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;

  snprintf(log_buf, sizeof(log_buf), "purging job %s without checking MOM",
    pjob->ji_qs.ji_jobid);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  free_nodes(pjob);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution)
      {
      /* NOTE(review): queue is unlocked before set_resc_assigned() --
       * presumably to avoid holding the queue mutex while touching
       * server-wide resource accounting; confirm the intended lock order. */
      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      set_resc_assigned(pjob, DECR);
      }
    else
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
    }

  /* pjob can be NULLed by get_jobs_queue() if the job vanished */

  if (pjob != NULL)
    svr_job_purge(pjob);
  }  /* END force_purge_work() */
/*
 * post_job_delete_nanny - reply handler for the signal the delete nanny
 * sent to MOM.
 *
 * preq_sig: the signal batch_request whose reply code is examined; this
 * function owns it and frees it on every path.
 *
 * If the nanny feature has been disabled, just discard the request.  If MOM
 * no longer knows the job (PBSE_UNKJOBID), release its nodes and resources
 * and purge it.  Otherwise log and release the job lock.
 *
 * BUG FIX: the original fell through to unlock_ji_mutex(pjob, ...) even on
 * the pjob == NULL path, dereferencing a NULL job pointer.  The unlock is
 * now guarded.
 */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int   rc;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buf, "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    /* release the job's nodes and resource accounting, then purge */
    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);
    svr_job_purge(pjob);

    return;
    }

  /* only unlock when svr_find_job() actually returned (and locked) a job */
  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  /* free task */
  free_br(preq_sig);

  return;
  }  /* END post_job_delete_nanny() */
/*
 * force_purge_work - purge a job from the server without contacting its MOM
 * (mutex_mgr variant: caller holds pjob's mutex on entry).
 *
 * Frees the job's nodes, decrements assigned resources for execution-queue
 * jobs, runs dependency termination, marks the job complete, and purges it.
 * When Mother Superior runs on this host the spool files are preserved
 * (svr_job_purge(pjob, 1)).
 *
 * BUG FIX: get_jobs_queue() can NULL out pjob if the job disappears while
 * its queue is being located.  The original only checked pjob just before
 * svr_job_purge(), after depend_on_term() and svr_setjobstate() had already
 * dereferenced it.  The NULL check now happens before any further use.
 */
void force_purge_work(

  job *pjob)

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;

  snprintf(log_buf, sizeof(log_buf), "purging job %s without checking MOM",
    pjob->ji_qs.ji_jobid);
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  free_nodes(pjob);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    if (pque->qu_qs.qu_type == QTYPE_Execution)
      {
      /* release the queue lock before touching resource accounting */
      pque_mutex.unlock();

      set_resc_assigned(pjob, DECR);
      }
    }

  /* bail out before dereferencing a job that vanished in get_jobs_queue() */
  if (pjob == NULL)
    return;

  depend_on_term(pjob);

  svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

  if (is_ms_on_server(pjob))
    {
    char ms_log_buf[LOCAL_LOG_BUF_SIZE];

    if (LOGLEVEL >= 7)
      {
      snprintf(ms_log_buf, sizeof(ms_log_buf),
        "Mother Superior is on the server, not cleaning spool files in %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, ms_log_buf);
      }

    /* leave the spool files in place */
    svr_job_purge(pjob, 1);
    }
  else
    svr_job_purge(pjob);
  }  /* END force_purge_work() */
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; pbs_queue *pque; char *preq_clt_id; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt = NULL; /* original client request */ int rc; time_t time_now = time(NULL); preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; preq_clt_id = preq_sig->rq_extra; free_br(preq_sig); if (preq_clt_id != NULL) { preq_clt = get_remove_batch_request(preq_clt_id); free(preq_clt_id); } /* the client request has been handled another way, nothing left to do */ if (preq_clt == NULL) return; pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); svr_job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. 
*/ if (delay == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); pthread_mutex_unlock(server.sv_attr_mutex); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob != NULL) return; } set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE); /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END post_delete_mom1() */
/*
 * delay_and_send_sig_kill - reply handler for the SIGTERM sent to MOM;
 * computes the kill delay and schedules the follow-up SIGKILL.
 *
 * preq_sig: the signal request whose reply code is examined; its rq_extend
 * names the original client request (re-fetched via
 * get_remove_batch_request()).  Delay precedence: per-job user kill delay,
 * then queue/server kill_delay (default 0).
 *
 * BUG FIXES vs original:
 *  - the PBSE_UNKJOBID branch called unlock_ji_mutex() directly while
 *    pjob_mutex still believed it held the lock, so the manager's destructor
 *    unlocked the mutex again AFTER svr_job_purge() had freed the job
 *    (double-unlock + use-after-free).  It now releases via pjob_mutex.
 *  - the null-queue branch dereferenced pjob, which get_jobs_queue() may
 *    have NULLed out; now guarded.
 *  - the job id is copied for set_task() BEFORE the job mutex is released.
 */
void delay_and_send_sig_kill(

  batch_request *preq_sig)

  {
  int            delay = 0;
  job           *pjob;
  pbs_queue     *pque;
  batch_request *preq_clt = NULL; /* original client request */
  int            rc;
  time_t         time_now = time(NULL);
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      /* release through the manager so its destructor won't touch the
       * mutex again after the job has been purged */
      pjob_mutex.unlock();

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
      }
    }
  else
    {
    /* why is the pque null. Something went wrong */
    if (pjob == NULL)
      {
      /* job vanished in get_jobs_queue(); nothing safe left to touch */
      pjob_mutex.set_unlock_on_exit(false);
      req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, NULL);
      return;
      }

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
      pjob->ji_qs.ji_jobid);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
    }

  /* copy the job id while the job is still locked */
  char *jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  pjob_mutex.unlock();
  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
  }  // END delay_and_send_sig_kill()
/*
 * forced_jobpurge - handle a "purge" (delpurgestr) delete request by
 * removing the job without contacting its MOM.
 *
 * Returns:
 *    1 - job was force-purged (request already acked)
 *    0 - not a purge request; caller continues normal delete processing
 *   -1 - failure (unknown job or no permission); request already rejected
 *
 * Purge is permitted for operators/managers, or for the job owner when the
 * server's owner_purge attribute is enabled.
 */
static int forced_jobpurge(

  struct batch_request *preq)

  {
  job *pjob;

  if ((pjob = find_job(preq->rq_ind.rq_delete.rq_objname)) == NULL)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_delete.rq_objname,
      pbse_to_txt(PBSE_UNKJOBID));

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return(-1);
    }

  /* check about possibly purging the job */

  if (preq->rq_extend != NULL)
    {
    if (!strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* operator/manager privilege, or owner with owner_purge enabled */

      if (((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) != 0) ||
          ((svr_chk_owner(preq, pjob) == 0) && (server.sv_attr[SRV_ATR_OwnerPurge].at_val.at_long)))
        {
        sprintf(log_buffer, "purging job without checking MOM");

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buffer);

        /* ack first, then tear down the job */

        reply_ack(preq);

        free_nodes(pjob);

        if (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution)
          {
          set_resc_assigned(pjob, DECR);
          }

        job_purge(pjob);

        return(1);
        }
      else
        {
        /* FAILURE */

        req_reject(PBSE_PERM, 0, preq, NULL, NULL);

        return(-1);
        }
      }
    }

  return(0);
  }  /* END forced_jobpurge() */
/*
 * post_delete_mom1 - phase-one reply handler for the SIGTERM sent to MOM
 * during a job delete (legacy single-threaded variant).
 *
 * pwt->wt_parm1 is the signal request sent to MOM; its rq_extra points at
 * the original client delete request.  release_req() disposes of the work
 * task and signal request.  If MOM rejected with PBSE_UNKJOBID the job is
 * purged outright; on success the client is acked, a delay is taken from
 * the request extension or the queue/server kill_delay (default 2), and
 * phase two (post_delete_mom2) is scheduled.
 *
 * NOTE(review): uses the file-global time_now rather than calling time() --
 * consistent with this code generation's single-threaded main loop.
 */
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;
  struct work_task     *pwtnew;
  pbs_queue            *pque;
  struct batch_request *preq_sig;  /* signal request to MOM */
  struct batch_request *preq_clt;  /* original client request */
  int                   rc;

  preq_sig = pwt->wt_parm1;
  rc = preq_sig->rq_reply.brp_code;
  preq_clt = preq_sig->rq_extra;

  release_req(pwt);

  pjob = find_job(preq_clt->rq_ind.rq_delete.rq_objname);

  if (pjob == NULL)
    {
    /* job has gone away */

    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  /* an explicit delay may ride in on the client request's extension */

  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* dont need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */

  if (delay == 0)
    {
    pque = pjob->ji_qhdr;

    delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
    }

  pwtnew = set_task(WORK_Timed, delay + time_now, post_delete_mom2, pjob);

  if (pwtnew)
    {
    /* insure that work task will be removed if job goes away */

    append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
    }

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */

  apply_job_delete_nanny(pjob, time_now + delay + 60);

  return;
  }  /* END post_delete_mom1() */
/*
 * post_job_delete_nanny - reply handler for the signal the delete nanny
 * sent to MOM (legacy single-threaded variant).
 *
 * If the admin has since disabled the nanny, just release the work task.
 * If MOM no longer knows the job (PBSE_UNKJOBID), release its nodes and
 * resource accounting and purge it.  A missing job at this point is logged
 * as a server bug.  release_req() frees both the task and preq_sig.
 */
static void post_job_delete_nanny(

  struct work_task *pwt)

  {
  struct batch_request *preq_sig;  /* signal request to MOM */
  int                   rc;
  job                  *pjob;

  preq_sig = pwt->wt_parm1;
  rc       = preq_sig->rq_reply.brp_code;

  if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long)
    {
    /* the admin disabled nanny within the last minute or so */

    release_req(pwt);

    return;
    }

  /* extract job id from task */

  pjob = find_job(preq_sig->rq_ind.rq_signal.rq_jid);

  if (pjob == NULL)
    {
    sprintf(log_buffer, "job delete nanny: the job disappeared (this is a BUG!)");

    LOG_EVENT(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      preq_sig->rq_ind.rq_signal.rq_jid,
      log_buffer);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buffer, "job delete nanny returned, but does not exist on mom");

    LOG_EVENT(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      preq_sig->rq_ind.rq_signal.rq_jid,
      log_buffer);

    /* MOM doesn't know the job; tear it down server-side */

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    job_purge(pjob);
    }

  /* free task */

  release_req(pwt);

  return;
  }  /* END post_job_delete_nanny() */
/*
 * post_sendmom - work-task handler invoked after the child process that
 * sent a job to MOM has exited.
 *
 * pwt->wt_parm1 is the job, pwt->wt_parm2 the (possibly NULL) run request
 * to answer, pwt->wt_aux the child's wait status.  Dispatch result codes:
 *   0  - send succeeded: ack client, mark job running, start accounting
 *   10 - connection to MOM timed out: mark node down, requeue job
 *   1/default - MOM rejected (commit failed etc.): requeue or, if MOM says
 *        the job already exists (PBSE_JOBEXIST), treat as success
 *
 * NOTE(review): DispatchJob/DispatchTime/DispatchNode appear to be fixed
 * 20-entry file-scope tables recording dispatch times; confirm their
 * declaration elsewhere in this file.
 */
static void post_sendmom(

  struct work_task *pwt)  /* I */

  {
  char                 *id = "post_sendmom";
  int                   newstate;
  int                   newsub;
  int                   r;
  int                   stat;
  job                  *jobp = (job *)pwt->wt_parm1;

  struct batch_request *preq = (struct batch_request *)pwt->wt_parm2;

  char                 *MOMName = NULL;

  int                   jindex;
  long                  DTime = time_now - 10000;  /* fallback if job not in dispatch table */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "entering post_sendmom");
    }

  stat = pwt->wt_aux;

  /* decode the child's exit status */

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    r = 2;  /* cannot get child exit status */

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  /* maintain local struct to associate job id with dispatch time */

  for (jindex = 0;jindex < 20;jindex++)
    {
    if (DispatchJob[jindex] == jobp)
      {
      DTime = DispatchTime[jindex];

      DispatchJob[jindex] = NULL;

      MOMName = DispatchNode[jindex];

      break;
      }
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d",
            (r == 0) ? "success" : "failure",
            time_now - DTime,
            (MOMName != NULL) ? MOMName : "???",
            r);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {

    case 0:

      /* send to MOM went ok */

      jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

      if (preq != NULL)
        reply_ack(preq);

      /* record start time for accounting */

      jobp->ji_qs.ji_stime = time_now;

      /* update resource usage attributes */

      set_resc_assigned(jobp, INCR);

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        /* may be EXITING if job finished first */

        svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

        /* above saves job structure */
        }

      /* accounting log for start or restart */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint");
      else
        account_jobstr(jobp);

      /* if any dependencies, see if action required */

      if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
        depend_on_exec(jobp);

      /*
       * it is unfortunate, but while the job has gone into execution,
       * there is no way of obtaining the session id except by making
       * a status request of MOM.  (Even if the session id was passed
       * back to the sending child, it couldn't get up to the parent.)
       */

      jobp->ji_momstat = 0;

      stat_mom_job(jobp);

      break;

    case 10:

      /* NOTE: if r == 10, connection to mom timed out.  Mark node down */

      stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0);

      /* send failed, requeue the job */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        "unable to run job, MOM rejected/timeout");

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          req_reject(PBSE_MOMREJECT, 0, preq, MOMName, "connection to mom timed out");

        svr_evaljobstate(jobp, &newstate, &newsub, 1);

        svr_setjobstate(jobp, newstate, newsub);
        }
      else
        {
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "job was aborted by mom");
        }

      break;

    case 1:   /* commit failed */

    default:

      {
      int JobOK = 0;

      /* send failed, requeue the job */

      sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d",
              r);

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        log_buffer);

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          {
          char tmpLine[1024];

          if (preq->rq_reply.brp_code == PBSE_JOBEXIST)
            {
            /* job already running, start request failed but return success since
             * desired behavior (job is running) is accomplished */

            JobOK = 1;
            }
          else
            {
            sprintf(tmpLine, "cannot send job to %s, state=%s",
                    (MOMName != NULL) ? MOMName : "mom",
                    PJobSubState[jobp->ji_qs.ji_substate]);

            req_reject(PBSE_MOMREJECT, 0, preq, MOMName, tmpLine);
            }
          }

        if (JobOK == 1)
          {
          /* do not re-establish accounting - completed first time job was started */

          /* update mom-based job status */

          jobp->ji_momstat = 0;

          stat_mom_job(jobp);
          }
        else
          {
          svr_evaljobstate(jobp, &newstate, &newsub, 1);

          svr_setjobstate(jobp, newstate, newsub);
          }
        }
      else
        {
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "send failed - abort");
        }

      break;
      }
    }  /* END switch (r) */

  return;
  }  /* END post_sendmom() */
/*
 * delay_and_send_sig_kill - reply handler for the SIGTERM sent to MOM;
 * computes the kill delay from queue/server kill_delay and schedules the
 * follow-up SIGKILL (send_sig_kill).
 *
 * preq_sig is owned here and freed after its reply code and extension are
 * extracted; rq_extend names the original client request.
 *
 * BUG FIXES vs original:
 *  - when get_jobs_queue() NULLed out pjob, the code called
 *    unlock_ji_mutex(pjob, ...) on the NULL pointer before returning; that
 *    branch now simply returns.
 *  - the job id passed to set_task() was strdup'd AFTER the job mutex was
 *    released; it is now copied while the lock is still held.
 */
void delay_and_send_sig_kill(batch_request *preq_sig)
  {
  int                   delay = 0;
  job                  *pjob;
  pbs_queue            *pque;

  struct batch_request *preq_clt = NULL;  /* original client request */

  int                   rc;
  time_t                time_now = time(NULL);

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  free_br(preq_sig);

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    pthread_mutex_lock(server.sv_attr_mutex);
    delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
    pthread_mutex_unlock(server.sv_attr_mutex);
    }
  else if (pjob == NULL)
    {
    /* job vanished in get_jobs_queue(); the job (and its mutex) are gone,
     * so there is nothing to unlock -- the original dereferenced NULL here */
    return;
    }

  /* copy the job id while the job is still locked */
  char *jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
  }  /* END delay_and_send_sig_kill() */
void req_confirmresv(struct batch_request *preq) { char buf[PBS_MAXQRESVNAME+PBS_MAXHOSTNAME+256] = {0}; /* FQDN resvID+text */ time_t newstart = 0; attribute *petime = NULL; resc_resv *presv = NULL; int rc = 0; int state = 0; int sub = 0; int resv_count = 0; int is_degraded = 0; long next_retry_time = 0; char *execvnodes = NULL; char *next_execvnode = NULL; char **short_xc = NULL; char **tofree = NULL; char *str_time = NULL; extern char server_host[]; int is_being_altered = 0; char *tmp_buf = NULL; size_t tmp_buf_size = 0; if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) { req_reject(PBSE_PERM, 0, preq); return; } presv = find_resv(preq->rq_ind.rq_run.rq_jid); if (presv == NULL) { req_reject(PBSE_UNKRESVID, 0, preq); return; } is_degraded = presv->ri_qs.ri_substate == RESV_DEGRADED ? 1 : 0; is_being_altered = presv->ri_alter_flags; if (preq->rq_extend == NULL) { req_reject(PBSE_resvFail, 0, preq); return; } /* If the reservation was degraded and it could not be reconfirmed by the * scheduler, then the retry time for that reservation is reset to the half- * time between now and the time to reservation start or, if the retry time * is invalid, set it to some time after the soonest occurrence is to start */ if (strcmp(preq->rq_extend, PBS_RESV_CONFIRM_FAIL) == 0) { if (is_degraded && !is_being_altered) { long degraded_time = presv->ri_degraded_time; DBPRT(("degraded_time of %s is %s", presv->ri_qs.ri_resvID, ctime(°raded_time))); next_retry_time = time_now + ((degraded_time - time_now)/2); /* If reservation is still degraded, and time of degraded resv to start * is over cutoff from now, then set a time to try again. 
*/ if (next_retry_time <= (degraded_time - reserve_retry_cutoff)) { set_resv_retry(presv, next_retry_time); str_time = ctime(&(presv->ri_wattr[RESV_ATR_retry].at_val.at_long)); if (str_time != NULL) { str_time[strlen(str_time)-1] = '\0'; (void)snprintf(log_buffer, sizeof(log_buffer), "Next attempt to reconfirm reservation will be made on %s", str_time); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_RESV, LOG_NOTICE, presv->ri_qs.ri_resvID, log_buffer); } } else { /* reached a retry attempt that falls within the cutoff * When processing an advance reservation, unset retry attribute */ if (presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long == 0) { unset_resv_retry(presv); } else { /* When processing a standing reservation, set a retry time * past the end time of the soonest occurrence. */ set_resv_retry(presv, presv->ri_wattr[RESV_ATR_end].at_val.at_long + RESV_RETRY_DELAY); } } } else { if (!is_being_altered) log_event(PBS_EVENTCLASS_RESV, PBS_EVENTCLASS_RESV, LOG_INFO, presv->ri_qs.ri_resvID, "Reservation denied"); /* Clients waiting on an interactive request must be * notified of the failure to confirm */ if ((presv->ri_brp != NULL) && (presv->ri_wattr[RESV_ATR_interactive].at_flags & ATR_VFLAG_SET)) { presv->ri_wattr[RESV_ATR_interactive].at_flags &= ~ATR_VFLAG_SET; snprintf(buf, sizeof(buf), "%s DENIED", presv->ri_qs.ri_resvID); (void)reply_text(presv->ri_brp, PBSE_NONE, buf); presv->ri_brp = NULL; } if (!is_being_altered) { (void)snprintf(log_buffer, sizeof(log_buffer), "requestor=%s@%s", msg_daemonname, server_host); account_recordResv(PBS_ACCT_DRss, presv, log_buffer); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_RESV, LOG_NOTICE, presv->ri_qs.ri_resvID, "reservation deleted"); resv_purge(presv); } } if (presv->ri_qs.ri_state == RESV_BEING_ALTERED) { resv_revert_alter_times(presv); log_event(PBSEVENT_RESV, PBS_EVENTCLASS_RESV, LOG_INFO, presv->ri_qs.ri_resvID, "Reservation alter denied"); } reply_ack(preq); return; } #ifdef NAS /* localmod 122 */ /* If an advance 
reservation has already been confirmed there's no * work to be done. */ if (presv->ri_qs.ri_state == RESV_CONFIRMED && !presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long) { reply_ack(preq); return; } #endif /* localmod 122 */ /* Do not alter a reservation that started running when the reconfirmation * message was received. If a standing reservation, then set a retry time * past the end of this occurrence. */ if (presv->ri_qs.ri_state == RESV_RUNNING) { if (presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long) set_resv_retry(presv, presv->ri_wattr[RESV_ATR_end].at_val.at_long + 10); req_reject(PBSE_TOOLATE, 0, preq); return; } petime = &presv->ri_wattr[RESV_ATR_end]; /* if passed in the confirmation, set a new start time */ if ((newstart = (time_t)preq->rq_ind.rq_run.rq_resch) != 0) { presv->ri_qs.ri_stime = newstart; presv->ri_wattr[RESV_ATR_start].at_val.at_long = newstart; presv->ri_wattr[RESV_ATR_start].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE; presv->ri_qs.ri_etime = newstart + presv->ri_qs.ri_duration; petime->at_val.at_long = presv->ri_qs.ri_etime; petime->at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE; } /* The main difference between an advance reservation and a standing * reservation is the format of the execvnodes returned by "rq_destin": * An advance reservation has a single execvnode while a standing reservation * has a sting with the particular format: * <num_resv>#<execvnode1>[<range>]<exevnode2>[... * describing the execvnodes associated to each occurrence. */ if (presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long) { /* The number of occurrences in the standing reservation and index are parsed * from the execvnode string which is of the form: * <num_occurrences>#<vnode1>[range1]<vnode2>[range2]... 
*/ resv_count = get_execvnodes_count(preq->rq_ind.rq_run.rq_destin); if (resv_count == 0) { req_reject(PBSE_INTERNAL, 0, preq); return; } execvnodes = strdup(preq->rq_ind.rq_run.rq_destin); if (execvnodes == NULL) { req_reject(PBSE_SYSTEM, 0, preq); return; } DBPRT(("stdg_resv conf: execvnodes_seq is %s\n", execvnodes)); /* execvnodes is of the form: * <num_resv>#<(execvnode1)>[<range>]<(exevnode2)>[... * this "condensed" string is unrolled into a pointer array of * execvnodes per occurrence, e.g. short_xc[0] are the execvnodes * for 1st occurrence, short_xc[1] for the 2nd etc... * If something goes wrong during unrolling then NULL is returned. * which causes the confirmation message to be rejected */ short_xc = unroll_execvnode_seq(execvnodes, &tofree); if (short_xc == NULL) { free(execvnodes); req_reject(PBSE_SYSTEM, 0, preq); return; } /* The execvnode of the soonest (i.e., next) occurrence */ next_execvnode = strdup(short_xc[0]); if (next_execvnode == NULL) { free(short_xc); free_execvnode_seq(tofree); free(execvnodes); req_reject(PBSE_SYSTEM, 0, preq); return; } /* Release the now obsolete allocations used to manipulate the * unrolled string */ free(short_xc); free_execvnode_seq(tofree); free(execvnodes); /* When confirming for the first time, set the index and count */ if (!is_degraded) { /* Add first occurrence's end date on timed task list */ if (presv->ri_wattr[RESV_ATR_start].at_val.at_long != PBS_RESV_FUTURE_SCH) { if (gen_task_EndResvWindow(presv)) { free(next_execvnode); req_reject(PBSE_SYSTEM, 0, preq); return; } } if (!is_being_altered) { presv->ri_wattr[RESV_ATR_resv_count].at_val.at_long = resv_count; presv->ri_wattr[RESV_ATR_resv_count].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE; } /* Set first occurrence to index 1 * (rather than 0 because it gets displayed in pbs_rstat -f) */ presv->ri_wattr[RESV_ATR_resv_idx].at_val.at_long = 1; presv->ri_wattr[RESV_ATR_resv_idx].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY | 
ATR_VFLAG_MODCACHE; } /* Skip setting the execvnodes sequence when reconfirming the last * occurrence or when altering a reservation. */ if (!is_being_altered) { if (presv->ri_wattr[RESV_ATR_resv_idx].at_val.at_long < presv->ri_wattr[RESV_ATR_resv_count].at_val.at_long) { /* now assign the execvnodes sequence attribute */ (void) resv_attr_def[(int)RESV_ATR_resv_execvnodes].at_free( &presv->ri_wattr[(int)RESV_ATR_resv_execvnodes]); (void) resv_attr_def[(int)RESV_ATR_resv_execvnodes].at_decode( &presv->ri_wattr[(int)RESV_ATR_resv_execvnodes], NULL, NULL, preq->rq_ind.rq_run.rq_destin); } } } else { /* Advance reservation */ next_execvnode = strdup(preq->rq_ind.rq_run.rq_destin); if (next_execvnode == NULL) { req_reject(PBSE_SYSTEM, 0, preq); return; } } /* Is reservation still a viable reservation? */ if ((rc = chk_resvReq_viable(presv)) != 0) { free(next_execvnode); req_reject(PBSE_BADTSPEC, 0, preq); return; } /* When reconfirming a degraded reservation, first free the nodes linked * to the reservation and unset all attributes relating to retry attempts */ if (is_degraded) { free_resvNodes(presv); /* Reset retry time */ unset_resv_retry(presv); /* reset vnodes_down counter to 0 */ presv->ri_vnodes_down = 0; } if (is_being_altered & RESV_END_TIME_MODIFIED) { if (gen_task_EndResvWindow(presv)) { free(next_execvnode); req_reject(PBSE_SYSTEM, 0, preq); return; } } /* * Assign the allocated resources to the reservation * and the reservation to the associated vnodes. 
*/ if (is_being_altered) free_resvNodes(presv); rc = assign_resv_resc(presv, next_execvnode); if (rc != PBSE_NONE) { free(next_execvnode); req_reject(rc, 0, preq); return; } /* place "Time4resv" task on "task_list_timed" only if this is a * confirmation but not the reconfirmation of a degraded reservation as * in this case, the reservation had already been confirmed and added to * the task list before */ if (!is_degraded && (is_being_altered != RESV_END_TIME_MODIFIED) && (rc = gen_task_Time4resv(presv)) != 0) { free(next_execvnode); req_reject(rc, 0, preq); return; } /* * compute new values for state and substate * and update the resc_resv object with these * newly computed values */ eval_resvState(presv, RESVSTATE_gen_task_Time4resv, 0, &state, &sub); (void)resv_setResvState(presv, state, sub); cmp_resvStateRelated_attrs((void *)presv, presv->ri_qs.ri_type); Update_Resvstate_if_resv(presv->ri_jbp); if (presv->ri_modified) (void)job_or_resv_save((void *)presv, SAVERESV_FULL, RESC_RESV_OBJECT); log_buffer[0] = '\0'; /* * Notify all interested parties that the reservation * is moving from state UNCONFIRMED to CONFIRMED */ if (presv->ri_brp) { presv = find_resv(presv->ri_qs.ri_resvID); if (presv->ri_wattr[(int)RESV_ATR_convert].at_val.at_str != NULL) { rc = cnvrt_qmove(presv); if (rc != 0) { snprintf(buf, sizeof(buf), "%.240s FAILED", presv->ri_qs.ri_resvID); } else { snprintf(buf, sizeof(buf), "%.240s CONFIRMED", presv->ri_qs.ri_resvID); } } else { snprintf(buf, sizeof(buf), "%.240s CONFIRMED", presv->ri_qs.ri_resvID); } rc = reply_text(presv->ri_brp, PBSE_NONE, buf); presv->ri_brp = NULL; } svr_mailownerResv(presv, MAIL_CONFIRM, MAIL_NORMAL, log_buffer); presv->ri_wattr[RESV_ATR_interactive].at_flags &= ~ATR_VFLAG_SET; if (is_being_altered) { /* * If the reservation is currently running and its start time is being * altered after the current time, It is going back to the confirmed state. 
* We need to stop the reservation queue as it would have been started at * the original start time. * This will prevent any jobs - that are submitted after the * reservation's start time is changed - from running. * The reservation went to CO from RN while being altered, that means the reservation * had resources assigned. We should decrement their usages until it starts running * again, where the resources will be accounted again. */ if (presv->ri_qs.ri_state == RESV_CONFIRMED && presv->ri_alter_state == RESV_RUNNING) { change_enableORstart(presv, Q_CHNG_START, "FALSE"); if (presv->ri_giveback) { set_resc_assigned((void *)presv, 1, DECR); presv->ri_giveback = 0; } } /* * Reset only the flags and end time backup here, as we will need * the start time backup in Time4occurrenceFinish for a standing * reservation. Reset it for an advanced reservation. */ if (!(presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long)) { presv->ri_alter_stime = 0; } presv->ri_alter_etime = 0; presv->ri_alter_flags = 0; log_event(PBSEVENT_RESV, PBS_EVENTCLASS_RESV, LOG_INFO, presv->ri_qs.ri_resvID, "Reservation alter confirmed"); } else { log_event(PBSEVENT_RESV, PBS_EVENTCLASS_RESV, LOG_INFO, presv->ri_qs.ri_resvID, "Reservation confirmed"); } if (!is_degraded) { /* 100 extra bytes for field names, times, and count */ tmp_buf_size = 100 + strlen(preq->rq_user) + strlen(preq->rq_host) + strlen(next_execvnode); if (tmp_buf_size > sizeof(buf)) { tmp_buf = malloc(tmp_buf_size); if (tmp_buf == NULL) { snprintf(log_buffer, LOG_BUF_SIZE-1, "malloc failure (errno %d)", errno); log_err(PBSE_SYSTEM, __func__, log_buffer); free(next_execvnode); reply_ack(preq); return; } } else { tmp_buf = buf; tmp_buf_size = sizeof(buf); } if (presv->ri_wattr[RESV_ATR_resv_standing].at_val.at_long) { (void)snprintf(tmp_buf, tmp_buf_size, "requestor=%s@%s start=%ld end=%ld nodes=%s count=%ld", preq->rq_user, preq->rq_host, presv->ri_qs.ri_stime, presv->ri_qs.ri_etime, next_execvnode, 
presv->ri_wattr[RESV_ATR_resv_count].at_val.at_long); } else { (void)snprintf(tmp_buf, tmp_buf_size, "requestor=%s@%s start=%ld end=%ld nodes=%s", preq->rq_user, preq->rq_host, presv->ri_qs.ri_stime, presv->ri_qs.ri_etime, next_execvnode); } account_recordResv(PBS_ACCT_CR, presv, tmp_buf); if (tmp_buf != buf) { free(tmp_buf); tmp_buf_size = 0; } } free(next_execvnode); reply_ack(preq); return; }