int req_rerunjob( batch_request *preq) { int rc = PBSE_NONE; job *pjob; int MgrRequired = TRUE; char log_buf[LOCAL_LOG_BUF_SIZE]; /* check if requestor is admin, job owner, etc */ if (!strcasecmp(preq->rq_ind.rq_rerun, "all")) { return(handle_requeue_all(preq)); } if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0) { /* FAILURE */ /* chk_job_request calls req_reject() */ rc = PBSE_SYSTEM; return rc; /* This needs to fixed to return an accurate error */ } mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true); /* the job must be running or completed */ if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING) { if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET) { /* allow end-users to rerun checkpointed jobs */ MgrRequired = FALSE; } } else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* job is running */ /* NO-OP */ } else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED) { // If we are already queued, then there is nothing to do. rc = PBSE_NONE; reply_ack(preq); return(rc); } else { /* FAILURE - job is in bad state */ rc = PBSE_BADSTATE; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); return rc; } if ((MgrRequired == TRUE) && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)) { /* FAILURE */ rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); return rc; } /* the job must be rerunnable */ if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0) { /* NOTE: should force override this constraint? maybe (???) */ /* no, the user is saying that the job will break, and IEEE Std 1003.1 specifically says rerun is to be rejected if rerunable==FALSE -garrick */ rc = PBSE_NORERUN; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); return rc; } if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* ask MOM to kill off the job if it is running */ int delay = 0; pbs_queue *pque; // Apply the user delay first so it takes precedence. if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET) delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long; if ((pque = get_jobs_queue(&pjob)) != NULL) { mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true); mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false); if (delay == 0) { delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 0); } } else { /* why is the pque null. Something went wrong */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid); req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf); return(PBSE_UNKQUE); } pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; if (delay != 0) { static const char *rerun = "rerun"; char *extra = strdup(rerun); get_batch_request_id(preq); /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/ if ((preq->rq_extend) && (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))) { std::string extend = RERUNFORCE; batch_request *dup = new batch_request(*preq); get_batch_request_id(dup); rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(dup->rq_id.c_str())); if (rc == PBSE_NORELYMOM) { dup->rq_reply.brp_code = PBSE_NORELYMOM; pjob_mutex.unlock(); post_rerun(dup); pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE); if (pjob == NULL) { delete dup; return(PBSE_NONE); } pjob_mutex.set_lock_state(true); rc = PBSE_NONE; } delete dup; } else { rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id.c_str())); if (rc != PBSE_NONE) { /* cant send to MOM */ req_reject(rc, 0, preq, NULL, NULL); } return(rc); } } else { static const char *rerun = "rerun"; char *extra = strdup(rerun); /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/ if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))) { std::string extend = RERUNFORCE; rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, strdup(extend.c_str())); if (rc == PBSE_NORELYMOM) rc = PBSE_NONE; } else rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL); } } else { if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n) { svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE); } else { svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE); } /* reset some job attributes */ pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET; pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET; set_statechar(pjob); rc = -1; } /* finalize_rerunjob will return with pjob->ji_mutex unlocked */ pjob_mutex.set_unlock_on_exit(false); return finalize_rerunjob(preq,pjob,rc); }
int req_rerunjob( struct batch_request *preq) { int rc = PBSE_NONE; job *pjob; int MgrRequired = TRUE; char log_buf[LOCAL_LOG_BUF_SIZE]; /* check if requestor is admin, job owner, etc */ if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0) { /* FAILURE */ /* chk_job_request calls req_reject() */ rc = PBSE_SYSTEM; return rc; /* This needs to fixed to return an accurate error */ } /* the job must be running or completed */ if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING) { if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET) { /* allow end-users to rerun checkpointed jobs */ MgrRequired = FALSE; } } else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* job is running */ /* NO-OP */ } else { /* FAILURE - job is in bad state */ rc = PBSE_BADSTATE; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return rc; } if ((MgrRequired == TRUE) && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)) { /* FAILURE */ rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); return rc; } /* the job must be rerunnable */ if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0) { /* NOTE: should force override this constraint? maybe (???) */ /* no, the user is saying that the job will break, and IEEE Std 1003.1 specifically says rerun is to be rejected if rerunable==FALSE -garrick */ rc = PBSE_NORERUN; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* ask MOM to kill off the job if it is running */ int delay = 0; pbs_queue *pque; if ((pque = get_jobs_queue(&pjob)) != NULL) { mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true); pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 0); pthread_mutex_unlock(server.sv_attr_mutex); } else if (pjob == NULL) { rc = PBSE_NORERUN; unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } if(delay != 0) { static const char *rerun = "rerun"; char *extra = strdup(rerun); get_batch_request_id(preq); if ((rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id)))) { /* cant send to MOM */ req_reject(rc, 0, preq, NULL, NULL); } unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } else { static const char *rerun = "rerun"; char *extra = strdup(rerun); rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL); } } else { if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n) { svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE); } else { svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE); } /* reset some job attributes */ pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET; pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET; set_statechar(pjob); rc = -1; } return finalize_rerunjob(preq,pjob,rc); }