END_TEST START_TEST(set_statechar_test) { struct job test_job; memset(&test_job, 0, sizeof(test_job)); set_statechar(NULL); set_statechar(&test_job); }
/*
 * req_rerunjob - service a Rerun Job batch request.
 *
 * A job id of "all" (case-insensitive) is handed to handle_requeue_all().
 * Otherwise the job is looked up with chk_job_request() and its mutex is
 * placed under a scoped mutex_mgr.
 *
 *  - state >= JOB_STATE_EXITING: the job is moved to QUEUED or HELD
 *    (depending on its hold attribute) and finalized with rc == -1.
 *  - JOB_STATE_RUNNING: a signal is sent through issue_signal();
 *    SIGTERM with delay_and_send_sig_kill when a kill delay applies,
 *    SIGKILL with post_rerun otherwise.
 *  - JOB_STATE_QUEUED: the request is acknowledged and nothing else is done.
 *  - any other state: the request is rejected with PBSE_BADSTATE.
 *
 * @param preq  the rerun request; this function either replies to it itself
 *              (req_reject/reply_ack) or passes it on to
 *              finalize_rerunjob()/issue_signal().
 * @return PBSE_SYSTEM when chk_job_request() fails, PBSE_BADSTATE,
 *         PBSE_PERM, PBSE_NORERUN or PBSE_UNKQUE on the corresponding
 *         rejection, the issue_signal() result on the non-forced delayed
 *         path, otherwise whatever finalize_rerunjob() returns.
 */
int req_rerunjob(

  batch_request *preq)

  {
  int   rc = PBSE_NONE;
  job  *pjob;
  int   MgrRequired = TRUE;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (!strcasecmp(preq->rq_ind.rq_rerun, "all"))
    {
    return(handle_requeue_all(preq));
    }

  /* check if requestor is admin, job owner, etc */
  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */
    /* chk_job_request calls req_reject() */
    rc = PBSE_SYSTEM;

    return rc; /* This needs to be fixed to return an accurate error */
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* the job must be running or completed */
  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */
      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */
    /* NO-OP */
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
    {
    /* If we are already queued, then there is nothing to do. */
    rc = PBSE_NONE;
    reply_ack(preq);

    return(rc);
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */
    rc = PBSE_PERM;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  /* the job must be rerunnable */
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint?   maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */
    rc = PBSE_NORERUN;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int        delay = 0;
    pbs_queue *pque;

    /* Apply the user delay first so it takes precedence. */
    if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
      delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

      if (delay == 0)
        {
        delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                                 &server.sv_attr[SRV_ATR_KillDelay],
                                 0);
        }
      }
    else
      {
      /* why is the pque null. Something went wrong */

      /* get_jobs_queue() receives the job pointer by address, so it may
       * have cleared it; fall back to the id carried by the request
       * instead of dereferencing a NULL job. */
      const char *jobid = preq->rq_ind.rq_rerun;

      if (pjob != NULL)
        {
        jobid = pjob->ji_qs.ji_jobid;
        }
      else
        {
        /* NOTE(review): presumably the job went away while its lock was
         * released inside get_jobs_queue(); do not let the scoped manager
         * unlock a mutex that may no longer exist - confirm against
         * get_jobs_queue(). */
        pjob_mutex.set_unlock_on_exit(false);
        }

      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
        jobid);
      req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf);

      return(PBSE_UNKQUE);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    if (delay != 0)
      {
      static const char *rerun = "rerun";
      char              *extra = strdup(rerun);

      get_batch_request_id(preq);

      /* If a qrerun -f is given requeue the job regardless of the outcome
         of issue_signal */
      if ((preq->rq_extend) &&
          (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))))
        {
        batch_request *dup = new batch_request(*preq);

        get_batch_request_id(dup);

        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra,
               strdup(dup->rq_id.c_str()));

        if (rc == PBSE_NORELYMOM)
          {
          /* MOM did not answer: run the post-rerun handling locally on the
           * copy, with the job unlocked, then look the job up again. */
          dup->rq_reply.brp_code = PBSE_NORELYMOM;
          pjob_mutex.unlock();
          post_rerun(dup);

          pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);

          if (pjob == NULL)
            {
            delete dup;
            return(PBSE_NONE);
            }

          /* record in the manager that the job mutex is held again */
          pjob_mutex.set_lock_state(true);
          rc = PBSE_NONE;
          }

        delete dup;
        }
      else
        {
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra,
               strdup(preq->rq_id.c_str()));

        if (rc != PBSE_NONE)
          {
          /* cant send to MOM */
          req_reject(rc, 0, preq, NULL, NULL);
          }

        return(rc);
        }
      }
    else
      {
      static const char *rerun = "rerun";
      char              *extra = strdup(rerun);

      /* If a qrerun -f is given requeue the job regardless of the outcome
         of issue_signal */
      if (preq->rq_extend &&
          !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        {
        std::string extend = RERUNFORCE;

        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra,
               strdup(extend.c_str()));

        if (rc == PBSE_NORELYMOM)
          rc = PBSE_NONE;
        }
      else
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
      }
    }
  else
    {
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    /* -1 marks the "completed job was requeued" case for finalization */
    rc = -1;
    }

  /* finalize_rerunjob will return with pjob->ji_mutex unlocked */
  pjob_mutex.set_unlock_on_exit(false);

  return finalize_rerunjob(preq,pjob,rc);
  }
/*
 * post_signal_req - complete a Signal Job request once the reply for the
 * relayed signal is available.
 *
 * A non-zero reply code is logged and passed back through req_reject().
 * Otherwise the job named by preq->rq_extra is looked up; for SIG_SUSPEND
 * and SIG_RESUME the JOB_SVFLG_Suspend flag, the state character, the saved
 * job and the node allocation are updated, and the request is acknowledged.
 *
 * @param preq  the signal request; NULL means there is nothing to do.
 */
void post_signal_req(

  batch_request *preq)

  {
  /* request has been handled elsewhere */
  if (preq == NULL)
    return;

  /* restore client socket */
  preq->rq_conn = preq->rq_orgconn;

  if (preq->rq_reply.brp_code != 0)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_REQUEST,
      preq->rq_ind.rq_signal.rq_jid,
      pbse_to_txt(PBSE_MOMREJECT));

    errno = 0;

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);

    return;
    }

  char *job_name = preq->rq_extra;

  if (job_name == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"Cannot allocate memory! FAILURE");

    return;
    }

  job *target = svr_find_job(job_name, FALSE);

  if (target == NULL)
    {
    /* job is gone */
    char log_buf[LOCAL_LOG_BUF_SIZE];

    snprintf(log_buf,sizeof(log_buf),
      "Cannot find job '%s', assuming success",
      job_name);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }
  else
    {
    const char *signame = preq->rq_ind.rq_signal.rq_signame;

    if (strcmp(signame, SIG_SUSPEND) == 0)
      {
      /* only act when the job is not already flagged as suspended */
      if ((target->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
        {
        target->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend;

        set_statechar(target);

        job_save(target, SAVEJOB_QUICK, 0);

        /* release resources allocated to suspended job - NORWAY */
        free_nodes(target);
        }
      }
    else if (strcmp(signame, SIG_RESUME) == 0)
      {
      /* only act when the job is currently flagged as suspended */
      if ((target->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) != 0)
        {
        /* re-allocate assigned node to resumed job - NORWAY */
        set_old_nodes(target);

        target->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;

        set_statechar(target);

        job_save(target, SAVEJOB_QUICK, 0);
        }
      }

    unlock_ji_mutex(target, __func__, (char *)"5", LOGLEVEL);
    }

  free(job_name);

  reply_ack(preq);
  } /* END post_signal_req() */
int req_rerunjob( struct batch_request *preq) { int rc = PBSE_NONE; job *pjob; int Force; int MgrRequired = TRUE; char log_buf[LOCAL_LOG_BUF_SIZE]; /* check if requestor is admin, job owner, etc */ if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0) { /* FAILURE */ /* chk_job_request calls req_reject() */ rc = PBSE_SYSTEM; return rc; /* This needs to fixed to return an accurate error */ } /* the job must be running or completed */ if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING) { if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET) { /* allow end-users to rerun checkpointed jobs */ MgrRequired = FALSE; } } else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* job is running */ /* NO-OP */ } else { /* FAILURE - job is in bad state */ rc = PBSE_BADSTATE; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return rc; } if ((MgrRequired == TRUE) && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)) { /* FAILURE */ rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); return rc; } /* the job must be rerunnable */ if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0) { /* NOTE: should force override this constraint? maybe (???) 
*/ /* no, the user is saying that the job will break, and IEEE Std 1003.1 specifically says rerun is to be rejected if rerunable==FALSE -garrick */ rc = PBSE_NORERUN; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* ask MOM to kill off the job if it is running */ static const char *rerun = "rerun"; char *extra = strdup(rerun); rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra); } else { if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n) { svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE); } else { svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE); } /* reset some job attributes */ pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET; pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET; set_statechar(pjob); rc = -1; } if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))) Force = 1; else Force = 0; switch (rc) { case - 1: /* completed job was requeued */ /* clear out job completion time if there is one */ break; case 0: /* requeue request successful */ if (pjob != NULL) pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; break; case PBSE_SYSTEM: /* This may not be accurate...*/ rc = PBSE_MEM_MALLOC; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory"); req_reject(rc, 0, preq, NULL, log_buf); return rc; break; default: if (Force == 0) { rc = PBSE_MOMREJECT; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom"); req_reject(rc, 0, preq, NULL, log_buf); if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL); return rc; } else { int newstate; int newsubst; unsigned int dummy; char *tmp; long cray_enabled = FALSE; if (pjob != NULL) { get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled); if ((cray_enabled == TRUE) && (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL)) 
tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy); else tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy); /* Cannot communicate with MOM, forcibly requeue job. This is a relatively disgusting thing to do */ sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job", tmp, rc); free(tmp); log_event( PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); log_err(-1, __func__, log_buf); strcat(log_buf, ", previous output files may be lost"); svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf); svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE); rel_resc(pjob); /* free resc assigned to job */ if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0) { /* in case of server shutdown, don't clear exec_host */ /* will use it on hotstart when next comes up */ job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]); job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]); job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]); } pjob->ji_modified = 1; /* force full job save */ pjob->ji_momhandle = -1; pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn; svr_evaljobstate(pjob, &newstate, &newsubst, 0); svr_setjobstate(pjob, newstate, newsubst, FALSE); } } break; } /* END switch (rc) */ /* So job has run and is to be rerun (not restarted) */ if (pjob == NULL) { rc = PBSE_JOB_RERUN; } else { pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags & ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE | JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN; sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); reply_ack(preq); /* note in accounting file */ account_record(PBS_ACCT_RERUN, pjob, NULL); unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL); } return rc; } /* END req_rerunjob() */
int req_rerunjob( struct batch_request *preq) { int rc = PBSE_NONE; job *pjob; int MgrRequired = TRUE; char log_buf[LOCAL_LOG_BUF_SIZE]; /* check if requestor is admin, job owner, etc */ if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0) { /* FAILURE */ /* chk_job_request calls req_reject() */ rc = PBSE_SYSTEM; return rc; /* This needs to fixed to return an accurate error */ } /* the job must be running or completed */ if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING) { if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET) { /* allow end-users to rerun checkpointed jobs */ MgrRequired = FALSE; } } else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* job is running */ /* NO-OP */ } else { /* FAILURE - job is in bad state */ rc = PBSE_BADSTATE; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return rc; } if ((MgrRequired == TRUE) && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)) { /* FAILURE */ rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); return rc; } /* the job must be rerunnable */ if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0) { /* NOTE: should force override this constraint? maybe (???) 
*/ /* no, the user is saying that the job will break, and IEEE Std 1003.1 specifically says rerun is to be rejected if rerunable==FALSE -garrick */ rc = PBSE_NORERUN; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable", preq->rq_ind.rq_rerun); req_reject(rc, 0, preq, NULL, log_buf); unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* ask MOM to kill off the job if it is running */ int delay = 0; pbs_queue *pque; if ((pque = get_jobs_queue(&pjob)) != NULL) { mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true); pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 0); pthread_mutex_unlock(server.sv_attr_mutex); } else if (pjob == NULL) { rc = PBSE_NORERUN; unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } if(delay != 0) { static const char *rerun = "rerun"; char *extra = strdup(rerun); get_batch_request_id(preq); if ((rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id)))) { /* cant send to MOM */ req_reject(rc, 0, preq, NULL, NULL); } unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); return rc; } else { static const char *rerun = "rerun"; char *extra = strdup(rerun); rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL); } } else { if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n) { svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE); } else { svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE); } /* reset some job attributes */ pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET; pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET; set_statechar(pjob); rc = -1; } return finalize_rerunjob(preq,pjob,rc); }
static void post_signal_req( struct work_task *pwt) { job *pjob; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* disconnect from MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if (preq->rq_reply.brp_code) { log_event( PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, preq->rq_ind.rq_signal.rq_jid, pbse_to_txt(PBSE_MOMREJECT)); errno = 0; req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); } else { pjob = preq->rq_extra; if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) == 0) { if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend; set_statechar(pjob); job_save(pjob, SAVEJOB_QUICK, 0); /* release resources allocated to suspended job - NORWAY */ free_nodes(pjob); } } else if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) == 0) { if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) { /* re-allocate assigned node to resumed job - NORWAY */ set_old_nodes(pjob); pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend; set_statechar(pjob); job_save(pjob, SAVEJOB_QUICK, 0); } } reply_ack(preq); } return; } /* END post_signal_req() */