int req_signaljob(

  void *vp) /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */

      req_reject(PBSE_PERM, 0, preq, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

      return(PBSE_NONE);
      }
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);

    return(PBSE_NONE);
    }
#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu",
      pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
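/*
 * Illustrative sketch only -- NOT TORQUE code.  The version above takes a
 * "void *vp" rather than a typed "batch_request *", presumably so the handler
 * can be passed through a generic worker/dispatch interface that only carries
 * an untyped pointer.  The names below (signal_work, make_signal_work,
 * run_signal_work) are hypothetical and exist only to show that calling
 * convention; they assume <stdlib.h> and the handler declared above.
 */

#include <stdlib.h>

typedef int (*req_handler_fn)(void *);

struct signal_work
  {
  req_handler_fn         handler;
  struct batch_request  *preq;
  };

/* package a request so a worker thread can run it later */
struct signal_work *make_signal_work(

  struct batch_request *preq)

  {
  struct signal_work *w = (struct signal_work *)calloc(1, sizeof(struct signal_work));

  if (w != NULL)
    {
    w->handler = req_signaljob;
    w->preq    = preq;
    }

  return(w);
  }

/* what a worker would do with the packaged request */
int run_signal_work(

  struct signal_work *w)

  {
  int rc = w->handler((void *)w->preq);  /* the handler casts vp back itself */

  free(w);

  return(rc);
  }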
int req_signaljob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */

      req_reject(PBSE_PERM, 0, preq, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }
#endif

  if (LOGLEVEL >= 6)
    {
    char ipstr[128];

    sprintf(log_buf, "relaying signal request to mom %s",
      netaddr_long(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, ipstr));

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);
    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(PBSE_MEM_MALLOC);
      }

    get_batch_request_id(new_preq);
    reply_ack(new_preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((dup_req = duplicate_request(preq)) == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "can not allocate memory");
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      job_mutex.unlock();
    else
      job_mutex.set_unlock_on_exit(false);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
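/*
 * Illustrative sketch only -- NOT TORQUE's real mutex_mgr.  The second
 * version above drops the explicit unlock_ji_mutex() calls in favor of an
 * RAII guard (job_mutex) that releases pjob->ji_mutex on every return path,
 * with unlock() for releasing early and set_unlock_on_exit(false) for the
 * case where relay_to_mom() leaves pjob NULL and the lock must not be
 * touched again.  A minimal guard with those three operations could look
 * roughly like this, assuming the job mutex is a plain pthread_mutex_t:
 */

#include <pthread.h>

class scoped_job_mutex
  {
  pthread_mutex_t *mtx;
  bool             unlock_on_exit;

public:

  /* mirrors mutex_mgr(pjob->ji_mutex, true): when is_locked is true the
   * caller already holds the mutex, otherwise acquire it here */
  scoped_job_mutex(pthread_mutex_t *m, bool is_locked) : mtx(m), unlock_on_exit(true)
    {
    if (!is_locked)
      pthread_mutex_lock(mtx);
    }

  /* release early, e.g. right after relay_to_mom() returns with pjob valid */
  void unlock()
    {
    if (unlock_on_exit)
      {
      pthread_mutex_unlock(mtx);
      unlock_on_exit = false;
      }
    }

  /* disarm the guard when the lock was released elsewhere
   * (the pjob == NULL case after relay_to_mom()) */
  void set_unlock_on_exit(bool val)
    {
    unlock_on_exit = val;
    }

  ~scoped_job_mutex()
    {
    if (unlock_on_exit)
      pthread_mutex_unlock(mtx);
    }
  };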