static void process_gpu_request_reply(

  struct work_task *pwt)

  {
  char                 *id = "process_gpu_request_reply";

  struct batch_request *preq;

  svr_disconnect(pwt->wt_event);  /* close connection to MOM */

  preq = (struct batch_request *)pwt->wt_parm1;
  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code != 0)
    {
    sprintf(log_buffer, "MOM failed on GPU request, rc = %d",
      preq->rq_reply.brp_code);

    log_err(errno, id, log_buffer);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer);
    }
  else
    {
    /* record that MOM changed gpu mode */

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buffer,
        "GPU control request completed for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
        preq->rq_ind.rq_gpuctrl.rq_momnode,
        preq->rq_ind.rq_gpuctrl.rq_gpuid,
        preq->rq_ind.rq_gpuctrl.rq_gpumode,
        preq->rq_ind.rq_gpuctrl.rq_reset_perm,
        preq->rq_ind.rq_gpuctrl.rq_reset_vol);

      log_ext(-1, id, log_buffer, LOG_INFO);
      }

    reply_ack(preq);
    }
  }  /* END process_gpu_request_reply() */
void req_jobcredential(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  job *pj;

  pj = locate_new_job(preq->rq_conn, NULL);

  if (pj == NULL)
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  reply_ack(preq);

  return;
  }  /* END req_jobcredential() */
/*
 * process_gpu_request_reply
 *
 * called when a gpu change request was sent to MOM and the answer
 * is received. Completes the gpu request.
 */

void process_gpu_request_reply(

  batch_request *preq)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code != 0)
    {
    sprintf(log_buf, "MOM failed on GPU request, rc = %d",
      preq->rq_reply.brp_code);

    log_err(errno, __func__, log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
    }
  else
    {
    /* record that MOM changed gpu mode */

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf,
        "GPU control request completed for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
        preq->rq_ind.rq_gpuctrl.rq_momnode,
        preq->rq_ind.rq_gpuctrl.rq_gpuid,
        preq->rq_ind.rq_gpuctrl.rq_gpumode,
        preq->rq_ind.rq_gpuctrl.rq_reset_perm,
        preq->rq_ind.rq_gpuctrl.rq_reset_vol);

      log_ext(-1, __func__, log_buf, LOG_INFO);
      }

    reply_ack(preq);
    }
  }  /* END process_gpu_request_reply() */
int get_UID(

  int                   s,
  char                 *munge_buf,
  struct batch_request *preq)

  {
  char *ptr;
  char  user_name[PBS_MAXUSER];
  int   i = 0;

  ptr = strstr(munge_buf, "UID:");

  if (!ptr)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "could not read unmunge data user");

    return(-1);
    }

  ptr = strchr(ptr, ':');
  ptr++;

  while (*ptr == SPACE)
    {
    ptr++;
    }

  memset(user_name, 0, sizeof(user_name));

  while ((*ptr != SPACE) &&
         (!isspace(*ptr)) &&
         (i < (int)sizeof(user_name)))
    {
    user_name[i++] = *ptr;
    ptr++;
    }

  strncpy(conn_credent[s].username, user_name, sizeof(conn_credent[s].username) - 1);
  conn_credent[s].username[sizeof(conn_credent[s].username) - 1] = 0;

  return(PBSE_NONE);
  }  /* END get_UID() */
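/*
 * Illustrative sketch (not part of the server source): get_UID() above scans
 * the unmunge output for the "UID:" field and copies the first
 * whitespace-delimited token that follows it into the connection credential.
 * Assuming the credential text looks roughly like the sample below (the exact
 * unmunge layout is an assumption here, not taken from this file), the parse
 * yields "jsmith".  Compiled out by default.
 */
#ifdef GET_UID_EXAMPLE_SKETCH
#include <ctype.h>
#include <stdio.h>
#include <string.h>

int main(void)
  {
  /* hypothetical unmunge-style payload */
  const char *munge_buf = "STATUS: Success (0)\nUID: jsmith (1001)\nGID: users (100)\n";
  char        user_name[64] = {0};
  const char *ptr = strstr(munge_buf, "UID:");
  int         i = 0;

  if (ptr != NULL)
    {
    ptr = strchr(ptr, ':') + 1;

    while (*ptr == ' ')  /* skip leading blanks, as get_UID() does */
      ptr++;

    /* copy the token; this sketch also stops at '\0' for safety */
    while ((*ptr != '\0') &&
           (!isspace((unsigned char)*ptr)) &&
           (i < (int)sizeof(user_name) - 1))
      user_name[i++] = *ptr++;
    }

  printf("parsed user = '%s'\n", user_name);  /* prints: parsed user = 'jsmith' */

  return 0;
  }
#endif /* GET_UID_EXAMPLE_SKETCH */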
void post_message_req(

  batch_request *preq)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  /* preq has been handled previously */

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  sprintf(log_buf, msg_messagejob, preq->rq_reply.brp_code);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_message.rq_jid, log_buf);

  if (preq->rq_reply.brp_code)
    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
  else
    reply_ack(preq);
  }  /* END post_message_req() */
void *modify_job_work(

  batch_request *vp)  /* I */

  {
  job           *pjob;
  svrattrl      *plist;
  int            checkpoint_req = FALSE;
  batch_request *preq = (struct batch_request *)vp;

  pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

  if (pjob == NULL)
    {
    req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "Job unexpectedly deleted");

    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* pbs_mom sets the extend string to trigger copying of checkpoint files */
  if (preq->rq_extend != NULL)
    {
    if (strcmp(preq->rq_extend, CHECKPOINTHOLD) == 0)
      {
      checkpoint_req = CHK_HOLD;
      }
    else if (strcmp(preq->rq_extend, CHECKPOINTCONT) == 0)
      {
      checkpoint_req = CHK_CONT;
      }
    }

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  /* modify_job will free preq and respond to it */
  modify_job((void **)&pjob, plist, preq, checkpoint_req, 0);

  return(NULL);
  }  /* END modify_job_work() */
static void process_checkpoint_reply(

  struct work_task *pwt)

  {
  job                  *pjob;

  struct batch_request *preq;

  svr_disconnect(pwt->wt_event);  /* close connection to MOM */

  preq = get_remove_batch_request((char *)pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);

  /* preq handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE)) == NULL)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_manager.rq_objname,
      msg_postmomnojob);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else
    {
    /* record that MOM has a checkpoint file */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed");  /* note in accounting file */

    reply_ack(preq);

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }
  }  /* END process_checkpoint_reply() */
void req_shutdown(

  struct batch_request *preq)

  {
  int type;

  extern int shutdown_who;

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq);

    return;
    }

  (void)sprintf(log_buffer, msg_shutdown_op, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
            PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer);

  pshutdown_request = preq;  /* save for reply from main() when done */

  type = preq->rq_ind.rq_shutdown;
  shutdown_who = type & SHUT_WHO_MASK;

  if (shutdown_who & SHUT_WHO_SECDONLY)
    (void)failover_send_shutdown(FAILOVER_SecdShutdown);

  if (shutdown_who & SHUT_WHO_SCHED)
    (void)contact_sched(SCH_QUIT, NULL);  /* tell scheduler to quit */

  if (shutdown_who & SHUT_WHO_SECDONLY)
    {
    reply_ack(preq);

    return;  /* do NOT shutdown this Server */
    }

  /* Moms are told to shutdown in pbsd_main.c after main loop */

  svr_shutdown(type);

  return;
  }
void req_stat_sched(

  struct batch_request *preq)

  {
  svrattrl           *pal;

  struct batch_reply *preply;
  int                 rc = 0;
  int                 bad = 0;
  pbs_sched          *psched;

  /* allocate a reply structure and a status sub-structure */

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;
  CLEAR_HEAD(preply->brp_un.brp_status);

  for (psched = (pbs_sched *)GET_NEXT(svr_allscheds);
       psched != NULL;
       psched = (pbs_sched *)GET_NEXT(psched->sc_link))
    {
    rc = status_sched(psched, preq, &preply->brp_un.brp_status);

    if (rc != 0)
      break;
    }

  if (!rc)
    {
    (void)reply_send(preq);
    }
  else
    {
    if (rc != PBSE_NOATTR)
      req_reject(rc, 0, preq);
    else
      {
      pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

      reply_badattr(rc, bad, pal, preq);
      }
    }
  }
int handle_single_delete(

  batch_request *preq,
  batch_request *preq_tmp,
  char          *Msg)

  {
  char *jobid = preq->rq_ind.rq_delete.rq_objname;
  job  *pjob  = svr_find_job(jobid, FALSE);

  if (pjob == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, jobid, pbse_to_txt(PBSE_UNKJOBID));

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");
    }
  else
    {
    std::string jobID = pjob->ji_qs.ji_jobid;

    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    removeBeforeAnyDependencies(jobID.c_str());

    /* send the asynchronous reply if needed */
    if (preq_tmp != NULL)
      {
      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE;  /* set for no more replies */

      enqueue_threadpool_request(single_delete_work, preq, async_pool);
      }
    else
      single_delete_work(preq);
    }

  return(PBSE_NONE);
  }  /* END handle_single_delete() */
void req_connect(

  struct batch_request *preq)

  {
  int            sock = preq->rq_conn;
  unsigned short conn_authen;

  /* Called from one location inside a lock */
  pthread_mutex_lock(svr_conn[sock].cn_mutex);
  conn_authen = svr_conn[sock].cn_authen;
  pthread_mutex_unlock(svr_conn[sock].cn_mutex);

  if (conn_authen == 0)
    {
    reply_ack(preq);
    }
  else
    {
    req_reject(PBSE_BADCRED, 0, preq, NULL, "Connection not authorized");
    }

  return;
  }  /* END req_connect() */
int handle_single_delete(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  int   rc    = -1;
  char *jobid = preq->rq_ind.rq_delete.rq_objname;
  job  *pjob  = svr_find_job(jobid, FALSE);

  if (pjob == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, jobid, pbse_to_txt(PBSE_UNKJOBID));

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");
    }
  else
    {
    if (preq_tmp != NULL)
      {
      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE;  /* set for no more replies */
      }

    /* mutex is freed below */
    if ((rc = forced_jobpurge(pjob, preq)) == PBSE_NONE)
      rc = execute_job_delete(pjob, Msg, preq);
    }

  if ((rc == PBSE_NONE) ||
      (rc == PURGE_SUCCESS))
    reply_ack(preq);

  return(PBSE_NONE);
  }  /* END handle_single_delete() */
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;

  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator     *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl   *tpal;
    tlist_head  dalist;
    int         aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");

      return;
      }
    }

    {
    all_jobs *ajptr = NULL;

    if (type == tjstQueue)
      ajptr = cntl->sc_pque->qu_jobs;
    else if (type == tjstSummarizeArraysQueue)
      ajptr = cntl->sc_pque->qu_jobs_array_sum;
    else if (type == tjstSummarizeArraysServer)
      ajptr = &array_summary;
    else
      ajptr = &alljobs;

    ajptr->lock();
    iter = ajptr->get_iterator();
    ajptr->unlock();
    }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) ||
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs, iter);
  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary, iter);
  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;

    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs, iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues, iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);

        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }  /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }  /* END for (pque) */

    reply_send_svr(preq);

    delete iter;

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);

        pque = get_jobs_queue(&pjob);

        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;

        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs, iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary, iter);
    else if (type == tjstArray)
      {
      pjob = NULL;

      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs, iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
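/*
 * Illustrative sketch (not part of the server source): the rq_extend string
 * consumed above carries optional status modifiers of the documented form
 * { EXECQONLY | DELTA:<EPOCHTIME> }.  A minimal stand-alone parse of the
 * DELTA: field, mirroring the strstr()/strtol() logic in
 * req_stat_job_step2(), is shown below.  Compiled out by default.
 */
#ifdef STAT_EXTEND_EXAMPLE_SKETCH
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
  {
  const char *rq_extend = "DELTA:1700000000";  /* hypothetical extension value */
  long        DTime = 0;
  const char *ptr = strstr(rq_extend, "DELTA:");

  if (ptr != NULL)
    DTime = strtol(ptr + strlen("DELTA:"), NULL, 10);

  /* jobs whose mtime is >= DTime get the full attribute list;
     older jobs only get the abbreviated 'delta' list built above */
  printf("DTime = %ld\n", DTime);

  return 0;
  }
#endif /* STAT_EXTEND_EXAMPLE_SKETCH */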
int execute_job_delete(

  job                  *pjob,  /* M */
  char                 *Msg,   /* I */
  struct batch_request *preq)  /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob, preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3)
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it going */
    /* retry in one second */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3, the job is being requeued.
       Wait until finished */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }  /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */
      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq, FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE: should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and
     * job deletion has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * set up a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_unlock(pa->ai_mutex);
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit,
        strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  struct batch_request *preq_dup = duplicate_request(preq);
  job                  *pjob;
  int                   iter = -1;
  int                   failed_deletes = 0;
  int                   total_jobs = 0;
  int                   rc = PBSE_NONE;
  char                  tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;

  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE;  /* set for no more replies */
    }

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);

      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE_SUCCESS means this was qdel -p all. In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine, sizeof(tmpLine), "Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* preq_dup is re-created at the end of each loop iteration, so free the
   * extra one if it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  }  /* END handle_delete_all() */
int finalize_rerunjob(

  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int  Force;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (preq->rq_extend &&
      !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM:  /* This may not be accurate... */

      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);

      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);

        return rc;
        }
      else
        {
        int          newstate;
        int          newsubst;
        unsigned int dummy;
        char        *tmp;

        if ((cray_enabled == true) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
          tmp, rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        strcat(log_buf, ", previous output files may be lost");

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob);  /* free resc assigned to job */

        pjob->ji_modified  = 1;   /* force full job save */
        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  pjob->ji_qs.ji_svrflags =
    (pjob->ji_qs.ji_svrflags &
     ~(JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE | JOB_SVFLG_CHECKPOINT_COPIED)) |
    JOB_SVFLG_HASRUN;

  sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  reply_ack(preq);

  /* note in accounting file */
  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END finalize_rerunjob() */
void purge_completed_jobs(

  struct batch_request *preq)  /* I */

  {
  job    *pjob;
  char   *time_str;
  time_t  purge_time = 0;
  int     iter;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);

  purge_time = strtol(time_str, NULL, 10);

  /*
   * Clean unreported capability is only for operators and managers.
   * Check if request is authorized
   */

  if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                        ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL,
      "must have operator or manager privilege to use -c parameter");

    return;
    }

  reply_ack(preq);

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf, "Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time,
      preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  iter = -1;

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        sprintf(log_buf, "Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

      job_save(pjob, SAVEJOB_FULL, 0);
      }

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  return;
  }  /* END purge_completed_jobs() */
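/*
 * Illustrative sketch (not part of the server source): purge_completed_jobs()
 * expects rq_extend to begin with the PURGECOMP literal immediately followed
 * by an epoch timestamp, and reads the time by skipping strlen(PURGECOMP)
 * bytes before calling strtol().  The actual PURGECOMP spelling is defined
 * elsewhere in the tree; the literal below is a hypothetical stand-in used
 * only to show the parse.  Compiled out by default.
 */
#ifdef PURGECOMP_EXAMPLE_SKETCH
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define EXAMPLE_PURGECOMP "purgecomp:"  /* hypothetical stand-in literal */

int main(void)
  {
  const char *rq_extend = EXAMPLE_PURGECOMP "1700000000";

  /* skip the fixed prefix, then read the epoch time */
  time_t purge_time = (time_t)strtol(rq_extend + strlen(EXAMPLE_PURGECOMP), NULL, 10);

  printf("jobs completed before %ld would be marked reported\n", (long)purge_time);

  return 0;
  }
#endif /* PURGECOMP_EXAMPLE_SKETCH */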
void delay_and_send_sig_kill(

  batch_request *preq_sig)

  {
  int            delay = 0;
  job           *pjob;
  pbs_queue     *pque;

  batch_request *preq_clt = NULL;  /* original client request */
  int            rc;
  time_t         time_now = time(NULL);
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* remove the resources assigned to the job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();

      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
      }
    }
  else
    {
    /* pque should never be NULL here; something went wrong */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
      pjob->ji_qs.ji_jobid);

    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);

    return;
    }

  pjob_mutex.unlock();

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill,
    strdup(pjob->ji_qs.ji_jobid), FALSE);
  }  // END delay_and_send_sig_kill()
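/*
 * Illustrative sketch (not part of the server source): the delay resolution
 * above prefers the job's own user_kill_delay attribute, then falls back
 * through attr_ifelse_long(), which -- as used here -- yields the queue
 * attribute when set, otherwise the server attribute, otherwise the supplied
 * default.  A minimal stand-alone model of that precedence, under those
 * assumptions (pick_delay is hypothetical; negative means "not set").
 * Compiled out by default.
 */
#ifdef KILL_DELAY_EXAMPLE_SKETCH
#include <stdio.h>

static long pick_delay(long job_delay, long queue_delay, long server_delay, long deflt)
  {
  if (job_delay >= 0)
    return job_delay;     /* user-supplied delay wins */

  if (queue_delay >= 0)
    return queue_delay;   /* then the queue's kill_delay */

  if (server_delay >= 0)
    return server_delay;  /* then the server's kill_delay */

  return deflt;           /* otherwise the built-in default */
  }

int main(void)
  {
  /* queue kill_delay of 30s beats the server's 120s; prints 30 */
  printf("%ld\n", pick_delay(-1, 30, 120, 0));

  return 0;
  }
#endif /* KILL_DELAY_EXAMPLE_SKETCH */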
int req_rerunjob(

  batch_request *preq)

  {
  int   rc = PBSE_NONE;
  job  *pjob;
  int   MgrRequired = TRUE;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if (!strcasecmp(preq->rq_ind.rq_rerun, "all"))
    {
    return(handle_requeue_all(preq));
    }

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */
    rc = PBSE_SYSTEM;

    return rc;  /* This needs to be fixed to return an accurate error */
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */
      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */
    /* NO-OP */
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
    {
    // If we are already queued, then there is nothing to do.
    rc = PBSE_NONE;
    reply_ack(preq);

    return(rc);
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE: should force override this constraint? maybe (???) */
    /*   no, the user is saying that the job will break, and
         IEEE Std 1003.1 specifically says rerun is to be rejected
         if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);

    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int        delay = 0;
    pbs_queue *pque;

    // Apply the user delay first so it takes precedence.
    if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
      delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

      if (delay == 0)
        {
        delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                                 &server.sv_attr[SRV_ATR_KillDelay],
                                 0);
        }
      }
    else
      {
      /* pque should never be NULL here; something went wrong */
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
        pjob->ji_qs.ji_jobid);
      req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf);

      return(PBSE_UNKQUE);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    if (delay != 0)
      {
      static const char *rerun = "rerun";
      char              *extra = strdup(rerun);

      get_batch_request_id(preq);

      /* If a qrerun -f is given, requeue the job regardless of the outcome
         of issue_signal() */
      if ((preq->rq_extend) &&
          (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))))
        {
        std::string    extend = RERUNFORCE;
        batch_request *dup = new batch_request(*preq);

        get_batch_request_id(dup);

        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra,
               strdup(dup->rq_id.c_str()));

        if (rc == PBSE_NORELYMOM)
          {
          dup->rq_reply.brp_code = PBSE_NORELYMOM;
          pjob_mutex.unlock();
          post_rerun(dup);

          pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);

          if (pjob == NULL)
            {
            delete dup;

            return(PBSE_NONE);
            }

          pjob_mutex.set_lock_state(true);
          rc = PBSE_NONE;
          }

        delete dup;
        }
      else
        {
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra,
               strdup(preq->rq_id.c_str()));

        if (rc != PBSE_NONE)
          {
          /* can't send to MOM */
          req_reject(rc, 0, preq, NULL, NULL);
          }

        return(rc);
        }
      }
    else
      {
      static const char *rerun = "rerun";
      char              *extra = strdup(rerun);

      /* If a qrerun -f is given, requeue the job regardless of the outcome
         of issue_signal() */
      if (preq->rq_extend &&
          !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        {
        std::string extend = RERUNFORCE;

        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, strdup(extend.c_str()));

        if (rc == PBSE_NORELYMOM)
          rc = PBSE_NONE;
        }
      else
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
      }
    }
  else
    {
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */

    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  /* finalize_rerunjob will return with pjob->ji_mutex unlocked */
  pjob_mutex.set_unlock_on_exit(false);

  return finalize_rerunjob(preq, pjob, rc);
  }
int req_movejob(

  batch_request *req)  /* I */

  {
  job  *jobp;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   local_errno = 0;

  jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req);

  if (jobp == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(jobp->ji_mutex, true);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", jobp->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  if ((jobp->ji_qs.ji_state != JOB_STATE_QUEUED) &&
      (jobp->ji_qs.ji_state != JOB_STATE_HELD) &&
      (jobp->ji_qs.ji_state != JOB_STATE_WAITING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d %s",
      pbse_to_txt(PBSE_BADSTATE),
      jobp->ji_qs.ji_state,
      __func__);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }

  /*
   * svr_movejob() does the real work, handles both local and
   * network moves
   */

  /* We have found that sometimes the destination queue and the parent queue
     are the same. If so we do not need to do anything else */
  if (strcmp(jobp->ji_qs.ji_queue, req->rq_ind.rq_move.rq_destin) == 0)
    {
    sprintf(log_buf, "Job %s already in queue %s",
      jobp->ji_qs.ji_jobid,
      jobp->ji_qs.ji_queue);

    if (LOGLEVEL >= 7)
      {
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    req_reject(PBSE_JOB_ALREADY_IN_QUEUE, 0, req, NULL, log_buf);

    return(PBSE_NONE);
    }

  switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, &local_errno, req))
    {

    case 0:

      /* success */

      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf),
        msg_manager,
        req->rq_ind.rq_move.rq_destin,
        req->rq_user,
        req->rq_host);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buf);

      reply_ack(req);

      break;

    case -1:
    case 1:

      /* fail */

      /* NOTE: can pass detailed response to requestor (NYI) */
      req_reject(local_errno, 0, req, NULL, NULL);

      break;

    case 2:

      /* deferred, will be handled by post_movejob() when the child completes */

      /* NO-OP */

      break;
    }  /* END switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, &local_errno, req)) */

  return(PBSE_NONE);
  }  /* END req_movejob() */
int req_orderjob(

  struct batch_request *vp)  /* I */

  {
  job                  *pjob;
  job                  *pjob1;
  job                  *pjob2;
  int                   rank;
  int                   rc = 0;
  char                  tmpqn[PBS_MAXQUEUENAME + 1];

  struct batch_request *req = (struct batch_request *)vp;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue            *pque1;
  pbs_queue            *pque2;

  if ((pjob1 = chk_job_request(req->rq_ind.rq_move.rq_jid, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job1_mutex(pjob1->ji_mutex, true);

  if ((pjob2 = chk_job_request(req->rq_ind.rq_move.rq_destin, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job2_mutex(pjob2->ji_mutex, true);

  if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d",
      pbse_to_txt(PBSE_BADSTATE),
      pjob->ji_qs.ji_state);

    strcat(log_buf, __func__);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }
  else if ((pjob1->ji_qhdr == NULL) ||
           (pjob2->ji_qhdr == NULL))
    {
    req_reject(PBSE_BADSTATE, 0, req, NULL, "One of the jobs does not have a queue");

    return(PBSE_NONE);
    }
  else if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* jobs are in different queues */
    int ok = FALSE;

    if ((pque2 = get_jobs_queue(&pjob2)) == NULL)
      {
      rc = PBSE_BADSTATE;
      job2_mutex.set_unlock_on_exit(false);
      }
    else
      {
      mutex_mgr pque2_mutex = mutex_mgr(pque2->qu_mutex, true);

      if ((rc = svr_chkque(pjob1, pque2, get_variable(pjob1, pbs_o_host),
                 MOVE_TYPE_Order, NULL)) == PBSE_NONE)
        {
        pque2_mutex.unlock();

        if ((pque1 = get_jobs_queue(&pjob1)) == NULL)
          {
          rc = PBSE_BADSTATE;
          job1_mutex.set_unlock_on_exit(false);
          }
        else if (pjob1 != NULL)
          {
          mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);

          if ((rc = svr_chkque(pjob2, pque1, get_variable(pjob2, pbs_o_host),
                     MOVE_TYPE_Order, NULL)) == PBSE_NONE)
            {
            ok = TRUE;
            }
          }
        }
      }

    if (ok == FALSE)
      {
      req_reject(rc, 0, req, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* now swap the order of the two jobs in the queue lists */

  rank = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long =
    pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long = rank;

  if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    strcpy(tmpqn, pjob1->ji_qs.ji_queue);
    strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
    strcpy(pjob2->ji_qs.ji_queue, tmpqn);

    svr_dequejob(pjob1, FALSE);
    svr_dequejob(pjob2, FALSE);

    if (svr_enquejob(pjob1, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob1 = NULL;
      job1_mutex.set_unlock_on_exit(false);
      }

    if (svr_enquejob(pjob2, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob2 = NULL;
      job2_mutex.set_unlock_on_exit(false);
      }
    }
  else
    {
    if ((pque1 = get_jobs_queue(&pjob1)) != NULL)
      {
      mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);

      swap_jobs(pque1->qu_jobs, pjob1, pjob2);
      swap_jobs(NULL, pjob1, pjob2);
      }
    }

  /* need to update disk copy of both jobs to save new order */

  if (pjob1 != NULL)
    {
    job_save(pjob1, SAVEJOB_FULL, 0);
    }

  if (pjob2 != NULL)
    {
    job_save(pjob2, SAVEJOB_FULL, 0);
    }

  /* SUCCESS */

  reply_ack(req);

  return(PBSE_NONE);
  }  /* END req_orderjob() */
void req_deletearray(

  struct batch_request *preq)

  {
  job_array        *pa;
  char             *range;

  struct work_task *ptask;
  int               num_skipped;
  char              owner[PBS_MAXUSER + 1];

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    reply_ack(preq);

    return;
    }

  /* check authorization */
  get_jobowner(pa->ai_qs.owner, owner);

  if (svr_authorize_req(preq, owner, pa->ai_qs.submit_host) == -1)
    {
    sprintf(log_buffer, msg_permlog,
      preq->rq_type,
      "Array",
      preq->rq_ind.rq_delete.rq_objname,
      preq->rq_user,
      preq->rq_host);

    log_event(
      PBSEVENT_SECURITY,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_delete.rq_objname,
      log_buffer);

    req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted");

    return;
    }

  /* get the range of jobs to iterate over */
  range = preq->rq_extend;

  if ((range != NULL) &&
      (strstr(range, ARRAY_RANGE) != NULL))
    {
    /* parse the array range */
    num_skipped = delete_array_range(pa, range);

    if (num_skipped < 0)
      {
      /* ERROR */
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "Error in specified array range");

      return;
      }
    }
  else
    {
    num_skipped = delete_whole_array(pa);
    }

  /* check if the array is gone */
  if ((pa = get_array(preq->rq_ind.rq_delete.rq_objname)) != NULL)
    {
    /* some jobs were not deleted. They must have been running or
       had JOB_SUBSTATE_TRANSIT */
    if (num_skipped != 0)
      {
      ptask = set_task(WORK_Timed, time_now + 2, array_delete_wt, preq);

      if (ptask != NULL)
        {
        return;
        }
      }
    }

  /* now that the whole array is deleted, we should mail the user if necessary */

  reply_ack(preq);

  return;
  }
static void post_movejob(

  struct work_task *pwt)

  {
  char                 *id = "post_movejob";

  struct batch_request *req;
  int                   newstate;
  int                   newsub;
  int                   stat;
  int                   r;
  job                  *jobp;

  req  = (struct batch_request *)pwt->wt_parm2;
  stat = pwt->wt_aux;

  pbs_errno = PBSE_NONE;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buffer, "bad request type %d\n", req->rq_type);

    log_err(-1, id, log_buffer);

    return;
    }

  jobp = find_job(req->rq_ind.rq_move.rq_jid);

  if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm1))
    {
    sprintf(log_buffer, "job %s not found\n", req->rq_ind.rq_move.rq_jid);

    log_err(-1, id, log_buffer);
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);

    if (r == 0)
      {
      /* purge server's job structure */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
        remove_checkpoint(jobp);

      strcpy(log_buffer, msg_movejob);

      sprintf(log_buffer + strlen(log_buffer), msg_manager,
        req->rq_ind.rq_move.rq_destin,
        req->rq_user,
        req->rq_host);

      job_purge(jobp);
      }
    else
      {
      r = PBSE_ROUTEREJ;
      }
    }
  else
    {
    r = PBSE_SYSTEM;

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  if (r)
    {
    if (jobp != NULL)
      {
      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);
      }

    req_reject(r, 0, req, NULL, NULL);
    }
  else
    {
    reply_ack(req);
    }

  return;
  }  /* END post_movejob() */
void req_relnodesjob(

  struct batch_request *preq)

  {
  int   jt;  /* job type */
  job  *pjob;
  int   rc;
  char *jid;
  int   i, offset;
  char *nodeslist = NULL;
  char  msg[LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  jid = preq->rq_ind.rq_relnodes.rq_jid;

  if (jid == NULL)
    return;

  /*
   * Returns job pointer for singleton job or "parent" of
   * an array job.
   */
  pjob = chk_job_request(jid, preq, &jt);

  if (pjob == NULL)
    {
    return;
    }

  if (jt == IS_ARRAY_NO)
    {
    /* a regular job is okay */

    /* the job must be running */
    if ((pjob->ji_qs.ji_state != JOB_STATE_RUNNING) ||
        (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING))
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }
    }
  else if (jt == IS_ARRAY_Single)
    {
    /* a single subjob is okay */

    offset = subjob_index_to_offset(pjob, get_index_from_jid(jid));

    if (offset == -1)
      {
      req_reject(PBSE_UNKJOBID, 0, preq);

      return;
      }

    i = get_subjob_state(pjob, offset);

    if (i == -1)
      {
      req_reject(PBSE_IVALREQ, 0, preq);

      return;
      }

    if (i != JOB_STATE_RUNNING)
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }

    if ((pjob = pjob->ji_ajtrk->tkm_tbl[offset].trk_psubjob) == NULL)
      {
      req_reject(PBSE_UNKJOBID, 0, preq);

      return;
      }

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING)
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }
    }
  else
    {
    reply_text(preq, PBSE_NOSUP, "not supported for Array Jobs or multiple sub-jobs");

    return;
    }

  nodeslist = preq->rq_ind.rq_relnodes.rq_node_list;

  if ((nodeslist != NULL) && (nodeslist[0] == '\0'))
    {
    nodeslist = NULL;
    }

  rc = free_sister_vnodes(pjob, nodeslist, msg, LOG_BUF_SIZE, preq);

  if (rc != 0)
    {
    reply_text(preq, PBSE_SYSTEM, msg);
    }
  }
int req_stat_node(

  struct batch_request *preq)

  {
  char               *name;
  int                 rc = PBSE_NONE;
  int                 type = 0;
  int                 bad = 0;

  struct pbsnode     *pnode = NULL;

  struct batch_reply *preply;

  struct prop         props;
  svrattrl           *pal;

  /*
   * first, check that the server indeed has a list of nodes
   * and if it does, validate the name of the requested object--
   * either name is that of a specific node, or name[0] is null/@
   * meaning request is for all nodes in the server's jurisdiction
   */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_SCHED,
      PBS_EVENTCLASS_REQUEST,
      __func__,
      "entered");
    }

  if (svr_totnodes <= 0)
    {
    rc = PBSE_NONODES;
    req_reject(rc, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file");

    return rc;
    }

  name = preq->rq_ind.rq_status.rq_id;

  if ((*name == '\0') || (*name == '@'))
    {
    type = 1;
    }
  else if ((*name == ':') && (*(name + 1) != '\0'))
    {
    if (!strcmp(name + 1, "ALL"))
      {
      type = 1;  /* pseudo-group for all nodes */
      }
    else
      {
      type = 2;
      props.name = name + 1;
      props.mark = 1;
      props.next = NULL;
      }
    }

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;
  CLEAR_HEAD(preply->brp_un.brp_status);

  if (type == 0)
    {
    /* get status of the named node */
    pnode = find_nodebyname(name);

    if (pnode == NULL)
      {
      rc = PBSE_UNKNODE;
      req_reject(rc, 0, preq, NULL, "cannot locate specified node");

      return(rc);
      }

    /* get the status on all of the numa nodes */
    if (pnode->nd_is_alps_reporter == TRUE)
      rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
    else
      rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);

    unlock_node(pnode, __func__, "type == 0", LOGLEVEL);
    }
  else
    {
    /* get status of all or several nodes */
    all_nodes_iterator *iter = NULL;

    while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL)
      {
      if ((type == 2) && (!hasprop(pnode, &props)))
        {
        unlock_node(pnode, __func__, "type != 0, next_host", LOGLEVEL);

        continue;
        }

      /* get the status on all of the numa nodes */
      if (pnode->nd_is_alps_reporter == TRUE)
        rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
      else
        rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);

      if (rc != PBSE_NONE)
        {
        unlock_node(pnode, __func__, "type != 0, rc != 0, get_numa_statuses", LOGLEVEL);

        break;
        }

      unlock_node(pnode, __func__, "type != 0, rc == 0, get_numa_statuses", LOGLEVEL);
      }

    if (iter != NULL)
      delete iter;
    }

  if (rc == PBSE_NONE)
    {
    /* SUCCESS */
    reply_send_svr(preq);
    }
  else
    {
    if (rc != PBSE_UNKNODEATR)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

      reply_badattr(rc, bad, pal, preq);
      }
    }

  return(rc);
  }  /* END req_stat_node() */
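/*
 * Illustrative sketch (not part of the server source): req_stat_node()
 * classifies rq_id as above -- "" or "@..." selects every node, ":ALL"
 * likewise, ":<prop>" selects nodes carrying a property, and anything else
 * names a single node.  A stand-alone model of that dispatch (the helper
 * name is hypothetical).  Compiled out by default.
 */
#ifdef NODE_NAME_DISPATCH_EXAMPLE_SKETCH
#include <stdio.h>
#include <string.h>

/* returns 1 = all nodes, 2 = by property, 0 = single named node */
static int classify_node_request(const char *name)
  {
  if ((*name == '\0') || (*name == '@'))
    return 1;

  if ((*name == ':') && (*(name + 1) != '\0'))
    return strcmp(name + 1, "ALL") ? 2 : 1;

  return 0;
  }

int main(void)
  {
  printf("%d %d %d %d\n",
    classify_node_request(""),          /* 1: all nodes */
    classify_node_request(":ALL"),      /* 1: all nodes */
    classify_node_request(":bigmem"),   /* 2: nodes with property 'bigmem' */
    classify_node_request("node042"));  /* 0: one node */

  return 0;
  }
#endif /* NODE_NAME_DISPATCH_EXAMPLE_SKETCH */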
int req_deletejob(

  struct batch_request *preq)  /* I */

  {
  char                 *Msg = NULL;

  struct batch_request *preq_tmp = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if we are getting a purgecomplete from scheduler */
  if (preq->rq_extend != NULL)
    {
    if (!strncmp(preq->rq_extend, PURGECOMP, strlen(PURGECOMP)))
      {
      /* purge_completed_jobs will respond with either an ack or reject */
      purge_completed_jobs(preq);

      return(PBSE_NONE);
      }
    else if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
             strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
             strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */
      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
       */
      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
          "must have operator or manager privilege to use -m parameter");

        return(PBSE_NONE);
        }
      }
    /* check if we are getting an asynchronous delete */
    else if (!strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)))
      {
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */
      snprintf(log_buf, sizeof(log_buf), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_delete.rq_objname, log_buf);

      preq_tmp = duplicate_request(preq);
      }
    }

  if (strcasecmp(preq->rq_ind.rq_delete.rq_objname, "all") == 0)
    {
    handle_delete_all(preq, preq_tmp, Msg);
    }
  else
    {
    handle_single_delete(preq, preq_tmp, Msg);
    }

  return(PBSE_NONE);
  }  /* END req_deletejob() */
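/*
 * Illustrative sketch (not part of the server source): req_deletejob() above
 * dispatches on the leading token of rq_extend.  The real literals
 * (PURGECOMP, deldelaystr, delasyncstr, delpurgestr) are defined elsewhere in
 * the tree; the stand-ins below are hypothetical and only show the shape of
 * the strncmp() cascade.  Compiled out by default.
 */
#ifdef DELETE_DISPATCH_EXAMPLE_SKETCH
#include <stdio.h>
#include <string.h>

int main(void)
  {
  const char *purgecomp   = "purgecomp:";  /* hypothetical stand-in literals */
  const char *delasyncstr = "delasync";
  const char *deldelaystr = "deldelay=";
  const char *rq_extend   = "delasync";

  if (!strncmp(rq_extend, purgecomp, strlen(purgecomp)))
    puts("scheduler purge of completed jobs");
  else if (!strncmp(rq_extend, delasyncstr, strlen(delasyncstr)))
    puts("asynchronous delete: ack now, work later");
  else if (!strncmp(rq_extend, deldelaystr, strlen(deldelaystr)))
    puts("delete with an explicit kill delay");
  else
    puts("free-form operator/manager message");

  return 0;
  }
#endif /* DELETE_DISPATCH_EXAMPLE_SKETCH */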
int req_stat_job(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  struct stat_cntl     *cntl;  /* see svrfunc.h */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  enum TJobStatTypeEnum type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* FORMAT: name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */
      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }
    }  /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */
    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    {
    if (pque != NULL)
      unlock_queue(pque, "req_stat_job", (char *)"no memory cntl", LOGLEVEL);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  cntl->sc_type     = (int)type;
  cntl->sc_conn     = -1;
  cntl->sc_pque     = pque;
  cntl->sc_origrq   = preq;
  cntl->sc_post     = req_stat_job_step2;
  cntl->sc_jobid[0] = '\0';  /* cause "start from beginning" */

  req_stat_job_step2(cntl);  /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL);

  free(cntl);

  return(PBSE_NONE);
  }  /* END req_stat_job() */
int req_stat_svr(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  svrattrl           *pal;

  struct batch_reply *preply;

  struct brp_status  *pstat;
  int                 bad = 0;
  char                nc_buf[128];
  int                 numjobs;
  int                 netrates[3];

  memset(netrates, 0, sizeof(netrates));

  /* update count and state counts from sv_numjobs and sv_jobstates */

  lock_sv_qs_mutex(server.sv_qs_mutex, __func__);
  numjobs = server.sv_qs.sv_numjobs;
  unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);

  pthread_mutex_lock(server.sv_attr_mutex);

  server.sv_attr[SRV_ATR_TotalJobs].at_val.at_long = numjobs;
  server.sv_attr[SRV_ATR_TotalJobs].at_flags |= ATR_VFLAG_SET;

  pthread_mutex_lock(server.sv_jobstates_mutex);

  update_state_ct(
    &server.sv_attr[SRV_ATR_JobsByState],
    server.sv_jobstates,
    server.sv_jobstbuf);

  pthread_mutex_unlock(server.sv_jobstates_mutex);

  netcounter_get(netrates);
  snprintf(nc_buf, 127, "%d %d %d", netrates[0], netrates[1], netrates[2]);

  if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL)
    free(server.sv_attr[SRV_ATR_NetCounter].at_val.at_str);

  server.sv_attr[SRV_ATR_NetCounter].at_val.at_str = strdup(nc_buf);

  if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL)
    server.sv_attr[SRV_ATR_NetCounter].at_flags |= ATR_VFLAG_SET;

  pthread_mutex_unlock(server.sv_attr_mutex);

  /* allocate a reply structure and a status sub-structure */

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;
  CLEAR_HEAD(preply->brp_un.brp_status);

  pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (pstat == NULL)
    {
    /* sv_attr_mutex was already released above */
    reply_free(preply);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  strcpy(pstat->brp_objname, server_name);

  pstat->brp_objtype = MGR_OBJ_SERVER;

  CLEAR_HEAD(pstat->brp_attr);

  append_link(&preply->brp_un.brp_status, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

  if (status_attrib(
        pal,
        svr_attr_def,
        server.sv_attr,
        SRV_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        &bad,
        1))  /* IsOwner == TRUE */
    {
    reply_badattr(PBSE_NOATTR, bad, pal, preq);
    }
  else
    {
    reply_send_svr(preq);
    }

  return(PBSE_NONE);
  }  /* END req_stat_svr() */
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;
  pbs_queue            *pque;
  char                 *preq_clt_id;

  struct batch_request *preq_sig;        /* signal request to MOM */

  struct batch_request *preq_clt = NULL; /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* remove the resources assigned to the job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* don't need it, reply now */

  /*
   * if no delay is specified in the original request, see if the kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob != NULL)
      return;
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2,
    strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
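/*
 * Illustrative sketch (not part of the server source): deletion of a running
 * job is two-phase.  post_delete_mom1() runs when MOM acks the SIGTERM,
 * picks a kill delay (request override, else queue/server kill_delay, else
 * the default of 2 seconds passed to attr_ifelse_long() above), schedules
 * post_delete_mom2 -- the SIGKILL phase -- that many seconds out, and
 * re-arms the delete nanny a minute after that.  A toy timeline under those
 * assumptions, compiled out by default.
 */
#ifdef DELETE_TIMELINE_EXAMPLE_SKETCH
#include <stdio.h>
#include <time.h>

int main(void)
  {
  time_t now   = time(NULL);
  int    delay = 2;  /* default when no request/queue/server delay is set */

  printf("t+0s  : SIGTERM acked by MOM\n");
  printf("t+%ds  : post_delete_mom2 -> SIGKILL (at %ld)\n",
    delay, (long)(now + delay));
  printf("t+%ds : job_delete_nanny re-armed (at %ld)\n",
    delay + 60, (long)(now + delay + 60));

  return 0;
  }
#endif /* DELETE_TIMELINE_EXAMPLE_SKETCH */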
void req_py_spawn(

  struct batch_request *preq)

  {
  int   jt;  /* job type */
  job  *pjob;
  int   rc;
  char *jid = preq->rq_ind.rq_py_spawn.rq_jid;
  int   i, offset;

  /*
   * Returns job pointer for singleton job or "parent" of
   * an array job.
   */
  pjob = chk_job_request(jid, preq, &jt);

  if (pjob == NULL)
    return;

  /* see if requestor is the job owner */
  if (svr_chk_owner(preq, pjob) != 0)
    {
    req_reject(PBSE_PERM, 0, preq);

    return;
    }

  if (jt == IS_ARRAY_NO)
    {
    /* a regular job is okay */

    /* the job must be running */
    if ((pjob->ji_qs.ji_state != JOB_STATE_RUNNING) ||
        (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING))
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }
    }
  else if (jt == IS_ARRAY_Single)
    {
    /* a single subjob is okay */

    offset = subjob_index_to_offset(pjob, get_index_from_jid(jid));

    if (offset == -1)
      {
      req_reject(PBSE_UNKJOBID, 0, preq);

      return;
      }

    i = get_subjob_state(pjob, offset);

    if (i == -1)
      {
      req_reject(PBSE_IVALREQ, 0, preq);

      return;
      }

    if (i != JOB_STATE_RUNNING)
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }

    if ((pjob = pjob->ji_ajtrk->tkm_tbl[offset].trk_psubjob) == NULL)
      {
      req_reject(PBSE_UNKJOBID, 0, preq);

      return;
      }

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING)
      {
      req_reject(PBSE_BADSTATE, 0, preq);

      return;
      }
    }
  else
    {
    reply_text(preq, PBSE_NOSUP, "not supported for Array Jobs or multiple sub-jobs");

    return;
    }

  /*
   * Pass the request on to MOM. If this works, the function
   * post_py_spawn_req will be called to handle the reply.
   * If it fails, send the reply now.
   */
  rc = relay_to_mom(pjob, preq, post_py_spawn_req);

  if (rc)
    req_reject(rc, 0, preq);  /* unable to get to MOM */
  }