/*
 * post_job_delete_nanny - handle the MOM's reply to the signal request
 * issued by the job delete nanny.
 *
 * @param preq_sig  the completed signal request (may be NULL, in which case
 *                  nothing is done).  It is released with free_br() on every
 *                  path that gets past the NULL check.
 *
 * Behaviour:
 *   - if the JobNanny server attribute is off, the request is just freed;
 *   - if the job cannot be found, an error is logged;
 *   - if the MOM answered PBSE_UNKJOBID, the job's nodes and resources are
 *     released and the job is purged;
 *   - otherwise the job found by svr_find_job() is unlocked.
 */
void post_job_delete_nanny(

  batch_request *preq_sig)

{
  int   rc;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny) {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);
    return;
  }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL) {
    snprintf(log_buf, sizeof(log_buf), "%s",
             "job delete nanny: the job disappeared (this is a BUG!)");
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
  }
  else if (rc == PBSE_UNKJOBID) {
    snprintf(log_buf, sizeof(log_buf), "%s",
             "job delete nanny returned, but does not exist on mom");
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);
    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
  }
  else {
    /* Only unlock a job we actually found; the not-found branch above has
     * no job (and therefore no mutex) to release. */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  }

  /* free task */
  free_br(preq_sig);
} /* END post_job_delete_nanny() */
/*
 * post_modify_arrayreq - handle the MOM's reply to a relayed array modify.
 *
 * @param preq  the relayed request carrying the MOM's reply; ignored if NULL,
 *              otherwise always released with free_br() before returning.
 *
 * Any reply code other than 0 or PBSE_UNKJOBID is logged as a failed modify.
 * For PBSE_UNKJOBID the job is looked up and, if still present, its state is
 * logged while the job mutex is held.
 */
void post_modify_arrayreq(

  batch_request *preq)

{
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  /* hard failure reported by the MOM: log it and we are done */
  if ((preq->rq_reply.brp_code) &&
      (preq->rq_reply.brp_code != PBSE_UNKJOBID)) {
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_modify.rq_objname,log_buf);

    free_br(preq);

    return;
  }

  if (preq->rq_reply.brp_code == PBSE_UNKJOBID) {
    job *pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

    if (pjob != NULL) {
      /* scoped so the job mutex is released before the request is freed */
      mutex_mgr job_mutex = mutex_mgr(pjob->ji_mutex, true);

      if (LOGLEVEL >= 0) {
        sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
                pjob->ji_qs.ji_jobid,
                PJobState[pjob->ji_qs.ji_state],
                PJobSubState[pjob->ji_qs.ji_substate],
                pjob->ji_qs.ji_destin);

        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    }
  }

  free_br(preq);
} /* END post_modify_arrayreq() */
int reply_send_async(struct batch_request *request) { int sfds = request->rq_conn; /* socket */ // only thread client responses if (svr_conn[sfds].cn_active != FromClientDIS) return reply_send(request); /* determine where the reply should go, remote or local */ if (sfds == PBS_LOCAL_CONNECTION) // default to synchronous version return reply_send(request); else if (sfds >= 0) { int rc = dis_reply_write_async(sfds, &request->rq_reply); if ((request->rq_type != PBS_BATCH_AsyModifyJob) || (request->rq_noreply == TRUE)) { free_br(request); } return rc; } /* Otherwise, the reply is to be sent to a remote client */ return 0; }
/*
 * post_rerun - handle the MOM's reply to a rerun signal.
 *
 * @param preq  the signal request with the MOM's reply; ignored if NULL,
 *              otherwise freed before returning.
 *
 * When the MOM rejected the signal, the rejection is logged and, if the job
 * still exists, its state is re-evaluated and set while its mutex is held.
 */
void post_rerun(

  batch_request *preq)

{
  if (preq == NULL)
    return;

  if (preq->rq_reply.brp_code != 0) {
    char  log_buf[LOCAL_LOG_BUF_SIZE];
    job  *pjob;

    sprintf(log_buf, "rerun signal reject by mom: %d", preq->rq_reply.brp_code);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_signal.rq_jid,log_buf);

    pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);

    if (pjob) {
      int newstate;
      int newsub;

      /* released automatically at the end of this scope */
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      svr_evaljobstate(pjob, &newstate, &newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
    }
  }

  free_br(preq);
} /* END post_rerun() */
/*
 * chkpt_xfr_hold - finish a checkpoint-file transfer for a held job.
 *
 * @param preq  the completed copy request; released with free_br() here
 * @param pjob  the job the checkpoint belongs to
 *
 * On the normal path the request is freed and an immediate work task running
 * mom_cleanup_checkpoint_hold is queued with a copy of the job id.
 *
 * The normal path frees preq, so this function is treated as owning it: the
 * early exits for a missing rq_extra or job now free it as well instead of
 * leaking it.
 * NOTE(review): assumes no caller frees preq itself after this returns - confirm.
 */
void chkpt_xfr_hold(

  batch_request *preq,
  job           *pjob)

{
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  if ((preq->rq_extra == NULL) ||
      (pjob == NULL)) {
    free_br(preq);
    return;
  }

  if (LOGLEVEL >= 7) {
    sprintf(log_buf, "BLCR copy completed (state is %s-%s)",
            PJobState[pjob->ji_qs.ji_state],
            PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
  }

  free_br(preq);

  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);
} /* END chkpt_xfr_hold() */
/*
 * chkpt_xfr_done - completion handler for a checkpoint transfer.
 *
 * @param preq  the finished request; its only handling is to be released.
 */
void chkpt_xfr_done(

  batch_request *preq)

{
  /* nothing to inspect in the reply - just release the request */
  free_br(preq);
} /* END chkpt_xfr_done() */
/*
 * release_req - work-task handler that frees the batch request stored in
 * wt_parm1 and closes the associated connection.
 *
 * @param pwt  the work task; wt_event holds the connection handle (-1 when
 *             there is none) and wt_aux2 == 1 marks the case the original
 *             comment calls "rpp", for which no disconnect is done.
 */
void release_req(struct work_task *pwt)
{
  struct batch_request *finished = (struct batch_request *)pwt->wt_parm1;

  free_br(finished);

  /* no connection, or rpp: nothing to disconnect */
  if ((pwt->wt_event == -1) || (pwt->wt_aux2 == 1))
    return;

  svr_disconnect(pwt->wt_event);
}
/*
 * issue_track - send a Track Job request to the server named in the job id.
 *
 * @param pjob  the job to report; the destination server is taken from the
 *              part of ji_jobid that follows the first '.'.
 *
 * If the job id contains no '.', there is no server suffix to send to and the
 * function returns without allocating a request (the previous hand-rolled
 * scan would have read past the end of the id in that case).
 */
void issue_track(

  job *pjob)

{
  struct batch_request *preq;
  char                 *pc;

  /* locate the server suffix before allocating anything */
  pc = strchr(pjob->ji_qs.ji_jobid, '.');

  if (pc == NULL)
    return;

  preq = alloc_br(PBS_BATCH_TrackJob);

  if (preq == NULL)
    return;

  preq->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;

  strcpy(preq->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
  strcpy(preq->rq_ind.rq_track.rq_location, server_name);

  preq->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

  /* pc + 1 is the first character after the '.' */
  issue_to_svr(pc + 1, preq, NULL);

  free_br(preq);
}
void *send_power_state_to_mom( void *arg) { struct batch_request *pRequest = (struct batch_request *)arg; struct pbsnode *pNode = find_nodebyname(pRequest->rq_host); if (pNode == NULL) { free_br(pRequest); return NULL; } int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if (handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); return NULL; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); issue_Drequest(handle, pRequest, true); return NULL; }
/*
 * req_messagejob - service a Message Job request by relaying a copy of it to
 * the MOM running the job.
 *
 * @param preq  (I) the client's request
 * @return      always NULL
 *
 * The job must be in JOB_STATE_RUNNING, otherwise the request is rejected
 * with PBSE_BADSTATE.  A duplicate of the request is relayed; on success the
 * duplicate is passed to post_message_req() and the original is freed, on
 * failure the original is rejected and the duplicate freed.
 */
void *req_messagejob(

  batch_request *preq) /* I */

{
  job *pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq);

  if (pjob == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(NULL);
  }

  batch_request *dup_req = NULL;
  int            rc = copy_batchrequest(&dup_req, preq, 0, -1);

  if (rc != 0) {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
  }
  else {
    /* pass the request on to MOM */
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (rc != PBSE_NONE) {
      req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
      free_br(dup_req);
    }
    else {
      post_message_req(dup_req);
      free_br(preq);
    }
  }

  /* relay_to_mom() may have cleared pjob; in that case the mutex manager
   * must not touch the job's mutex when it goes out of scope */
  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
} /* END req_messagejob() */
void *check_if_orphaned( void *vp) { char *rsv_id = (char *)vp; char job_id[PBS_MAXSVRJOBID]; struct batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; char log_buf[LOCAL_LOG_BUF_SIZE]; if (is_orphaned(rsv_id, job_id) == TRUE) { if((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL) return NULL; preq->rq_extend = rsv_id; /* Assume the request will be successful and remove the RSV from the hash table */ remove_alps_reservation(rsv_id); if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); snprintf(log_buf, sizeof(log_buf), "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it", rsv_id, job_id, pnode->nd_name); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS); retries++; } /* unlock before the network transaction */ unlock_node(pnode, __func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq, true); free_br(preq); } } else free(rsv_id); return(NULL); } /* END check_if_orphaned() */
/*
 * release_req - work-task handler that frees the batch request stored in
 * wt_parm1 and, when the task carries a connection handle, disconnects it.
 *
 * @param pwt  the work task; wt_event is the connection handle, -1 meaning
 *             there is none.
 */
void release_req(

  struct work_task *pwt)

{
  struct batch_request *finished = (struct batch_request *)pwt->wt_parm1;

  free_br(finished);

  if (pwt->wt_event == -1)
    return;

  svr_disconnect(pwt->wt_event);
}
/*
 * issue_track - send a Track Job request to the server named in the suffix of
 * the job id (the text after the first '.').
 *
 * @param pjob  the job to report
 *
 * Nothing is sent, and an explanatory message is logged, when the job id has
 * no '.' suffix or when the suffix names this server (server_name).
 */
void issue_track(

  job *pjob)

{
  struct batch_request *track_req;
  char                 *dot;
  char                 *sname;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  dot = strchr(pjob->ji_qs.ji_jobid, '.');

  if (dot == NULL) {
    snprintf(log_buf, sizeof(log_buf),
      "Remote job routing is not compatible with display_job_server_suffix set to false. Cannot track %s",
      pjob->ji_qs.ji_jobid);
    log_err(-1, __func__, log_buf);

    return;
  }

  sname = dot + 1;

  /* do not issue track requests to ourselves */
  if (strcmp(sname, server_name) == 0) {
    snprintf(log_buf, sizeof(log_buf),
      "%s erroneously called for local job %s",
      __func__,
      pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    return;
  }

  track_req = alloc_br(PBS_BATCH_TrackJob);

  if (track_req == NULL)
    return;

  track_req->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;

  strcpy(track_req->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
  strcpy(track_req->rq_ind.rq_track.rq_location, server_name);

  track_req->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

  /* sname already points just past the first '.', which is the address a
   * second scan of the job id would arrive at */
  issue_to_svr(sname, track_req, NULL);

  free_br(track_req);
}
int issue_signal( job **pjob_ptr, char *signame, /* name of the signal to send */ void (*func)(batch_request *), void *extra) /* extra parameter to be stored in sig request */ { int rc; job *pjob = *pjob_ptr; struct batch_request *newreq; char jobid[PBS_MAXSVRJOBID + 1]; /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL) { /* FAILURE */ return(PBSE_SYSTEM); } newreq->rq_extra = extra; strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame); /* The newreq is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ rc = relay_to_mom(&pjob, newreq, NULL); if ((rc == PBSE_NONE) && (pjob != NULL)) { strcpy(jobid, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, NULL, 0); func(newreq); *pjob_ptr = svr_find_job((char *)jobid, TRUE); } else { free_br(newreq); if (pjob == NULL) *pjob_ptr = NULL; } return(rc); } /* END issue_signal() */
/*
 * remove_stagein - ask the MOM to delete the files that were staged in for a
 * job.
 *
 * @param pjob_ptr  (I/O) the job.  If relay_to_mom() clears the job pointer,
 *                  *pjob_ptr is now set to NULL as well, matching
 *                  issue_signal(), so the caller is not left with a pointer
 *                  this function already knows to be unusable.
 *
 * A copy-files request is built from the job's stage-in attribute, converted
 * to a delete-files request and relayed.  On success the StagedIn server flag
 * is cleared; on failure a message is logged against the job id, which is
 * captured before the relay because the job pointer may be NULL afterwards.
 */
void remove_stagein(

  job **pjob_ptr) /* I/O */

{
  struct batch_request *preq = 0;
  job                  *pjob = *pjob_ptr;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq == NULL)
    return;   /* no files to delete */

  /* have files to delete */

  /* change the request type from copy to delete */
  preq->rq_type = PBS_BATCH_DelFiles;
  preq->rq_extra = NULL;

  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);

  if (relay_to_mom(&pjob, preq, NULL) == PBSE_NONE) {
    if (pjob != NULL)
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
  }
  else {
    /* log that we were unable to remove the files */
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_FILE,
      jobid,
      "unable to remove staged in files for job");
  }

  if (pjob == NULL)
    *pjob_ptr = NULL;

  free_br(preq);
} /* END remove_stagein() */
/*
 * remove_stagein - ask the MOM to delete the files that were staged in for a
 * job.
 *
 * @param pjob  (I) the job whose stage-in files are to be removed
 *
 * A copy-files request is built from the job's stage-in attribute, converted
 * to a delete-files request and relayed with release_req as its completion
 * handler.  On success the StagedIn server flag is cleared; on failure a
 * message is logged and the request is freed here.
 *
 * The former local "addr" was summed from the MOM address and ports but never
 * read, so it has been dropped.
 */
void remove_stagein(

  job *pjob) /* I */

{
  struct batch_request *preq = 0;

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq == NULL)
    return;   /* no files to delete */

  /* have files to delete */

  /* change the request type from copy to delete */
  preq->rq_type = PBS_BATCH_DelFiles;
  preq->rq_extra = NULL;

  if (relay_to_mom(
        pjob,
        preq,
        release_req) == 0) {
    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
  }
  else {
    /* log that we were unable to remove the files */
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_FILE,
      pjob->ji_qs.ji_jobid,
      "unable to remove staged in files for job");

    free_br(preq);
  }
} /* END remove_stagein() */
void *check_if_orphaned( void *vp) { char *rsv_id = (char *)vp; struct batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; if (is_orphaned(rsv_id) == TRUE) { preq = alloc_br(PBS_BATCH_DeleteReservation); preq->rq_extend = rsv_id; if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS); retries++; } /* unlock before the network transaction */ unlock_node(pnode, __func__, NULL, 0); if (handle >= 0) { issue_Drequest(handle, preq, release_req, 0); } else free_br(preq); } } else free(rsv_id); return(NULL); } /* END check_if_orphaned() */
/*
 * copy_attribute_list - duplicate the manager attribute list of one request
 * onto another.
 *
 * @param preq      source request; its rq_manager.rq_attr list is read
 * @param preq_tmp  destination request; copies are appended to its
 *                  rq_manager.rq_attr list
 * @return          PBSE_NONE on success; PBSE_SYSTEM if an entry cannot be
 *                  allocated, in which case preq_tmp has been freed
 *
 * Each entry is allocated as one block of al_tsize + 1 bytes; the name,
 * resource and value strings are copied into the space following the
 * svrattrl header at the offsets given by al_nameln and al_rescln.
 *
 * Every copied entry is now appended to the destination list as soon as it is
 * built.  Previously the append happened once after the loop, so only the
 * last entry was linked and every earlier copy was lost.
 */
int copy_attribute_list(

  batch_request *preq,
  batch_request *preq_tmp)

{
  svrattrl   *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr);
  tlist_head *phead = &preq_tmp->rq_ind.rq_manager.rq_attr;
  svrattrl   *newpal = NULL;

  while (pal != NULL) {
    newpal = (svrattrl *)calloc(1, pal->al_tsize + 1);

    if (!newpal) {
      free_br(preq_tmp);
      return(PBSE_SYSTEM);
    }

    CLEAR_LINK(newpal->al_link);

    newpal->al_atopl.next = 0;
    newpal->al_tsize = pal->al_tsize + 1;
    newpal->al_nameln = pal->al_nameln;
    newpal->al_flags  = pal->al_flags;

    /* the strings live directly behind the header, back to back */
    newpal->al_atopl.name = (char *)newpal + sizeof(svrattrl);
    strcpy((char *)newpal->al_atopl.name, pal->al_atopl.name);

    newpal->al_atopl.resource = newpal->al_atopl.name + newpal->al_nameln;

    if (pal->al_atopl.resource != NULL)
      strcpy((char *)newpal->al_atopl.resource, pal->al_atopl.resource);

    newpal->al_rescln = pal->al_rescln;
    newpal->al_atopl.value = newpal->al_atopl.name + newpal->al_nameln + newpal->al_rescln;
    strcpy((char *)newpal->al_atopl.value, pal->al_atopl.value);
    newpal->al_valln = pal->al_valln;
    newpal->al_atopl.op = pal->al_atopl.op;

    /* link this copy in before moving on */
    append_link(phead, &newpal->al_link, newpal);

    pal = (struct svrattrl *)GET_NEXT(pal->al_link);
  }

  return(PBSE_NONE);
} /* END copy_attribute_list() */
/* NOTE(review): this END_TEST presumably closes a preceding test whose body
 * is not visible here. */
END_TEST

/*
 * test_alloc_br - checks the fields of a request returned by alloc_br():
 * the requested type is stored, both connection fields are -1, the reply
 * choice is BATCH_REPLY_CHOICE_NULL, rq_noreply is FALSE and rq_time is
 * positive.  It then frees the request and checks the free_attrlist_called
 * counter is non-zero.
 * NOTE(review): preq is dereferenced without a NULL check - confirm alloc_br()
 * cannot fail in the test build.
 */
START_TEST(test_alloc_br)
  {
  batch_request *preq = alloc_br(PBS_BATCH_QueueJob);

  fail_unless(preq->rq_type == PBS_BATCH_QueueJob);
  fail_unless(preq->rq_conn == -1);
  fail_unless(preq->rq_orgconn == -1);
  fail_unless(preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_NULL);
  fail_unless(preq->rq_noreply == FALSE);
  fail_unless(preq->rq_time > 0);

  free_br(preq);
  /* free_attrlist_called is defined outside this block */
  fail_unless(free_attrlist_called > 0);
  }
void release_req( struct work_task *pwt) { batch_request *preq; char *br_id = pwt->wt_parm1; if ((preq = get_remove_batch_request(br_id)) != NULL) free_br(preq); if (pwt->wt_event != -1) svr_disconnect(pwt->wt_event); free(pwt->wt_mutex); free(pwt); } /* END release_req() */
/*
 * post_checkpoint - handle the MOM's reply to a checkpoint request.
 *
 * @param preq  the request with the MOM's reply; ignored if NULL, otherwise
 *              freed before returning.
 *
 * Reply code 0 with a non-zero auxcode marks the job's checkpoint as
 * migrateable.  A non-zero reply code clears the checkpoint-file flag, puts
 * the job back in the running substate and, if the job is in
 * JOB_STATE_RUNNING, calls rerun_or_kill().  The job, if still referenced
 * afterwards, is unlocked last.
 */
void post_checkpoint(

  batch_request *preq)

{
  job *pjob;

  if (preq == NULL)
    return;

  pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE);

  if (pjob != NULL) {
    if (preq->rq_reply.brp_code != 0) {
      /* need to try rerun if possible or just abort the job */
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(&pjob, msg_on_shutdown);
    }
    else if (preq->rq_reply.brp_auxcode) {
      /* checkpointed ok and the checkpoint can be moved */
      pjob->ji_qs.ji_svrflags =
        (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) |
        JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
    }
  }

  free_br(preq);

  /* rerun_or_kill() takes &pjob, so re-check the pointer before unlocking */
  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
} /* END post_checkpoint() */
void *req_messagejob( void *vp) { struct batch_request *preq = (struct batch_request *)vp; job *pjob; int rc; struct batch_request *dup_req = NULL; if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL) return(NULL); /* the job must be running */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); return(NULL); } if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL); } /* pass the request on to MOM */ /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, post_message_req)) != 0) req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */ else free_br(preq); /* After MOM acts and replies to us, we pick up in post_message_req() */ if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return(NULL); } /* END req_messagejob() */
void chkpt_xfr_hold( struct work_task *ptask) { job *pjob; struct batch_request *preq; char log_buf[LOCAL_LOG_BUF_SIZE]; preq = get_remove_batch_request(ptask->wt_parm1); free(ptask->wt_mutex); free(ptask); if ((preq == NULL) || (preq->rq_extra == NULL)) return; if ((pjob = svr_find_job(preq->rq_extra, FALSE)) == NULL) return; if (LOGLEVEL >= 7) { sprintf(log_buf, "BLCR copy completed (state is %s-%s)", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate]); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); } free_br(preq); set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); return; } /* END chkpt_xfr_hold() */
int reply_send_svr( struct batch_request *request) /* I (freed) */ { int rc = 0; char log_buf[LOCAL_LOG_BUF_SIZE]; int sfds = request->rq_conn; /* socket */ /* Handle remote replies - local batch requests no longer create work tasks */ if (sfds >= 0) { /* Otherwise, the reply is to be sent to a remote client */ if (request->rq_noreply != TRUE) { rc = dis_reply_write(sfds, &request->rq_reply); if (LOGLEVEL >= 7) { sprintf(log_buf, "Reply sent for request type %s on socket %d", reqtype_to_txt(request->rq_type), sfds); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } } } if (((request->rq_type != PBS_BATCH_AsyModifyJob) && (request->rq_type != PBS_BATCH_AsyrunJob) && (request->rq_type != PBS_BATCH_AsySignalJob)) || (request->rq_noreply == TRUE)) { free_br(request); } return(rc); } /* END reply_send_svr() */
/*
 * reply_send_mom - send the reply stored in a request and free the request.
 *
 * @param request  (I, freed) the request whose rq_reply is sent
 * @return         PBSE_SYSTEM for a local connection, the result of
 *                 dis_reply_write() for a non-negative socket, 0 otherwise
 */
int reply_send_mom(

  struct batch_request *request) /* I (freed) */

{
  const int sock = request->rq_conn; /* socket */
  int       status;

  /* determine where the reply should go, remote or local */
  if (sock == PBS_LOCAL_CONNECTION)
    status = PBSE_SYSTEM;
  else if (sock >= 0)
    status = dis_reply_write(sock, &request->rq_reply); /* remote client */
  else
    status = 0;

  free_br(request);

  return(status);
} /* END reply_send_mom() */
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; pbs_queue *pque; char *preq_clt_id; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt = NULL; /* original client request */ int rc; time_t time_now = time(NULL); preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; preq_clt_id = preq_sig->rq_extra; free_br(preq_sig); if (preq_clt_id != NULL) { preq_clt = get_remove_batch_request(preq_clt_id); free(preq_clt_id); } /* the client request has been handled another way, nothing left to do */ if (preq_clt == NULL) return; pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); svr_job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. 
*/ if (delay == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); pthread_mutex_unlock(server.sv_attr_mutex); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob != NULL) return; } set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE); /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END post_delete_mom1() */
/*
 * handle_delete_all - apply a delete request to every job in alljobs.
 *
 * @param preq      the client's delete request; it receives the final
 *                  reply_ack() or req_reject()
 * @param preq_tmp  optional request that is acknowledged up front; when it is
 *                  given, preq is marked rq_noreply
 * @param Msg       message passed through to execute_job_delete()
 * @return          always PBSE_NONE; failures are reported to the client via
 *                  req_reject() with a count of failed deletes
 *
 * A duplicate of preq (marked rq_noreply) is used for each per-job operation
 * so that the real request can carry the overall answer.
 * NOTE(review): duplicate_request() results are dereferenced without a NULL
 * check - confirm it cannot fail.
 */
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  struct batch_request *preq_dup = duplicate_request(preq);
  job *pjob;
  int iter = -1;
  int failed_deletes = 0;
  int total_jobs = 0;
  int rc = PBSE_NONE;
  char tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;

  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    /* a successful forced purge needs no further handling for this job */
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    /* jobs at or past JOB_STATE_EXITING are unlocked and skipped */
    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);

      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* a fresh duplicate is made at the end of each loop pass, so free the
   * extra one if it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  } /* END handle_delete_all() */
int stat_to_mom( char *job_id, struct stat_cntl *cntl) /* M */ { struct batch_request *newrq; int rc = PBSE_NONE; unsigned long addr; char log_buf[LOCAL_LOG_BUF_SIZE+1]; struct pbsnode *node; int handle = -1; unsigned long job_momaddr = -1; unsigned short job_momport = -1; char *job_momname = NULL; job *pjob = NULL; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) return(PBSE_JOBNOTFOUND); mutex_mgr job_mutex(pjob->ji_mutex, true); if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str)) { job_mutex.unlock(); snprintf(log_buf, sizeof(log_buf), "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); return PBSE_BAD_PARAMETER; } job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); job_mutex.unlock(); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL) { free(job_momname); return PBSE_MEM_MALLOC; } if (cntl->sc_type == 1) snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id); else newrq->rq_ind.rq_status.rq_id[0] = '\0'; /* get stat of all */ CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr); /* if MOM is down just return stale information */ addr = job_momaddr; node = tfind_addr(addr,job_momport,job_momname); free(job_momname); if (node == NULL) return PBSE_UNKNODE; if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING)) { if (LOGLEVEL >= 6) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "node '%s' is allocated to job but in state 'down'", node->nd_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf); } unlock_node(node, __func__, "no rely mom", LOGLEVEL); free_br(newrq); return PBSE_NORELYMOM; } /* get connection to MOM */ unlock_node(node, __func__, "before svr_connect", LOGLEVEL); 
handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL); if (handle >= 0) { if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE) { stat_update(newrq, cntl); } } else rc = PBSE_CONNECT; if (rc == PBSE_SYSTEM) rc = PBSE_MEM_MALLOC; free_br(newrq); return(rc); } /* END stat_to_mom() */
/*
 * process_request - read one batch request from a socket, validate where it
 * came from, and hand it to dispatch_request().
 *
 * @param sfds  the socket the request arrives on
 *
 * The same source builds for the server and, with PBS_MOM defined, for the
 * MOM; the two differ in how the peer is validated and which permissions are
 * granted.  On every early exit visible here the request is either passed to
 * req_reject()/reply_ack()/req_connect() or released with free_br().
 */
void process_request(int sfds)
{
	int rc;
	struct batch_request *request;
	conn_t *conn;

	time_now = time(NULL);

	conn = get_conn(sfds);

	if (!conn) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "did not find socket in connection table");
		/* no connection-table entry, so close the raw descriptor directly */
#ifdef WIN32
		(void)closesocket(sfds);
#else
		(void)close(sfds);
#endif
		return;
	}

	if ((request = alloc_br(0)) == NULL) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "Unable to allocate request structure");
		close_conn(sfds);
		return;
	}
	request->rq_conn = sfds;

	/*
	 * Read in the request and decode it to the internal request structure.
	 */

	/* a non-zero result means the peer's host name could not be obtained */
	if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME)) {

		(void)sprintf(log_buffer, "%s: %lu", msg_reqbadhost, get_connectaddr(sfds));
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer);
		req_reject(PBSE_BADHOST, 0, request);
		return;
	}

#ifndef PBS_MOM

	/* server build: only connections marked FromClientDIS may carry requests */
	if (conn->cn_active == FromClientDIS) {
		rc = dis_request_read(sfds, request);
	} else {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_req", "request on invalid type of connection");
		close_conn(sfds);
		free_br(request);
		return;
	}
#else	/* PBS_MOM */
	rc = dis_request_read(sfds, request);
#endif	/* PBS_MOM */

	if (rc == -1) {		/* End of file */
		close_client(sfds);
		free_br(request);
		return;

	} else if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) {

		/* read error, likely cannot send reply so just disconnect */

		/* ??? not sure about this ??? 
 */

		close_client(sfds);
		free_br(request);
		return;

	} else if (rc > 0) {

		/*
		 * request didn't decode, either garbage or unknown
		 * request type, in either case, return reject-reply
		 */

		req_reject(rc, 0, request);
		close_client(sfds);
		return;
	}

#ifndef PBS_MOM
	/* If the request is coming on the socket we opened to the  */
	/* scheduler,  change the "user" from "root" to "Scheduler" */
	if (find_sched_from_sock(request->rq_conn) != NULL) {
		strncpy(request->rq_user, PBS_SCHED_DAEMON_NAME, PBS_MAXUSER);
		/* strncpy may not terminate; terminate explicitly */
		request->rq_user[PBS_MAXUSER] = '\0';
	}
#endif	/* PBS_MOM */

	(void)sprintf(log_buffer, msg_request, request->rq_type, request->rq_user, request->rq_host, sfds);
	log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer);

	/* is the request from a host acceptable to the server */
	if (request->rq_type == PBS_BATCH_AuthExternal) {
		/* 0 is acknowledged, -2 is answered with PBSE_NOSUP, anything else with PBSE_BADCRED */
		rc = authenticate_external(conn, request);
		if (rc == 0)
			reply_ack(request);
		else if (rc == -2)
			req_reject(PBSE_NOSUP, 0, request);
		else
			req_reject(PBSE_BADCRED, 0, request);
		return;
	}

#ifndef PBS_MOM
	if (server.sv_attr[(int)SRV_ATR_acl_host_enable].at_val.at_long) {
		/* acl enabled, check it; always allow myself	*/

		struct pbsnode *isanode = NULL;
		if ((server.sv_attr[SRV_ATR_acl_host_moms_enable].at_flags & ATR_VFLAG_SET) && (server.sv_attr[(int)SRV_ATR_acl_host_moms_enable].at_val.at_long == 1)) {
			isanode = find_nodebyaddr(get_connectaddr(sfds));

			/* a node flagged INUSE_DELETED is treated as not being a node */
			if ((isanode != NULL) && (isanode->nd_state & INUSE_DELETED))
				isanode = NULL;
		}

		if (isanode == NULL) {
			if ((acl_check(&server.sv_attr[(int)SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0) && (strcasecmp(server_host, request->rq_host) != 0)) {
				req_reject(PBSE_BADHOST, 0, request);
				close_client(sfds);
				return;
			}
		}
	}

	/*
	 * determine source (user client or another server) of request. 
 * set the permissions granted to the client
	 */

	if (conn->cn_authen & PBS_NET_CONN_FROM_PRIVIL) {

		/* request came from another server */

		request->rq_fromsvr = 1;
		request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

	} else {

		/* request not from another server */

		request->rq_fromsvr = 0;

		/*
		 * Client must be authenticated by a Authenticate User Request,
		 * if not, reject request and close connection.
		 * -- The following is retained for compat with old cmds --
		 * The exception to this is of course the Connect Request which
		 * cannot have been authenticated, because it contains the
		 * needed ticket; so trap it here.  Of course, there is no
		 * prior authentication on the Authenticate User request either,
		 * but it comes over a reserved port and appears from another
		 * server, hence is automatically granted authorization.
		 */

		if (request->rq_type == PBS_BATCH_Connect) {
			req_connect(request);
			return;
		}

		if ((conn->cn_authen & PBS_NET_CONN_AUTHENTICATED) ==0) {
			rc = PBSE_BADCRED;
		} else {
			rc = authenticate_user(request, conn);
		}
		if (rc != 0) {
			req_reject(rc, 0, request);
			/* only a bad credential also closes the connection */
			if (rc == PBSE_BADCRED)
				close_client(sfds);
			return;
		}

		request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
	}

	/* if server shutting down, disallow new jobs and new running */

	if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long > SV_STATE_RUN) {
		switch (request->rq_type) {
			case PBS_BATCH_AsyrunJob:
			case PBS_BATCH_JobCred:
			case PBS_BATCH_UserCred:
			case PBS_BATCH_UserMigrate:
			case PBS_BATCH_MoveJob:
			case PBS_BATCH_QueueJob:
			case PBS_BATCH_RunJob:
			case PBS_BATCH_StageIn:
			case PBS_BATCH_jobscript:
				req_reject(PBSE_SVRDOWN, 0, request);
				return;
		}
	}


#else	/* THIS CODE FOR MOM ONLY */

	/* check connecting host against allowed list of ok clients */
	if (!addrfind(conn->cn_addr)) {
		req_reject(PBSE_BADHOST, 0, request);
		close_client(sfds);
		return;
	}

	request->rq_fromsvr = 1;
	request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | 
 ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;
#endif

	/*
	 * dispatch the request to the correct processing function.
	 * The processing function must call reply_send() to free
	 * the request structure.
	 */

	dispatch_request(sfds, request);
	return;
}
/*
 * modify_whole_array()
 * Applies an attribute modification to every job of a job array.
 *
 * @param pa              (I/O) the array; ids of jobs that can no longer be
 *                        found are freed and cleared from pa->job_ids
 * @param plist           (I) the attributes to apply
 * @param preq            (I) the client's modify request
 * @param checkpoint_req  (I) passed through to modify_job()
 * @return  PBSE_RELAYED_TO_MOM when at least one job's change was relayed to
 *          its MOM; the error code when copying or relaying a request fails;
 *          otherwise the result of the last modify_job() call (0 if none ran)
 *
 * preq->rq_refcount is raised once per relayed job plus once more on the
 * first relay; that extra reference is dropped at the end, and preq is freed
 * here only if the count has reached zero by then.
 *
 * @SEE req_modify_array PARENT
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

{
  int   idx;
  int   rc = 0;
  int   relayed = 0;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  job  *pjob;

  for (idx = 0; idx < pa->ai_qs.array_size; idx++) {
    if (pa->job_ids[idx] == NULL)
      continue;

    pjob = svr_find_job(pa->job_ids[idx], FALSE);

    if (pjob == NULL) {
      /* the job is gone: forget its id */
      free(pa->job_ids[idx]);
      pa->job_ids[idx] = NULL;

      continue;
    }

    /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
    rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

    if (rc == PBSE_RELAYED_TO_MOM) {
      struct batch_request *array_req = NULL;

      /* We told modify_job not to call relay_to_mom
       * so we need to contact the mom */
      rc = copy_batchrequest(&array_req, preq, 0, idx);

      if (rc != 0) {
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(rc);
      }

      /* one reference per relayed job, plus one held by this function
       * from the first relay until the loop is finished */
      preq->rq_refcount++;

      if (relayed == 0)
        preq->rq_refcount++;

      relayed++;

      rc = relay_to_mom(&pjob, array_req, post_modify_arrayreq);

      if (rc) {
        if (pjob != NULL) {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);

          unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
        }

        return(rc); /* unable to get to MOM */
      }
    }

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
  } /* END foreach job in array */

  if (!relayed)
    return(rc);

  /* drop the reference this function was holding */
  preq->rq_refcount--;

  if (preq->rq_refcount == 0)
    free_br(preq);

  return(PBSE_RELAYED_TO_MOM);
} /* END modify_whole_array() */