/*
 * issue_track - send a Track Job request for a job to another server.
 *
 * Builds a PBS_BATCH_TrackJob request carrying the job's hop count, its
 * job id, this server's name (server_name) as the location and the job
 * state character, then passes it to issue_to_svr() addressed to the text
 * that follows the first '.' in the job id.
 *
 * pjob - job being tracked.
 *
 * Returns nothing.  No request is issued when the job id contains no '.'
 * or when the request structure cannot be allocated.
 */
void issue_track(

  job *pjob)

  {
  struct batch_request *preq;
  char                 *pc;

  /* A job id without a '.' names no destination; stop here instead of
   * scanning past the string terminator looking for one. */
  pc = strchr(pjob->ji_qs.ji_jobid, '.');

  if (pc == NULL)
    return;

  preq = alloc_br(PBS_BATCH_TrackJob);

  if (preq == NULL)
    return;

  preq->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;

  strcpy(preq->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
  strcpy(preq->rq_ind.rq_track.rq_location, server_name);

  preq->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

  /* the destination is everything after the first '.' of the job id */
  issue_to_svr(pc + 1, preq, NULL);

  free_br(preq);
  }
struct batch_request *duplicate_request( struct batch_request *preq) { struct batch_request *preq_tmp = alloc_br(preq->rq_type); preq_tmp->rq_perm = preq->rq_perm; preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd; preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype; preq_tmp->rq_fromsvr = preq->rq_fromsvr; preq_tmp->rq_extsz = preq->rq_extsz; preq_tmp->rq_conn = preq->rq_conn; memcpy(preq_tmp->rq_ind.rq_manager.rq_objname, preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1); memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1); memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1); if (preq->rq_extend != NULL) preq_tmp->rq_extend = strdup(preq->rq_extend); if (preq->rq_type == PBS_BATCH_RunJob) { if (preq->rq_ind.rq_run.rq_destin) preq_tmp->rq_ind.rq_run.rq_destin = strdup(preq->rq_ind.rq_run.rq_destin); } return(preq_tmp); } /* END duplicate_request() */
void *check_if_orphaned( void *vp) { char *rsv_id = (char *)vp; char job_id[PBS_MAXSVRJOBID]; struct batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; char log_buf[LOCAL_LOG_BUF_SIZE]; if (is_orphaned(rsv_id, job_id) == TRUE) { if((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL) return NULL; preq->rq_extend = rsv_id; /* Assume the request will be successful and remove the RSV from the hash table */ remove_alps_reservation(rsv_id); if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); snprintf(log_buf, sizeof(log_buf), "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it", rsv_id, job_id, pnode->nd_name); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS); retries++; } /* unlock before the network transaction */ unlock_node(pnode, __func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq, true); free_br(preq); } } else free(rsv_id); return(NULL); } /* END check_if_orphaned() */
static void job_delete_nanny( struct work_task *pwt) { job *pjob; char *sigk = "SIGKILL"; char *jobid; struct batch_request *newreq; char log_buf[LOCAL_LOG_BUF_SIZE]; time_t time_now = time(NULL); long nanny = FALSE; /* short-circuit if nanny isn't enabled */ get_svr_attr_l(SRV_ATR_JobNanny, &nanny); if (!nanny) { jobid = (char *)pwt->wt_parm1; if (jobid != NULL) { pjob = svr_find_job(jobid, FALSE); if (pjob != NULL) { sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid); log_err(-1, "job nanny", log_buf); /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL) { strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk); } issue_signal(&pjob, sigk, post_job_delete_nanny, newreq); if (pjob != NULL) { apply_job_delete_nanny(pjob, time_now + 60); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } } else { log_err(ENOMEM, __func__, "Cannot allocate memory"); } } if (pwt->wt_parm1 != NULL) free(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); } /* END job_delete_nanny() */
/**
 * @brief
 *	Clone a batch request so it can be applied to one running subjob.
 *
 * @param[in]	opreq - the original batch request to clone
 * @param[in]	pjob  - the subjob's job structure; its job id replaces the
 *			id carried by the original request
 * @param[in]	func  - handler invoked with the clone and pjob
 *
 * @par
 *	1. allocate a request of the same type and copy the common header
 *	   fields (rq_extend is shared with the original, not copied)
 *	2. copy the type-specific body and overwrite its job id
 *	3. record the original as the clone's parent and bump the original's
 *	   reference count
 *	4. hand the clone and the job to "func"
 *
 * @note
 *	Only PBS_BATCH_DeleteJob, PBS_BATCH_SignalJob, PBS_BATCH_Rerun and
 *	PBS_BATCH_RunJob are handled.  Any other type discards the clone and
 *	returns without calling "func"; add a case below to support more.
 */
void
dup_br_for_subjob(struct batch_request *opreq, job *pjob, void (*func)(struct batch_request *, job *))
{
	struct batch_request *clone;
	int supported = 1;

	clone = alloc_br(opreq->rq_type);
	if (clone == NULL)
		return;

	/* common header */
	clone->rq_perm = opreq->rq_perm;
	clone->rq_fromsvr = opreq->rq_fromsvr;
	clone->rq_conn = opreq->rq_conn;
	clone->rq_orgconn = opreq->rq_orgconn;
	clone->rq_time = opreq->rq_time;
	strcpy(clone->rq_user, opreq->rq_user);
	strcpy(clone->rq_host, opreq->rq_host);
	clone->rq_extend = opreq->rq_extend;
	clone->rq_reply.brp_choice = BATCH_REPLY_CHOICE_NULL;
	clone->rq_refct = 0;

	/* type-specific body, retargeted at the subjob's id */
	switch (opreq->rq_type) {
		case PBS_BATCH_RunJob:
			clone->rq_ind.rq_run = opreq->rq_ind.rq_run;
			(void)strcpy(clone->rq_ind.rq_run.rq_jid, pjob->ji_qs.ji_jobid);
			break;

		case PBS_BATCH_Rerun:
			(void)strcpy(clone->rq_ind.rq_rerun, pjob->ji_qs.ji_jobid);
			break;

		case PBS_BATCH_SignalJob:
			clone->rq_ind.rq_signal = opreq->rq_ind.rq_signal;
			(void)strcpy(clone->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
			break;

		case PBS_BATCH_DeleteJob:
			clone->rq_ind.rq_delete = opreq->rq_ind.rq_delete;
			(void)strcpy(clone->rq_ind.rq_delete.rq_objname, pjob->ji_qs.ji_jobid);
			break;

		default:
			supported = 0;
			break;
	}

	if (!supported) {
		/* unlink and drop the clone; the original is left untouched */
		delete_link(&clone->rq_link);
		free(clone);
		return;
	}

	clone->rq_parentbr = opreq;
	opreq->rq_refct++;

	func(clone, pjob);
}
/*
 * issue_track - send a Track Job request for a job to the server named in
 * the suffix of its job id.
 *
 * pjob - job being tracked.
 *
 * The server name is the text after the first '.' of the job id.  Nothing
 * is sent (and a message is logged) when the id has no '.', or when that
 * name equals server_name.  Nothing is sent either when the request cannot
 * be allocated.
 */
void issue_track(

  job *pjob)

  {
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *track_req;
  char                 *dot;
  char                 *origin;

  dot = strchr(pjob->ji_qs.ji_jobid, '.');

  if (dot == NULL)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Remote job routing is not compatible with display_job_server_suffix set to false. Cannot track %s",
      pjob->ji_qs.ji_jobid);
    log_err(-1, __func__, log_buf);

    return;
    }

  origin = dot + 1;

  /* do not issue track requests to ourselves */
  if (strcmp(origin, server_name) == 0)
    {
    snprintf(log_buf, sizeof(log_buf),
      "%s erroneously called for local job %s",
      __func__,
      pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    return;
    }

  track_req = alloc_br(PBS_BATCH_TrackJob);

  if (track_req == NULL)
    return;

  strcpy(track_req->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
  strcpy(track_req->rq_ind.rq_track.rq_location, server_name);

  track_req->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;
  track_req->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

  /* origin already points just past the first '.' of the job id */
  issue_to_svr(origin, track_req, NULL);

  free_br(track_req);
  }
int issue_signal( job **pjob_ptr, char *signame, /* name of the signal to send */ void (*func)(batch_request *), void *extra) /* extra parameter to be stored in sig request */ { int rc; job *pjob = *pjob_ptr; struct batch_request *newreq; char jobid[PBS_MAXSVRJOBID + 1]; /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL) { /* FAILURE */ return(PBSE_SYSTEM); } newreq->rq_extra = extra; strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame); /* The newreq is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ rc = relay_to_mom(&pjob, newreq, NULL); if ((rc == PBSE_NONE) && (pjob != NULL)) { strcpy(jobid, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, NULL, 0); func(newreq); *pjob_ptr = svr_find_job((char *)jobid, TRUE); } else { free_br(newreq); if (pjob == NULL) *pjob_ptr = NULL; } return(rc); } /* END issue_signal() */
void *check_if_orphaned( void *vp) { char *rsv_id = (char *)vp; struct batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; if (is_orphaned(rsv_id) == TRUE) { preq = alloc_br(PBS_BATCH_DeleteReservation); preq->rq_extend = rsv_id; if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS); retries++; } /* unlock before the network transaction */ unlock_node(pnode, __func__, NULL, 0); if (handle >= 0) { issue_Drequest(handle, preq, release_req, 0); } else free_br(preq); } } else free(rsv_id); return(NULL); } /* END check_if_orphaned() */
END_TEST

/*
 * alloc_br() must return a request of the requested type whose connection
 * fields are -1, whose reply choice is BATCH_REPLY_CHOICE_NULL, whose
 * rq_noreply is FALSE and whose rq_time is set; free_br() must reach the
 * attribute-list free routine (tracked by free_attrlist_called).
 */
START_TEST(test_alloc_br)
  {
  batch_request *preq = alloc_br(PBS_BATCH_QueueJob);

  /* fail cleanly instead of dereferencing NULL if the allocation failed */
  fail_unless(preq != NULL);

  fail_unless(preq->rq_type == PBS_BATCH_QueueJob);
  fail_unless(preq->rq_conn == -1);
  fail_unless(preq->rq_orgconn == -1);
  fail_unless(preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_NULL);
  fail_unless(preq->rq_noreply == FALSE);
  fail_unless(preq->rq_time > 0);

  free_br(preq);
  fail_unless(free_attrlist_called > 0);
  }
/**
 * @brief
 *	qmove a job into a reservation
 *
 * @param[in]	presv - reservation structure
 *
 * @par
 *	The job named by the reservation's RESV_ATR_convert attribute is
 *	moved, via a locally built PBS_BATCH_MoveJob request, to the
 *	destination named by the reservation id with any ".server" suffix
 *	removed.  The request inherits permission, user and host from the
 *	reservation's own batch request (presv->ri_brp).  cnvrt_delete() is
 *	called afterwards regardless of the move's outcome.
 *
 * @return	int
 * @retval	0  : Success
 * @retval	-1 : Failure (the reservation is purged when the end-window
 *		     task cannot be generated, the job is not found, or the
 *		     request cannot be allocated)
 */
int
cnvrt_qmove(resc_resv *presv)
{
	int rc;
	struct job *pjob;
	struct work_task wtnew;
	char *q_job_id, *at;
	struct batch_request *reqcnvrt;

	if (gen_task_EndResvWindow(presv)) {
		(void)resv_purge(presv);
		return (-1);
	}

	pjob = find_job(presv->ri_wattr[(int)RESV_ATR_convert].at_val.at_str);
	if (pjob != NULL)
		q_job_id = pjob->ji_qs.ji_jobid;
	else {
		(void)resv_purge(presv);
		return (-1);
	}

	if ((reqcnvrt = alloc_br(PBS_BATCH_MoveJob)) == NULL) {
		(void)resv_purge(presv);
		return (-1);
	}

	reqcnvrt->rq_perm = (presv->ri_brp)->rq_perm;
	strcpy(reqcnvrt->rq_user, (presv->ri_brp)->rq_user);
	strcpy(reqcnvrt->rq_host, (presv->ri_brp)->rq_host);

	snprintf(reqcnvrt->rq_ind.rq_move.rq_jid, sizeof(reqcnvrt->rq_ind.rq_move.rq_jid), "%s", q_job_id);

	/*
	 * The destination is the reservation id up to, but not including,
	 * the first '.'.  A precision-limited format copies just that prefix
	 * without temporarily rewriting the reservation's own id string.
	 */
	at = strchr(presv->ri_qs.ri_resvID, (int)'.');
	if (at)
		snprintf(reqcnvrt->rq_ind.rq_move.rq_destin, sizeof(reqcnvrt->rq_ind.rq_move.rq_destin),
			"%.*s", (int)(at - presv->ri_qs.ri_resvID), presv->ri_qs.ri_resvID);
	else
		snprintf(reqcnvrt->rq_ind.rq_move.rq_destin, sizeof(reqcnvrt->rq_ind.rq_move.rq_destin),
			"%s", presv->ri_qs.ri_resvID);

	snprintf(pjob->ji_qs.ji_destin, PBS_MAXROUTEDEST, "%s", reqcnvrt->rq_ind.rq_move.rq_destin);
	rc = cnvrt_local_move(pjob, reqcnvrt);

	/* only wt_parm1 is meaningful; zero the rest so no field is left indeterminate */
	memset(&wtnew, 0, sizeof(wtnew));
	wtnew.wt_parm1 = (void *)presv;
	cnvrt_delete(&wtnew);

	if (rc != 0)
		return (-1);
	return (0);
}
/**
 * @brief
 *	Ask the MOM to checkpoint-and-hold a job by relaying a Hold Job
 *	request carrying a system hold (HOLD_s).
 *
 * @param[in]	pjob - job to checkpoint
 *
 * @par
 *	On a successful relay a job in JOB_STATE_TRANSIT is put back to
 *	RUNNING, the HASRUN / CHKPT / HASHOLD server flags are set and the job
 *	is saved.  post_chkpt is registered as the reply handler and the job
 *	pointer travels in the request's rq_extra.
 *
 * @return	int
 * @retval	0           : request relayed
 * @retval	PBSE_SYSTEM : request could not be allocated or the hold
 *			      attribute could not be encoded
 * @retval	-1          : relay_to_mom() failed
 *
 * NOTE(review): the request is not released here when encoding or the
 * relay fails - confirm whether it is released elsewhere.
 */
int
shutdown_preempt_chkpt(job *pjob)
{
	struct batch_request *phold;
	attribute temp;

	phold = alloc_br(PBS_BATCH_HoldJob);
	if (phold == NULL)
		return (PBSE_SYSTEM);

	/* temporary attribute holding the system-hold value to encode */
	temp.at_flags = ATR_VFLAG_SET;
	temp.at_type = job_attr_def[(int)JOB_ATR_hold].at_type;
	temp.at_user_encoded = NULL;
	temp.at_priv_encoded = NULL;
	temp.at_val.at_long = HOLD_s;

	phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);
	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&temp,
		&phold->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		NULL,
		ATR_ENCODE_CLIENT, NULL) < 0)
		return (PBSE_SYSTEM);

	phold->rq_extra = pjob;

	if (relay_to_mom(pjob, phold, post_chkpt) == 0) {
		if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
			svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);
		pjob->ji_qs.ji_svrflags |= (JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT | JOB_SVFLG_HASHOLD);
		pjob->ji_modified = 1;
		(void)job_save(pjob, SAVEJOB_QUICK);
		return (0);
	}

	/*
	 * This function never changes the job's own hold attribute, so there
	 * is nothing to restore on failure; just report it.
	 */
	return (-1);
}
END_TEST

/*
 * cpy_checkpoint() is expected to return NULL when either the batch
 * request or the job argument is NULL.
 */
START_TEST(cpy_checkpoint_test)
  {
  struct job *test_job = job_alloc();

  /* NULL request, valid job */
  struct batch_request *result = cpy_checkpoint(NULL,
      test_job,
      JOB_ATR_checkpoint_name,
      CKPT_DIR_IN);

  /* request type 0 is used in place of the commented-out constant */
  struct batch_request *initial = alloc_br(/*PBS_BATCH_CheckpointJob*/0);

  fail_unless(result == NULL, "NULL batch_request input fail");

  /* valid request, NULL job */
  result = cpy_checkpoint(initial,
      NULL,
      JOB_ATR_checkpoint_name,
      CKPT_DIR_IN);
  fail_unless(result == NULL, "NULL job input fail");

  /*TODO: add test for valid input, invalid dir value*/
  }
static void job_delete_nanny( struct work_task *pwt) { job *pjob; char *sigk = "SIGKILL"; struct batch_request *newreq; /* short-circuit if nanny isn't enabled */ if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long) { release_req(pwt); return; } pjob = (job *)pwt->wt_parm1; sprintf(log_buffer, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid); log_err(-1, "job nanny", log_buffer); /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL) { strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); strncpy(newreq->rq_ind.rq_signal.rq_signame, sigk, PBS_SIGNAMESZ); } issue_signal(pjob, sigk, post_job_delete_nanny, newreq); apply_job_delete_nanny(pjob, time_now + 60); return; } /* END job_delete_nanny() */
/**
 * @brief
 *	Ask the MOM to checkpoint-and-hold a job at server shutdown by
 *	relaying a Hold Job request that carries a system hold (HOLD_s).
 *
 * @param[in]	pjob - job to checkpoint
 *
 * @par
 *	After a successful relay a job in JOB_STATE_TRANSIT is put back to
 *	RUNNING, the substate becomes JOB_SUBSTATE_RERUN, the HASRUN and CHKPT
 *	server flags are set and the job is saved.  post_chkpt is registered
 *	as the reply handler.
 *
 * @return	int
 * @retval	0           : request relayed
 * @retval	PBSE_SYSTEM : request could not be allocated or the hold
 *			      attribute could not be encoded
 * @retval	-1          : relay_to_mom() failed
 *
 * NOTE(review): the request is not released here when encoding or the
 * relay fails - confirm whether it is released elsewhere.
 */
static int
shutdown_chkpt(job *pjob)
{
	attribute hold_attr;
	struct batch_request *hold_req;
	int encoded;

	hold_req = alloc_br(PBS_BATCH_HoldJob);
	if (hold_req == (struct batch_request *)0)
		return (PBSE_SYSTEM);

	/* temporary attribute holding the system-hold value to encode */
	hold_attr.at_flags = ATR_VFLAG_SET;
	hold_attr.at_type = job_attr_def[(int)JOB_ATR_hold].at_type;
	hold_attr.at_user_encoded = NULL;
	hold_attr.at_priv_encoded = NULL;
	hold_attr.at_val.at_long = HOLD_s;

	hold_req->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(hold_req->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(hold_req->rq_ind.rq_hold.rq_orig.rq_attr);

	encoded = job_attr_def[(int)JOB_ATR_hold].at_encode(&hold_attr,
		&hold_req->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		(char *)0,
		ATR_ENCODE_CLIENT, NULL);
	if (encoded < 0)
		return (PBSE_SYSTEM);

	if (relay_to_mom(pjob, hold_req, post_chkpt) != 0)
		return (-1);

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
		svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

	pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHKPT;
	pjob->ji_modified = 1;
	(void)job_save(pjob, SAVEJOB_QUICK);

	return (0);
}
/*
 * issue_signal - build a Signal Job request and relay it to the job's MOM.
 *
 * pjob    - job to signal
 * signame - name of the signal to send
 * func    - handler passed to relay_to_mom() for the MOM's reply
 * extra   - stored in the request's rq_extra
 *
 * Returns PBSE_SYSTEM when the request cannot be allocated, otherwise the
 * return code of relay_to_mom().
 */
int issue_signal(

  job  *pjob,
  char *signame, /* name of the signal to send */
  void (*func)(struct work_task *),
  void *extra)   /* extra parameter to be stored in sig request */

  {
  int                   rc;
  struct batch_request *newreq;

  /* build up a Signal Job batch request */
  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */
    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;

  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  /* strncpy() leaves the field unterminated when the name fills it, so
   * copy one byte less than the field holds and terminate explicitly */
  strncpy(newreq->rq_ind.rq_signal.rq_signame,
    signame,
    sizeof(newreq->rq_ind.rq_signal.rq_signame) - 1);
  newreq->rq_ind.rq_signal.rq_signame[sizeof(newreq->rq_ind.rq_signal.rq_signame) - 1] = '\0';

  rc = relay_to_mom(pjob, newreq, func);

  /* when MOM replies, we just free the request structure */

  return(rc);
  } /* END issue_signal() */
int stat_to_mom( char *job_id, struct stat_cntl *cntl) /* M */ { struct batch_request *newrq; int rc = PBSE_NONE; unsigned long addr; char log_buf[LOCAL_LOG_BUF_SIZE+1]; struct pbsnode *node; int handle = -1; unsigned long job_momaddr = -1; unsigned short job_momport = -1; char *job_momname = NULL; job *pjob = NULL; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) return(PBSE_JOBNOTFOUND); mutex_mgr job_mutex(pjob->ji_mutex, true); if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str)) { job_mutex.unlock(); snprintf(log_buf, sizeof(log_buf), "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); return PBSE_BAD_PARAMETER; } job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); job_mutex.unlock(); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL) { free(job_momname); return PBSE_MEM_MALLOC; } if (cntl->sc_type == 1) snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id); else newrq->rq_ind.rq_status.rq_id[0] = '\0'; /* get stat of all */ CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr); /* if MOM is down just return stale information */ addr = job_momaddr; node = tfind_addr(addr,job_momport,job_momname); free(job_momname); if (node == NULL) return PBSE_UNKNODE; if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING)) { if (LOGLEVEL >= 6) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "node '%s' is allocated to job but in state 'down'", node->nd_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf); } unlock_node(node, __func__, "no rely mom", LOGLEVEL); free_br(newrq); return PBSE_NORELYMOM; } /* get connection to MOM */ unlock_node(node, __func__, "before svr_connect", LOGLEVEL); 
handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL); if (handle >= 0) { if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE) { stat_update(newrq, cntl); } } else rc = PBSE_CONNECT; if (rc == PBSE_SYSTEM) rc = PBSE_MEM_MALLOC; free_br(newrq); return(rc); } /* END stat_to_mom() */
/**
 * @brief
 *	Read one batch request from a connection, authenticate and authorise
 *	it, then dispatch it.
 *
 * @param[in]	sfds - socket (connection table index) the request arrived on
 *
 * @par
 *	Steps, in order:
 *	1. look up the connection and allocate a request structure
 *	2. resolve the connecting host name into the request
 *	3. decode the request with dis_request_read()
 *	4. (server build) rename the user when the request arrived on a
 *	   scheduler socket
 *	5. handle PBS_BATCH_AuthExternal directly
 *	6. (server build) apply the host ACL, decide whether the request
 *	   came from another server or a client, authenticate clients and
 *	   assign permissions, and refuse job-creating requests while the
 *	   server state is beyond SV_STATE_RUN
 *	   (MOM build) accept only hosts found by addrfind() and grant the
 *	   full permission set including ATR_DFLAG_MOM
 *	7. hand the request to dispatch_request()
 *
 * @return	void
 */
void
process_request(int sfds)
{
	int rc;
	struct batch_request *request;
	conn_t *conn;

	time_now = time(NULL);

	conn = get_conn(sfds);

	if (!conn) {
		/* no connection entry: nothing can be replied to, just close the socket */
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "did not find socket in connection table");
#ifdef WIN32
		(void)closesocket(sfds);
#else
		(void)close(sfds);
#endif
		return;
	}

	if ((request = alloc_br(0)) == NULL) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "Unable to allocate request structure");
		close_conn(sfds);
		return;
	}
	request->rq_conn = sfds;

	/*
	 * Read in the request and decode it to the internal request structure.
	 */

	/* a non-zero return from get_connecthost() is treated as a bad host */
	if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME)) {

		(void)sprintf(log_buffer, "%s: %lu", msg_reqbadhost, get_connectaddr(sfds));
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer);
		req_reject(PBSE_BADHOST, 0, request);
		return;
	}

#ifndef PBS_MOM

	/* the server only decodes requests on connections marked FromClientDIS */
	if (conn->cn_active == FromClientDIS) {
		rc = dis_request_read(sfds, request);
	} else {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_req", "request on invalid type of connection");
		close_conn(sfds);
		free_br(request);
		return;
	}
#else	/* PBS_MOM */
	rc = dis_request_read(sfds, request);
#endif	/* PBS_MOM */

	if (rc == -1) {		/* End of file */
		close_client(sfds);
		free_br(request);
		return;

	} else if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) {

		/* read error, likely cannot send reply so just disconnect */

		/* ??? not sure about this ??? */

		close_client(sfds);
		free_br(request);
		return;

	} else if (rc > 0) {

		/*
		 * request didn't decode, either garbage or unknown
		 * request type, in either case, return reject-reply
		 */

		req_reject(rc, 0, request);
		close_client(sfds);
		return;
	}

#ifndef PBS_MOM
	/* If the request is coming on the socket we opened to the  */
	/* scheduler,  change the "user" from "root" to "Scheduler" */
	if (find_sched_from_sock(request->rq_conn) != NULL) {
		strncpy(request->rq_user, PBS_SCHED_DAEMON_NAME, PBS_MAXUSER);
		request->rq_user[PBS_MAXUSER] = '\0';
	}
#endif	/* PBS_MOM */

	(void)sprintf(log_buffer, msg_request, request->rq_type, request->rq_user, request->rq_host, sfds);
	log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer);

	/* is the request from a host acceptable to the server */
	if (request->rq_type == PBS_BATCH_AuthExternal) {
		/* 0 -> acknowledge, -2 -> not supported, anything else -> bad credential */
		rc = authenticate_external(conn, request);
		if (rc == 0)
			reply_ack(request);
		else if (rc == -2)
			req_reject(PBSE_NOSUP, 0, request);
		else
			req_reject(PBSE_BADCRED, 0, request);
		return;
	}

#ifndef PBS_MOM
	if (server.sv_attr[(int)SRV_ATR_acl_host_enable].at_val.at_long) {
		/* acl enabled, check it; always allow myself	*/

		struct pbsnode *isanode = NULL;
		if ((server.sv_attr[SRV_ATR_acl_host_moms_enable].at_flags & ATR_VFLAG_SET) &&
			(server.sv_attr[(int)SRV_ATR_acl_host_moms_enable].at_val.at_long == 1)) {
			/* a known, non-deleted node bypasses the host ACL check below */
			isanode = find_nodebyaddr(get_connectaddr(sfds));

			if ((isanode != NULL) && (isanode->nd_state & INUSE_DELETED))
				isanode = NULL;
		}

		if (isanode == NULL) {
			if ((acl_check(&server.sv_attr[(int)SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0) &&
				(strcasecmp(server_host, request->rq_host) != 0)) {
				req_reject(PBSE_BADHOST, 0, request);
				close_client(sfds);
				return;
			}
		}
	}

	/*
	 * determine source (user client or another server) of request.
	 * set the permissions granted to the client
	 */
	if (conn->cn_authen & PBS_NET_CONN_FROM_PRIVIL) {

		/* request came from another server */

		request->rq_fromsvr = 1;
		request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR |
			ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
			ATR_DFLAG_SvWR;

	} else {

		/* request not from another server */

		request->rq_fromsvr = 0;

		/*
		 * Client must be authenticated by a Authenticate User Request,
		 * if not, reject request and close connection.
		 * -- The following is retained for compat with old cmds --
		 * The exception to this is of course the Connect Request which
		 * cannot have been authenticated, because it contains the
		 * needed ticket; so trap it here.  Of course, there is no
		 * prior authentication on the Authenticate User request either,
		 * but it comes over a reserved port and appears from another
		 * server, hence is automatically granted authorization.
		 */

		if (request->rq_type == PBS_BATCH_Connect) {
			req_connect(request);
			return;
		}

		if ((conn->cn_authen & PBS_NET_CONN_AUTHENTICATED) ==0) {
			rc = PBSE_BADCRED;
		} else {
			rc = authenticate_user(request, conn);
		}
		if (rc != 0) {
			/* the connection is closed only for a bad credential */
			req_reject(rc, 0, request);
			if (rc == PBSE_BADCRED)
				close_client(sfds);
			return;
		}

		request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
	}

	/* if server shutting down, disallow new jobs and new running */

	if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long > SV_STATE_RUN) {
		switch (request->rq_type) {
			case PBS_BATCH_AsyrunJob:
			case PBS_BATCH_JobCred:
			case PBS_BATCH_UserCred:
			case PBS_BATCH_UserMigrate:
			case PBS_BATCH_MoveJob:
			case PBS_BATCH_QueueJob:
			case PBS_BATCH_RunJob:
			case PBS_BATCH_StageIn:
			case PBS_BATCH_jobscript:
				req_reject(PBSE_SVRDOWN, 0, request);
				return;
		}
	}

#else	/* THIS CODE FOR MOM ONLY */

	/* check connecting host against allowed list of ok clients */
	if (!addrfind(conn->cn_addr)) {
		req_reject(PBSE_BADHOST, 0, request);
		close_client(sfds);
		return;
	}

	request->rq_fromsvr = 1;
	request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR |
		ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
		ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
		ATR_DFLAG_SvWR | ATR_DFLAG_MOM;
#endif

	/*
	 * dispatch the request to the correct processing function.
	 * The processing function must call reply_send() to free
	 * the request structure.
	 */

	dispatch_request(sfds, request);
	return;
}
int copy_batchrequest( struct batch_request **newreq, struct batch_request *preq, int type, int jobid) { struct batch_request *request; svrattrl *pal = NULL; svrattrl *newpal = NULL; tlist_head *phead = NULL; char *ptr1; char *ptr2; char newjobname[PBS_MAXSVRJOBID+1]; request = alloc_br(type); if (request) { request->rq_type = preq->rq_type; request->rq_perm = preq->rq_perm; request->rq_fromsvr = preq->rq_fromsvr; request->rq_conn = preq->rq_conn; request->rq_orgconn = preq->rq_orgconn; request->rq_extsz = preq->rq_extsz; request->rq_time = preq->rq_time; strcpy(request->rq_user, preq->rq_user); strcpy(request->rq_host, preq->rq_host); request->rq_reply.brp_choice = preq->rq_reply.brp_choice; request->rq_noreply = preq->rq_noreply; /* we need to copy rq_extend if there is any data */ if (preq->rq_extend) { request->rq_extend = (char *)calloc(1, strlen(preq->rq_extend) + 1); if (request->rq_extend == NULL) { free_br(request); return(PBSE_SYSTEM); } strcpy(request->rq_extend, preq->rq_extend); } /* remember the batch_request we copied */ request->rq_extra = (void *)preq; switch(preq->rq_type) { /* This function was created for a modify arracy request (PBS_BATCH_ModifyJob) the preq->rq_ind structure was allocated in dis_request_read. If other BATCH types are needed refer to that function to see how the rq_ind structure was allocated and then copy it here. */ case PBS_BATCH_DeleteJob: case PBS_BATCH_HoldJob: case PBS_BATCH_CheckpointJob: case PBS_BATCH_ModifyJob: case PBS_BATCH_AsyModifyJob: /* based on how decode_DIS_Manage allocates data */ CLEAR_HEAD(request->rq_ind.rq_manager.rq_attr); phead = &request->rq_ind.rq_manager.rq_attr; request->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd; request->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype; /* If this is a job array it is possible we only have the array name and not the individual job. 
We need to find out what we have and modify the name if needed */ ptr1 = strstr(preq->rq_ind.rq_manager.rq_objname, "[]"); if ((ptr1) && (jobid != -1)) { ptr1++; strcpy(newjobname, preq->rq_ind.rq_manager.rq_objname); ptr2 = strstr(newjobname, "[]"); ptr2++; *ptr2 = 0; sprintf(request->rq_ind.rq_manager.rq_objname,"%s%d%s", newjobname, jobid, ptr1); } else strcpy(request->rq_ind.rq_manager.rq_objname, preq->rq_ind.rq_manager.rq_objname); /* copy the attribute list */ pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr); while(pal != NULL) { newpal = (svrattrl *)calloc(1, pal->al_tsize + 1); if (!newpal) { free_br(request); return(PBSE_SYSTEM); } CLEAR_LINK(newpal->al_link); newpal->al_atopl.next = 0; newpal->al_tsize = pal->al_tsize + 1; newpal->al_nameln = pal->al_nameln; newpal->al_flags = pal->al_flags; newpal->al_atopl.name = (char *)newpal + sizeof(svrattrl); strcpy(newpal->al_atopl.name, pal->al_atopl.name); newpal->al_nameln = pal->al_nameln; newpal->al_atopl.resource = newpal->al_atopl.name + newpal->al_nameln; if (pal->al_atopl.resource != NULL) strcpy(newpal->al_atopl.resource, pal->al_atopl.resource); newpal->al_rescln = pal->al_rescln; newpal->al_atopl.value = newpal->al_atopl.name + newpal->al_nameln + newpal->al_rescln; strcpy(newpal->al_atopl.value, pal->al_atopl.value); newpal->al_valln = pal->al_valln; newpal->al_atopl.op = pal->al_atopl.op; pal = (struct svrattrl *)GET_NEXT(pal->al_link); } break; case PBS_BATCH_SignalJob: strcpy(request->rq_ind.rq_signal.rq_jid, preq->rq_ind.rq_signal.rq_jid); strcpy(request->rq_ind.rq_signal.rq_signame, preq->rq_ind.rq_signal.rq_signame); request->rq_extra = strdup(preq->rq_extra); break; case PBS_BATCH_MessJob: strcpy(request->rq_ind.rq_message.rq_jid, preq->rq_ind.rq_message.rq_jid); request->rq_ind.rq_message.rq_file = preq->rq_ind.rq_message.rq_file; strcpy(request->rq_ind.rq_message.rq_text, preq->rq_ind.rq_message.rq_text); break; default: break; } if ((phead != NULL) && (newpal != NULL)) 
append_link(phead, &newpal->al_link, newpal); *newreq = request; return(0); } else return(PBSE_SYSTEM); }
void mom_cleanup_checkpoint_hold( struct work_task *ptask) { int rc = 0; job *pjob; char *jobid; struct batch_request *preq; char log_buf[LOCAL_LOG_BUF_SIZE]; time_t time_now = time(NULL); jobid = (char *)ptask->wt_parm1; free(ptask->wt_mutex); free(ptask); if (jobid == NULL) { log_err(ENOMEM, __func__, "Cannot allocate memory"); return; } pjob = svr_find_job(jobid, FALSE); if (pjob == NULL) { if (LOGLEVEL >= 3) { sprintf(log_buf, "%s:failed to find job\n", __func__); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobid,log_buf); } free(jobid); return; } free(jobid); if (LOGLEVEL >= 7) { sprintf(log_buf, "checking mom cleanup job state is %s-%s\n", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate]); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); } /* * if the job is no longer running then we have recieved the job obit * and need to request the mom to clean up after the job */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { if ((preq = alloc_br(PBS_BATCH_DeleteJob)) == NULL) { log_err(-1, __func__, "unable to allocate DeleteJob request - big trouble!"); } else { strcpy(preq->rq_ind.rq_delete.rq_objname, pjob->ji_qs.ji_jobid); /* The preq is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ if ((rc = relay_to_mom(&pjob, preq, release_req)) != 0) { if (pjob != NULL) { snprintf(log_buf,sizeof(log_buf), "Unable to relay information to mom for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if ((LOGLEVEL >= 7) && (pjob != NULL)) { log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "requested mom cleanup"); } } } else { set_task(WORK_Timed, time_now + 1, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE); } if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END mom_cleanup_checkpoint_hold() */
int stat_to_mom( char *job_id, struct stat_cntl *cntl) /* M */ { struct batch_request *newrq; int rc = PBSE_NONE; unsigned long addr; char log_buf[LOCAL_LOG_BUF_SIZE+1]; struct pbsnode *node; int handle = -1; unsigned long job_momaddr = -1; unsigned short job_momport = -1; char *job_momname = NULL; job *pjob = NULL; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) return PBSE_JOBNOTFOUND; job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL) { free(job_momname); return PBSE_MEM_MALLOC; } if (cntl->sc_type == 1) strcpy(newrq->rq_ind.rq_status.rq_id, job_id); else newrq->rq_ind.rq_status.rq_id[0] = '\0'; /* get stat of all */ CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr); /* if MOM is down just return stale information */ addr = job_momaddr; node = tfind_addr(addr,job_momport,job_momname); free(job_momname); if (node == NULL) return PBSE_UNKNODE; if (node->nd_state & INUSE_DOWN) { if (LOGLEVEL >= 6) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "node '%s' is allocated to job but in state 'down'", node->nd_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf); } unlock_node(node, __func__, "no rely mom", LOGLEVEL); free_br(newrq); return PBSE_NORELYMOM; } /* get connection to MOM */ unlock_node(node, __func__, "before svr_connect", LOGLEVEL); handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL, ToServerDIS); /* Unlock job here */ if (handle >= 0) { if ((rc = issue_Drequest(handle, newrq)) == PBSE_NONE) { stat_update(newrq, cntl); } } else rc = PBSE_CONNECT; if (rc == PBSE_SYSTEM) rc = PBSE_MEM_MALLOC; free_br(newrq); return rc; } /* END stat_to_mom() */
int process_request( struct tcp_chan *chan) /* file descriptor (socket) to get request */ { int rc = PBSE_NONE; struct batch_request *request = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; long acl_enable = FALSE; long state = SV_STATE_DOWN; time_t time_now = time(NULL); int free_request = TRUE; char tmpLine[MAXLINE]; char *auth_err = NULL; enum conn_type conn_active; unsigned short conn_socktype; unsigned short conn_authen; unsigned long conn_addr; int sfds = chan->sock; pthread_mutex_lock(svr_conn[sfds].cn_mutex); conn_active = svr_conn[sfds].cn_active; conn_socktype = svr_conn[sfds].cn_socktype; conn_authen = svr_conn[sfds].cn_authen; conn_addr = svr_conn[sfds].cn_addr; svr_conn[sfds].cn_lasttime = time_now; pthread_mutex_unlock(svr_conn[sfds].cn_mutex); if ((request = alloc_br(0)) == NULL) { snprintf(tmpLine, sizeof(tmpLine), "cannot allocate memory for request from %lu", conn_addr); req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_SYSTEM; goto process_request_cleanup; } request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. 
*/ if (conn_active == FromClientDIS || conn_active == ToServerDIS) { #ifdef ENABLE_UNIX_SOCKETS if ((conn_socktype & PBS_SOCK_UNIX) && (conn_authen != PBS_NET_CONN_AUTHENTICATED)) { /* get_creds interestingly always returns 0 */ get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname); } #endif /* END ENABLE_UNIX_SOCKETS */ rc = dis_request_read(chan, request); } else { char out[80]; snprintf(tmpLine, MAXLINE, "request on invalid type of connection: %d, sock type: %d, from address %s", conn_active,conn_socktype, netaddr_long(conn_addr, out)); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", tmpLine); snprintf(tmpLine, sizeof(tmpLine), "request on invalid type of connection (%d) from %s", conn_active, netaddr_long(conn_addr, out)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (rc == -1) { /* FAILURE */ /* premature end of file */ rc = PBSE_PREMATURE_EOF; goto process_request_cleanup; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? 
*/ goto process_request_cleanup; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); free_request = FALSE; goto process_request_cleanup; } if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0) { sprintf(log_buf, "%s: %lu", pbse_to_txt(PBSE_BADHOST), conn_addr); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", conn_addr); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (LOGLEVEL >= 1) { sprintf(log_buf, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, sfds); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf); } /* is the request from a host acceptable to the server */ if (conn_socktype & PBS_SOCK_UNIX) { strcpy(request->rq_host, server_name); } get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable); if (acl_enable) { /* acl enabled, check it; always allow myself and nodes */ struct array_strings *pas = NULL; struct pbsnode *isanode; get_svr_attr_arst(SRV_ATR_acl_hosts, &pas); isanode = PGetNodeFromAddr(conn_addr); if ((isanode == NULL) && (strcmp(server_host, request->rq_host) != 0) && (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0)) { char tmpLine[MAXLINE]; snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s", request->rq_host); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (isanode != NULL) unlock_node(isanode, "process_request", NULL, LOGLEVEL); } /* * determine source (user client or another server) of request. 
* set the permissions granted to the client */ if (conn_authen == PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ conn_credent[sfds].timestamp = time_now; request->rq_fromsvr = 0; /* * Client must be authenticated by an Authenticate User Request, if not, * reject request and close connection. -- The following is retained for * compat with old cmds -- The exception to this is of course the Connect * Request which cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no prior * authentication on the Authenticate User request either, but it comes * over a reserved port and appears from another server, hence is * automatically granted authentication. * * The above is only true with inet sockets. With unix domain sockets, the * user creds were read before the first dis_request_read call above. * We automatically granted authentication because we can trust the socket * creds. 
Authorization is still granted in svr_get_privilege below */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); if (conn_socktype == PBS_SOCK_INET) { rc = PBSE_IVALREQ; req_reject(rc, 0, request, NULL, NULL); free_request = FALSE; goto process_request_cleanup; } } if (conn_socktype & PBS_SOCK_UNIX) { pthread_mutex_lock(svr_conn[sfds].cn_mutex); svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; pthread_mutex_unlock(svr_conn[sfds].cn_mutex); } if (ENABLE_TRUSTED_AUTH == TRUE ) rc = PBSE_NONE; /* bypass the authentication of the user--trust the client completely */ else if (munge_on) { /* If munge_on is true we will validate the connection now */ if (request->rq_type == PBS_BATCH_AltAuthenUser) { rc = req_altauthenuser(request); free_request = FALSE; goto process_request_cleanup; } else { rc = authenticate_user(request, &conn_credent[sfds], &auth_err); } } else if (conn_authen != PBS_NET_CONN_AUTHENTICATED) /* skip checking user if we did not get an authenticated credential */ rc = PBSE_BADCRED; else rc = authenticate_user(request, &conn_credent[sfds], &auth_err); if (rc != 0) { req_reject(rc, 0, request, NULL, auth_err); if (auth_err != NULL) free(auth_err); free_request = FALSE; goto process_request_cleanup; } /* * pbs_mom and checkpoint restart scripts both need the authority to do * alters and releases on checkpointable jobs. Allow manager permission * for root on the jobs execution node. 
*/ if (((request->rq_type == PBS_BATCH_ModifyJob) || (request->rq_type == PBS_BATCH_ReleaseJob)) && (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0)) { job *pjob; char *dptr; int skip = FALSE; char short_host[PBS_MAXHOSTNAME+1]; /* make short host name */ strcpy(short_host, request->rq_host); if ((dptr = strchr(short_host, '.')) != NULL) { *dptr = '\0'; } if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0) { if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) && ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) && (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL)) { request->rq_perm = svr_get_privilege(request->rq_user, server_host); skip = TRUE; } } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } if (!skip) { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } else { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */ /* if server shutting down, disallow new jobs and new running */ get_svr_attr_l(SRV_ATR_State, &state); if (state > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL); rc = PBSE_SVRDOWN; free_request = FALSE; goto process_request_cleanup; /*NOTREACHED*/ break; } } /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. 
*/ rc = dispatch_request(sfds, request); return(rc); process_request_cleanup: if (free_request == TRUE) free_br(request); return(rc); } /* END process_request() */
void req_deletejob( struct batch_request *preq) /* I */ { job *pjob; struct work_task *pwtold; struct work_task *pwtnew; struct work_task *pwtcheck; int rc; char *sigt = "SIGTERM"; char *Msg = NULL; /* check if we are getting a purgecomplete from scheduler */ if ((preq->rq_extend != NULL) && !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP))) { /* * purge_completed_jobs will respond with either an ack or reject */ purge_completed_jobs(preq); return; } /* The way this is implemented, if the user enters the command "qdel -p <jobid>", * they can then delete jobs other than their own since the authorization * checks are made below in chk_job_request. This should probably be fixed. */ if (forced_jobpurge(preq) != 0) { return; } /* NOTE: should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */ /* NYI */ pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq); if (pjob == NULL) { /* NOTE: chk_job_request() will issue req_reject() */ return; } if (preq->rq_extend != NULL) { if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) && strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) && strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr))) { /* have text message in request extension, add it */ Msg = preq->rq_extend; /* * Message capability is only for operators and managers. * Check if request is authorized */ if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0) { req_reject(PBSE_PERM, 0, preq, NULL, "must have operator or manager privilege to use -m parameter"); return; } } } if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* * Find pid of router from existing work task entry, * then establish another work task on same child. 
* Next, signal the router and wait for its completion; */ pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask); while (pwtold != NULL) { if ((pwtold->wt_type == WORK_Deferred_Child) || (pwtold->wt_type == WORK_Deferred_Cmp)) { pwtnew = set_task( pwtold->wt_type, pwtold->wt_event, post_delete_route, preq); if (pwtnew != NULL) { /* * reset type in case the SIGCHLD came * in during the set_task; it makes * sure that next_task() will find the * new entry. */ pwtnew->wt_type = pwtold->wt_type; pwtnew->wt_aux = pwtold->wt_aux; kill((pid_t)pwtold->wt_event, SIGTERM); pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT; return; /* all done for now */ } else { req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL); return; } } pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj); } /* should never get here ... */ log_err(-1, "req_delete", "Did not find work task for router"); req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL); return; } if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 || pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 ) { /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */ /* retry in one second */ /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the job is being requeued. Wait until finished */ static time_t cycle_check_when = 0; static char cycle_check_jid[PBS_MAXSVRJOBID + 1]; if (cycle_check_when != 0) { if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) && (time_now - cycle_check_when > 10)) { /* state not updated after 10 seconds */ /* did the mom ever get it? delete it anyways... 
*/ cycle_check_jid[0] = '\0'; cycle_check_when = 0; goto jump; } if (time_now - cycle_check_when > 20) { /* give up after 20 seconds */ cycle_check_jid[0] = '\0'; cycle_check_when = 0; } } /* END if (cycle_check_when != 0) */ if (cycle_check_when == 0) { /* new PRERUN job located */ cycle_check_when = time_now; strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid); } sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request"); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); pwtnew = set_task( WORK_Timed, time_now + 1, post_delete_route, preq); if (pwtnew == 0) req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL); return; } /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */ jump: /* * Log delete and if requesting client is not job owner, send mail. */ sprintf(log_buffer, "requestor=%s@%s", preq->rq_user, preq->rq_host); /* NOTE: should annotate accounting record with extend message (NYI) */ account_record(PBS_ACCT_DEL, pjob, log_buffer); sprintf(log_buffer, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); /* NOTE: should incorporate job delete message */ if (Msg != NULL) { /* have text message in request extension, add it */ strcat(log_buffer, "\n"); strcat(log_buffer, Msg); } if ((svr_chk_owner(preq, pjob) != 0) && !has_job_delete_nanny(pjob)) { /* only send email if owner did not delete job and job deleted has not been previously attempted */ svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer); /* * If we sent mail and already sent the extra message * then reset message so we don't trigger a redundant email * in job_abt() */ if (Msg != NULL) { Msg = NULL; } } if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, change restart comment if failed */ change_restart_comment_if_needed(pjob); } if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * setup a nanny task to make sure the 
job is actually deleted (see the * comments at job_delete_nanny()). */ if (has_job_delete_nanny(pjob)) { req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress"); return; } apply_job_delete_nanny(pjob, time_now + 60); /* check if we are getting a asynchronous delete */ if ((preq->rq_extend != NULL) && !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC))) { struct batch_request *preq_tmp = NULL; /* * Respond with an ack now instead of after MOM processing * Create a new batch request and fill it in. It will be freed by reply_ack */ snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously"); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer); preq_tmp = alloc_br(PBS_BATCH_DeleteJob); preq_tmp->rq_perm = preq->rq_perm; preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd; preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype; preq_tmp->rq_fromsvr = preq->rq_fromsvr; preq_tmp->rq_extsz = preq->rq_extsz; preq_tmp->rq_conn = preq->rq_conn; memcpy(preq_tmp->rq_ind.rq_manager.rq_objname, preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1); memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1); memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1); reply_ack(preq_tmp); preq->rq_noreply = TRUE; /* set for no more replies */ } /* make a cleanup task if set */ if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) && (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0)) { pwtcheck = set_task( WORK_Timed, time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long, ensure_deleted, preq); if (pwtcheck != NULL) append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck); } /* * Send signal request to MOM. The server will automagically * pick up and "finish" off the client request when MOM replies. 
*/ if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq))) { /* cant send to MOM */ req_reject(rc, 0, preq, NULL, NULL); } /* normally will ack reply when mom responds */ sprintf(log_buffer, msg_delrunjobsig, sigt); LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); return; } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */ /* make a cleanup task if set */ if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) && (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0)) { pwtcheck = set_task( WORK_Timed, time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long, ensure_deleted, preq); if (pwtcheck != NULL) append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck); } /* if configured, and this job didn't have a slot limit hold, free a job * held with the slot limit hold */ if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) && ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE)) { if ((pjob->ji_arraystruct != NULL) && (pjob->ji_is_array_template == FALSE)) { int i; int newstate; int newsub; job *tmp; job_array *pa = pjob->ji_arraystruct; for (i = 0; i < pa->ai_qs.array_size; i++) { if (pa->jobs[i] == NULL) continue; tmp = (job *)pa->jobs[i]; if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) { tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l; if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0) { tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET; } svr_evaljobstate(tmp, &newstate, &newsub, 1); svr_setjobstate(tmp, newstate, newsub); job_save(tmp, SAVEJOB_FULL, 0); break; } } } } /* END MoabArrayCompatible check */ if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, do end job processing */ svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING); pjob->ji_momhandle = -1; /* force new connection */ pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob); if (pwtnew) { 
append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew); } } else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0) { /* job has staged-in file, should remove them */ remove_stagein(pjob); job_abt(&pjob, Msg); } else { /* * the job is not transitting (though it may have been) and * is not running, so put in into a complete state. */ struct work_task *ptask; struct pbs_queue *pque; int KeepSeconds = 0; svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE); if ((pque = pjob->ji_qhdr) && (pque != NULL)) { pque->qu_numcompleted++; } KeepSeconds = attr_ifelse_long( &pque->qu_attr[QE_ATR_KeepCompleted], &server.sv_attr[SRV_ATR_KeepCompleted], 0); ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob); if (ptask != NULL) { append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask); } } /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */ reply_ack(preq); return; } /* END req_deletejob() */
int set_node_power_state( struct pbsnode **ppNode, unsigned short newState) { struct pbsnode *pNode = *ppNode; if (pNode->nd_addrs == NULL) { return PBSE_BAD_PARAMETER; } if (newState == POWER_STATE_RUNNING) { static std::string interface; static unsigned char mac_addr[6]; if (interface.length() == 0) { if (!getMacAddr(interface,mac_addr)) { return PBSE_SYSTEM; } } int sock; if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0) { return PBSE_SYSTEM; } unsigned char outpack[1000]; memcpy(outpack+6,mac_addr,6); memcpy(outpack,pNode->nd_mac_addr,6); outpack[12] = 0x08; outpack[13] = 0x42; int offset = 14; memset(outpack + offset,0xff,6); offset += 6; for (int i = 0;i < 16;i++) { memcpy(outpack + offset,pNode->nd_mac_addr,6); offset += 6; } int one = 1; if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0) { close(sock); return PBSE_SYSTEM; } struct sockaddr whereto; whereto.sa_family = 0; snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str()); if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0) { close(sock); return PBSE_SYSTEM; } close(sock); return PBSE_NONE; } if (pNode->nd_job_usages.size() != 0) { //Can't change the power state on a node with running jobs. 
return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING; } struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState); if (request == NULL) { return PBSE_SYSTEM; } request->rq_ind.rq_powerstate = newState; pNode->nd_power_state_change_time = time(NULL); snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name); std::string hostname(request->rq_host); int rc = PBSE_NONE; { int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if(handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); *ppNode = NULL; return local_errno; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); *ppNode = NULL; rc = issue_Drequest(handle, request,true); if(rc == PBSE_NONE) { rc = request->rq_reply.brp_code; if(rc < 0) rc = -rc; } } pNode = find_nodebyname(hostname.c_str()); *ppNode = pNode; if ((rc == PBSE_NONE)&&(pNode != NULL)) { pNode->nd_power_state = newState; } return(rc); }
/*
 * issue_signal - build a SignalJob batch request for a job and relay it to
 * the job's mom.
 *
 * On a successful relay (and a job pointer that is still valid) the job is
 * unlocked, func() is called with the request, and *pjob_ptr is refreshed
 * through svr_find_job().  When the relay does not succeed and extend equals
 * RERUNFORCE, the job is re-queued locally instead.  In every other case the
 * request is released with free_br().
 *
 * Returns PBSE_NONE, PBSE_SYSTEM, PBSE_JOBNOTFOUND, or the code returned by
 * relay_to_mom().
 */
int issue_signal(

  job       **pjob_ptr, /* in/out: the job to signal; refreshed or set to NULL on some paths */
  const char *signame, /* name of the signal to send */
  void      (*func)(struct batch_request *), /* called with the request after a relay or a forced requeue */
  void       *extra,   /* extra parameter to be stored in sig request */
  char       *extend)  /* Parameter to put in extended part of request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;

  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  /* extend is stored by pointer, not copied */
  newreq->rq_extra = extra;
  newreq->rq_extend = extend;

  if (extend != NULL)
    {
    newreq->rq_extsz = strlen(extend);
    }

  /* remember the id: pjob may come back NULL from relay_to_mom */
  strcpy(jobid, pjob->ji_qs.ji_jobid);

  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success)
   *
   * NOTE(review): the code below still passes newreq to func() after a
   * successful relay and calls free_br(newreq) in the final else branch,
   * which does not match the statement above - confirm who owns newreq. */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    func(newreq);

    /* the job was unlocked while func() ran, so look it up again by id */
    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else if ((extend != NULL) &&
           (!strcmp(extend, RERUNFORCE)))
    {
    if (pjob == NULL)
      {
      *pjob_ptr = svr_find_job((char *)jobid, TRUE);
      pjob = *pjob_ptr;
      }

    /* The job state is normally set when the obit arrives. But since the
       MOM is not responding we need to set the state here */
    if (pjob != NULL)
      {
      /* Rerunning job, if not checkpointed, clear "resources_used and requeue job */
      if ((pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE)) == 0)
        {
        job_attr_def[JOB_ATR_resc_used].at_free(&pjob->ji_wattr[JOB_ATR_resc_used]);
        }
      else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        {
        /* non-migratable checkpoint (cray), leave there */
        /* and just requeue the job */
        /* NOTE(review): this early return neither calls func(newreq) nor
           free_br(newreq), and returns PBSE_SYSTEM after requeuing - confirm
           that is intended. */
        rel_resc(pjob);

        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

        pjob->ji_momhandle = -1;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);

        return(PBSE_SYSTEM);
        }

      rel_resc(pjob); /* free resc assigned to job */

      /* Now re-queue the job */
      pjob->ji_modified = 1; /* force full job save */

      pjob->ji_momhandle = -1;

      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

      /* the job is unlocked here and *pjob_ptr is not refreshed afterwards */
      unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

      func(newreq);

      rc = PBSE_NONE;
      }
    else
      rc = PBSE_JOBNOTFOUND;  /* the job vanished while the relay was in flight */
    }
  else
    {
    free_br(newreq);

    /* tell the caller the job pointer is no longer valid */
    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */
int set_node_power_state(struct pbsnode *pNode,struct pbsnode *newNode) { if(pNode->nd_addrs == NULL) { return PBSE_BAD_PARAMETER; } if(newNode->nd_power_state == POWER_STATE_RUNNING) { newNode->nd_power_state = pNode->nd_power_state; //Don't change the power state here. //Let the mom update change the state //back to running. static std::string interface; static unsigned char mac_addr[6]; if(interface.length() == 0) { if(!getMacAddr(interface,mac_addr)) { return PBSE_SYSTEM; } } int sock; if((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0) { return PBSE_SYSTEM; } unsigned char outpack[1000]; memcpy(outpack+6,mac_addr,6); memcpy(outpack,pNode->nd_mac_addr,6); outpack[12] = 0x08; outpack[13] = 0x42; int offset = 14; memset(outpack + offset,0xff,6); offset += 6; for(int i = 0;i < 16;i++) { memcpy(outpack + offset,pNode->nd_mac_addr,6); offset += 6; } int one = 1; if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0) { close(sock); return PBSE_SYSTEM; } struct sockaddr whereto; whereto.sa_family = 0; strcpy(whereto.sa_data,interface.c_str()); if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0) { close(sock); return PBSE_SYSTEM; } close(sock); return PBSE_NONE; } if(pNode->nd_job_usages.size() != 0) { //Can't change the power state on a node with running jobs. return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING; } struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState); if(request == NULL) { return PBSE_SYSTEM; } request->rq_ind.rq_powerstate = newNode->nd_power_state; newNode->nd_power_state_change_time = time(NULL); strncpy(request->rq_host,pNode->nd_name,sizeof(request->rq_host)); int rc = enqueue_threadpool_request(send_power_state_to_mom,(void *)request,task_pool); return(rc); }
/*
 * shutdown_checkpoint - ask the mom to checkpoint-and-hold a job because the
 * server is shutting down.
 *
 * Builds a HoldJob request carrying a HOLD_s hold attribute, relays it to
 * the job's mom with post_checkpoint as the completion handler, then marks
 * the job as rerun-with-checkpoint-file and saves it.
 *
 * @param pjob  the running, checkpointable job
 * @return 0 on success, PBSE_SYSTEM when the request cannot be built,
 *         -1 when it cannot be relayed to the mom.
 */
static int shutdown_checkpoint(

  job *pjob)

  {
  struct batch_request *phold;
  attribute             temp;

  phold = alloc_br(PBS_BATCH_HoldJob);

  if (phold == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* a temporary attribute holding the system hold, encoded into the request */
  temp.at_flags = ATR_VFLAG_SET;
  temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
  temp.at_val.at_long = HOLD_s;

  phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);

  if (job_attr_def[(int)JOB_ATR_hold].at_encode(
        &temp,
        &phold->rq_ind.rq_hold.rq_orig.rq_attr,
        job_attr_def[(int)JOB_ATR_hold].at_name,
        NULL,
        ATR_ENCODE_CLIENT) < 0)
    {
    /* nothing else has seen the request yet; release it before failing */
    free_br(phold);

    return(PBSE_SYSTEM);
    }

  if (relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, phold, post_checkpoint) != 0)
    {
    /* FAILURE */
    /* NOTE(review): ownership of phold after a failed relay_to_mom() is not
       visible here - confirm whether it must be freed on this path. */

    return(-1);
    }

  pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

  pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

  if (LOGLEVEL >= 1)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "shutting down with active checkpointable job");
    }

  job_save(pjob, SAVEJOB_QUICK);

  return(0);
  }  /* END shutdown_checkpoint() */
/*
 * shutdown_checkpoint - ask the mom to checkpoint-and-hold a job because the
 * server is shutting down.
 *
 * A HoldJob request carrying a HOLD_s hold attribute is relayed to the mom.
 * When the job is still valid afterwards it is marked as
 * rerun-with-checkpoint-file, saved and unlocked; post_checkpoint() is then
 * run on the request and *pjob_ptr is refreshed by job id.
 *
 * Returns PBSE_NONE on success, PBSE_SYSTEM when the request cannot be
 * built, and -1 when it cannot be relayed.
 */
static int shutdown_checkpoint(

  job **pjob_ptr)

  {
  char                  saved_id[PBS_MAXSVRJOBID + 1];
  pbs_attribute         hold_attr;
  struct batch_request *hold_req;
  job                  *pjob = *pjob_ptr;

  hold_req = alloc_br(PBS_BATCH_HoldJob);

  if (hold_req == NULL)
    return(PBSE_SYSTEM);

  /* request header: manager permission, addressed to this job */
  hold_req->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
  strcpy(hold_req->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
  CLEAR_HEAD(hold_req->rq_ind.rq_hold.rq_orig.rq_attr);

  /* the system hold, encoded into the request's attribute list */
  hold_attr.at_flags       = ATR_VFLAG_SET;
  hold_attr.at_type        = job_attr_def[JOB_ATR_hold].at_type;
  hold_attr.at_val.at_long = HOLD_s;

  if (job_attr_def[JOB_ATR_hold].at_encode(
        &hold_attr,
        &hold_req->rq_ind.rq_hold.rq_orig.rq_attr,
        job_attr_def[JOB_ATR_hold].at_name,
        NULL,
        ATR_ENCODE_CLIENT,
        0) < 0)
    {
    free_br(hold_req);
    return(PBSE_SYSTEM);
    }

  if (relay_to_mom(&pjob, hold_req, NULL) != PBSE_NONE)
    {
    /* FAILURE */
    free_br(hold_req);
    return(-1);
    }

  /* an empty id means the job pointer did not survive the relay */
  saved_id[0] = '\0';

  if (pjob != NULL)
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

    if (LOGLEVEL >= 1)
      {
      log_event(
        PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "shutting down with active checkpointable job");
      }

    job_save(pjob, SAVEJOB_QUICK, 0);

    strcpy(saved_id, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  /* the relay succeeded if we got this far, so always post-process */
  post_checkpoint(hold_req);

  if (saved_id[0] != '\0')
    *pjob_ptr = svr_find_job(saved_id, TRUE);

  return(PBSE_NONE);
  }  /* END shutdown_checkpoint() */
void *mom_process_request( void *sock_num) /* file descriptor (socket) to get request */ { int rc; struct batch_request *request = NULL; int sfds = *(int *)sock_num; struct tcp_chan *chan = NULL; time_now = time(NULL); if ((request = alloc_br(0)) == NULL) { mom_close_client(sfds); return NULL; } request->rq_conn = sfds; if ((chan = DIS_tcp_setup(sfds)) == NULL) { mom_close_client(sfds); free_br(request); return NULL; } /* Read in the request and decode it to the internal request structure. */ rc = dis_request_read(chan, request); if (rc == -1) { /* FAILURE */ /* premature end of file */ mom_close_client(chan->sock); free_br(request); DIS_tcp_cleanup(chan); return NULL; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ mom_close_client(chan->sock); free_br(request); DIS_tcp_cleanup(chan); return NULL; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (get_connecthost(chan->sock, request->rq_host, PBS_MAXHOSTNAME) != 0) { char tmpLine[MAXLINE]; sprintf(log_buffer, "%s: %lu", pbse_to_txt(PBSE_BADHOST), get_connectaddr(chan->sock,FALSE)); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buffer); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", get_connectaddr(chan->sock,FALSE)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (LOGLEVEL >= 1) { sprintf( log_buffer, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, chan->sock); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buffer); } /* is the request from a host acceptable to the server */ { 
/*extern tree *okclients; */ extern void mom_server_update_receive_time_by_ip(u_long ipaddr, const char *cmd); /* check connecting host against allowed list of ok clients */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "request type %s from host %s received", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } if (!AVL_is_in_tree_no_port_compare(svr_conn[chan->sock].cn_addr, 0, okclients)) { sprintf(log_buffer, "request type %s from host %s rejected (host not authorized)", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); req_reject(PBSE_BADHOST, 0, request, NULL, "request not authorized"); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (LOGLEVEL >= 3) { sprintf(log_buffer, "request type %s from host %s allowed", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } mom_server_update_receive_time_by_ip(svr_conn[chan->sock].cn_addr, reqtype_to_txt(request->rq_type)); } /* END BLOCK */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ mom_dispatch_request(chan->sock, request); DIS_tcp_cleanup(chan); return NULL; } /* END mom_process_request() */
/*
 * duplicate_request - make a copy of a batch request.
 *
 * @param preq       the request to copy
 * @param job_index  when not -1 and the object name contains "[]", the copy
 *                   is addressed to array member "name[job_index]..."
 * @return a newly allocated request, or NULL when allocation or copying the
 *         attribute list fails (nothing is leaked in that case).
 *
 * Only the request types listed in the switch have their type-specific
 * payload copied; for every other type just the common header fields, the
 * object name and rq_extend are duplicated.
 */
batch_request *duplicate_request(batch_request *preq, int job_index)
  {
  batch_request *preq_tmp = alloc_br(preq->rq_type);
  char          *ptr1;
  char          *ptr2;
  char           newjobname[PBS_MAXSVRJOBID+1];

  if (preq_tmp == NULL)
    return(NULL);

  preq_tmp->rq_perm = preq->rq_perm;
  preq_tmp->rq_fromsvr = preq->rq_fromsvr;
  preq_tmp->rq_conn = preq->rq_conn;
  preq_tmp->rq_time = preq->rq_time;
  preq_tmp->rq_orgconn = preq->rq_orgconn;

  memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
    preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);

  strcpy(preq_tmp->rq_user, preq->rq_user);
  strcpy(preq_tmp->rq_host, preq->rq_host);

  if (preq->rq_extend != NULL)
    preq_tmp->rq_extend = strdup(preq->rq_extend);

  switch (preq->rq_type)
    {
    /* This function was created for a modify array request (PBS_BATCH_ModifyJob)
       the preq->rq_ind structure was allocated in dis_request_read. If other
       BATCH types are needed refer to that function to see how the rq_ind structure
       was allocated and then copy it here. */
    case PBS_BATCH_DeleteJob:
    case PBS_BATCH_HoldJob:
    case PBS_BATCH_CheckpointJob:
    case PBS_BATCH_ModifyJob:
    case PBS_BATCH_AsyModifyJob:

      /* based on how decode_DIS_Manage allocates data */
      CLEAR_HEAD(preq_tmp->rq_ind.rq_manager.rq_attr);

      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;

      if (job_index != -1)
        {
        /* If this is a job array it is possible we only have the array name
           and not the individual job. We need to find out what we have and
           modify the name if needed */
        ptr1 = strstr(preq->rq_ind.rq_manager.rq_objname, "[]");

        if (ptr1)
          {
          /* ptr1 -> "]..." in the source name; newjobname is cut after "[" */
          ptr1++;
          strcpy(newjobname, preq->rq_ind.rq_manager.rq_objname);
          ptr2 = strstr(newjobname, "[]");
          ptr2++;
          *ptr2 = 0;

          /* bounded by the same size the memcpy above uses for rq_objname */
          snprintf(preq_tmp->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1,
            "%s%d%s", newjobname, job_index, ptr1);
          }
        else
          strcpy(preq_tmp->rq_ind.rq_manager.rq_objname, preq->rq_ind.rq_manager.rq_objname);
        }

      /* copy the attribute list */
      if (copy_attribute_list(preq, preq_tmp) != PBSE_NONE)
        {
        /* release the partially built copy instead of leaking it */
        free_br(preq_tmp);

        return(NULL);
        }

      break;

    case PBS_BATCH_SignalJob:

      strcpy(preq_tmp->rq_ind.rq_signal.rq_jid, preq->rq_ind.rq_signal.rq_jid);
      strcpy(preq_tmp->rq_ind.rq_signal.rq_signame, preq->rq_ind.rq_signal.rq_signame);

      /* rq_extra is copied as a C string here; strdup(NULL) is undefined, so guard it */
      if (preq->rq_extra != NULL)
        preq_tmp->rq_extra = strdup((char *)preq->rq_extra);

      break;

    case PBS_BATCH_MessJob:

      strcpy(preq_tmp->rq_ind.rq_message.rq_jid, preq->rq_ind.rq_message.rq_jid);
      preq_tmp->rq_ind.rq_message.rq_file = preq->rq_ind.rq_message.rq_file;
      strcpy(preq_tmp->rq_ind.rq_message.rq_text, preq->rq_ind.rq_message.rq_text);

      break;

    case PBS_BATCH_RunJob:

      if (preq->rq_ind.rq_run.rq_destin)
        preq_tmp->rq_ind.rq_run.rq_destin = strdup(preq->rq_ind.rq_run.rq_destin);

      break;

    default:

      break;
    }

  return(preq_tmp);
  }
void process_request( int sfds) /* file descriptor (socket) to get request */ { #ifdef PBS_MOM char *id = "process_request"; #endif int rc; struct batch_request *request = NULL; #ifndef PBS_MOM char *auth_err = NULL; #endif time_now = time(NULL); request = alloc_br(0); request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. */ #ifndef PBS_MOM if (svr_conn[sfds].cn_active == FromClientDIS) { #ifdef ENABLE_UNIX_SOCKETS if ((svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) && (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED)) { get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname); } #endif /* END ENABLE_UNIX_SOCKETS */ rc = dis_request_read(sfds, request); } else { LOG_EVENT( PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", "request on invalid type of connection"); close_conn(sfds); free_br(request); return; } #else /* PBS_MOM */ rc = dis_request_read(sfds, request); #endif /* PBS_MOM */ if (rc == -1) { /* FAILURE */ /* premature end of file */ close_client(sfds); free_br(request); return; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? 
*/ close_client(sfds); free_br(request); return; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); close_client(sfds); return; } if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0) { char tmpLine[1024]; sprintf(log_buffer, "%s: %lu", pbse_to_txt(PBSE_BADHOST), get_connectaddr(sfds)); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buffer); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", get_connectaddr(sfds)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); return; } if (LOGLEVEL >= 1) { sprintf( log_buffer, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, sfds); LOG_EVENT(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buffer); } /* is the request from a host acceptable to the server */ #ifndef PBS_MOM if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { strcpy(request->rq_host, server_name); } if (server.sv_attr[SRV_ATR_acl_host_enable].at_val.at_long) { /* acl enabled, check it; always allow myself and nodes */ struct pbsnode *isanode; isanode = PGetNodeFromAddr(get_connectaddr(sfds)); if ((isanode == NULL) && (strcmp(server_host, request->rq_host) != 0) && (acl_check( &server.sv_attr[SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0)) { char tmpLine[1024]; snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s", request->rq_host); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); close_client(sfds); return; } } /* * determine source (user client or another server) of request. 
* set the permissions granted to the client */ if (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ request->rq_fromsvr = 0; /* * Client must be authenticated by an Authenticate User Request, if not, * reject request and close connection. -- The following is retained for * compat with old cmds -- The exception to this is of course the Connect * Request which cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no prior * authentication on the Authenticate User request either, but it comes * over a reserved port and appears from another server, hence is * automatically granted authentication. * * The above is only true with inet sockets. With unix domain sockets, the * user creds were read before the first dis_request_read call above. * We automatically granted authentication because we can trust the socket * creds. 
Authorization is still granted in svr_get_privilege below */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); if (svr_conn[sfds].cn_socktype == PBS_SOCK_INET) return; } if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } if (ENABLE_TRUSTED_AUTH == TRUE ) rc = 0; /* bypass the authentication of the user--trust the client completely */ else if (munge_on) { /* If munge_on is true we will validate the connection now */ if ( request->rq_type == PBS_BATCH_AltAuthenUser) { rc = req_altauthenuser(request); if (rc == PBSE_NONE) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } return; } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) /* skip checking user if we did not get an authenticated credential */ rc = PBSE_BADCRED; else { rc = authenticate_user(request, &conn_credent[sfds], &auth_err); } } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) rc = PBSE_BADCRED; else rc = authenticate_user(request, &conn_credent[sfds], &auth_err); if (rc != 0) { req_reject(rc, 0, request, NULL, auth_err); if (auth_err != NULL) free(auth_err); close_client(sfds); return; } /* * pbs_mom and checkpoint restart scripts both need the authority to do * alters and releases on checkpointable jobs. Allow manager permission * for root on the jobs execution node. 
*/ if (((request->rq_type == PBS_BATCH_ModifyJob) || (request->rq_type == PBS_BATCH_ReleaseJob)) && (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0)) { job *pjob; char *dptr; int skip = FALSE; char short_host[PBS_MAXHOSTNAME+1]; /* make short host name */ strcpy(short_host, request->rq_host); if ((dptr = strchr(short_host, '.')) != NULL) { *dptr = '\0'; } if (((pjob = find_job(request->rq_ind.rq_modify.rq_objname)) != (job *)0) && (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)) { if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) && ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) && (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL)) { request->rq_perm = svr_get_privilege(request->rq_user, server_host); skip = TRUE; } } if (!skip) { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } else { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } /* END else (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) */ /* if server shutting down, disallow new jobs and new running */ if (server.sv_attr[SRV_ATR_State].at_val.at_long > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL); return; /*NOTREACHED*/ break; } } #else /* THIS CODE FOR MOM ONLY */ { /*extern tree *okclients; */ extern void mom_server_update_receive_time_by_ip(u_long ipaddr, const char *cmd); /* check connecting host against allowed list of ok clients */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "request type %s from host %s received", reqtype_to_txt(request->rq_type), request->rq_host); log_record( 
PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } /* if (!tfind(svr_conn[sfds].cn_addr, &okclients)) */ if (!AVL_is_in_tree(svr_conn[sfds].cn_addr, 0, okclients)) { sprintf(log_buffer, "request type %s from host %s rejected (host not authorized)", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); req_reject(PBSE_BADHOST, 0, request, NULL, "request not authorized"); close_client(sfds); return; } if (LOGLEVEL >= 3) { sprintf(log_buffer, "request type %s from host %s allowed", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } mom_server_update_receive_time_by_ip(svr_conn[sfds].cn_addr, reqtype_to_txt(request->rq_type)); } /* END BLOCK */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; #endif /* END else !PBS_MOM */ /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ dispatch_request(sfds, request); return; } /* END process_request() */