void * send_power_state_to_mom(void *arg) { struct batch_request *pRequest = (struct batch_request *)arg; struct pbsnode *pNode = find_nodebyname(pRequest->rq_host); if(pNode == NULL) { free_br(pRequest); return NULL; } int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if(handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); return NULL; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); issue_Drequest(handle, pRequest); return NULL; }
int stat_to_mom( char *job_id, struct stat_cntl *cntl) /* M */ { struct batch_request *newrq; int rc = PBSE_NONE; unsigned long addr; char log_buf[LOCAL_LOG_BUF_SIZE+1]; struct pbsnode *node; int handle = -1; unsigned long job_momaddr = -1; unsigned short job_momport = -1; char *job_momname = NULL; job *pjob = NULL; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) return(PBSE_JOBNOTFOUND); mutex_mgr job_mutex(pjob->ji_mutex, true); if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str)) { job_mutex.unlock(); snprintf(log_buf, sizeof(log_buf), "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); return PBSE_BAD_PARAMETER; } job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); job_mutex.unlock(); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL) { free(job_momname); return PBSE_MEM_MALLOC; } if (cntl->sc_type == 1) snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id); else newrq->rq_ind.rq_status.rq_id[0] = '\0'; /* get stat of all */ CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr); /* if MOM is down just return stale information */ addr = job_momaddr; node = tfind_addr(addr,job_momport,job_momname); free(job_momname); if (node == NULL) return PBSE_UNKNODE; if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING)) { if (LOGLEVEL >= 6) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "node '%s' is allocated to job but in state 'down'", node->nd_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf); } unlock_node(node, __func__, "no rely mom", LOGLEVEL); free_br(newrq); return PBSE_NORELYMOM; } /* get connection to MOM */ unlock_node(node, __func__, "before svr_connect", LOGLEVEL); 
handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL); if (handle >= 0) { if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE) { stat_update(newrq, cntl); } } else rc = PBSE_CONNECT; if (rc == PBSE_SYSTEM) rc = PBSE_MEM_MALLOC; free_br(newrq); return(rc); } /* END stat_to_mom() */
void req_gpuctrl( struct batch_request *preq) { char *id = "req_gpuctrl"; char *nodename = NULL; char *gpuid = NULL; int gpumode = -1; int reset_perm = -1; int reset_vol = -1; #ifdef NVIDIA_GPUS struct pbsnode *pnode = NULL; int gpuidx = -1; int rc = 0; int conn; #endif /* NVIDIA_GPUS */ if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0) { req_reject(PBSE_PERM, 0, preq, NULL, NULL); return; } nodename = preq->rq_ind.rq_gpuctrl.rq_momnode; gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid; gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode; reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm; reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol; #ifdef NVIDIA_GPUS if (LOGLEVEL >= 7) { sprintf( log_buffer, "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); log_ext(-1, id, log_buffer, LOG_INFO); } /* validate mom node exists */ pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return; } /* validate that the node is up */ if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN)) { sprintf( log_buffer, "Node %s is not available", pnode->nd_name); req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer); return; } /* validate that the node has real gpus not virtual */ if (!pnode->nd_gpus_real) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus"); return; } /* validate the gpuid exists */ if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node"); return; } /* validate that we have a real request */ if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1)) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified"); return; } /* for mode changes validate the mode with the driver_version */ if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2)) { req_reject(PBSE_UNKREQ, 0, preq, NULL, 
"GPU driver version does not support mode 3"); return; } /* we need to relay request to the mom for processing */ /* have MOM attempt to change the gpu mode */ preq->rq_orgconn = preq->rq_conn; /* restore client socket */ conn = svr_connect( pnode->nd_addrs[0], pbs_mom_port, process_Dreply, ToServerDIS); if (conn >= 0) { if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0) { req_reject(rc, 0, preq, NULL, NULL); } } else { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom"); } #else sprintf( log_buffer, "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); if (LOGLEVEL >= 3) { log_ext(-1, id, log_buffer, LOG_INFO); } req_reject(PBSE_NOSUP, 0, preq, NULL, NULL); #endif /* NVIDIA_GPUS */ return; }
int req_gpuctrl_svr( struct batch_request *preq) { int rc = PBSE_NONE; char *nodename = NULL; char *gpuid = NULL; int gpumode = -1; int reset_perm = -1; int reset_vol = -1; char log_buf[LOCAL_LOG_BUF_SIZE+1]; int local_errno = 0; struct pbsnode *pnode = NULL; int gpuidx = -1; int conn; if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0) { rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); return rc; } nodename = preq->rq_ind.rq_gpuctrl.rq_momnode; gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid; gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode; reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm; reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol; if (LOGLEVEL >= 7) { sprintf( log_buf, "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); log_ext(-1, __func__, log_buf, LOG_INFO); } /* validate mom node exists */ pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return PBSE_UNKNODE; } /* validate that the node is up */ if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING)) { rc = PBSE_UNKREQ; sprintf(log_buf,"Node %s is not available",pnode->nd_name); req_reject(rc, 0, preq, NULL, log_buf); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that the node has real gpus not virtual */ if (!pnode->nd_gpus_real) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate the gpuid exists */ if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that we 
have a real request */ if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "No action specified"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* for mode changes validate the mode with the driver_version */ if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* we need to relay request to the mom for processing */ /* have MOM attempt to change the gpu mode */ preq->rq_orgconn = preq->rq_conn; /* restore client socket */ unlock_node(pnode, __func__, NULL, LOGLEVEL); conn = svr_connect( pnode->nd_addrs[0], pbs_mom_port, &local_errno, NULL, NULL); if (conn >= 0) { if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE) req_reject(rc, 0, preq, NULL, NULL); else process_gpu_request_reply(preq); } else { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom"); } return rc; }
int stat_to_mom( char *job_id, struct stat_cntl *cntl) /* M */ { struct batch_request *newrq; int rc = PBSE_NONE; unsigned long addr; char log_buf[LOCAL_LOG_BUF_SIZE+1]; struct pbsnode *node; int handle = -1; unsigned long job_momaddr = -1; unsigned short job_momport = -1; char *job_momname = NULL; job *pjob = NULL; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) return PBSE_JOBNOTFOUND; job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL) { free(job_momname); return PBSE_MEM_MALLOC; } if (cntl->sc_type == 1) strcpy(newrq->rq_ind.rq_status.rq_id, job_id); else newrq->rq_ind.rq_status.rq_id[0] = '\0'; /* get stat of all */ CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr); /* if MOM is down just return stale information */ addr = job_momaddr; node = tfind_addr(addr,job_momport,job_momname); free(job_momname); if (node == NULL) return PBSE_UNKNODE; if (node->nd_state & INUSE_DOWN) { if (LOGLEVEL >= 6) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "node '%s' is allocated to job but in state 'down'", node->nd_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf); } unlock_node(node, __func__, "no rely mom", LOGLEVEL); free_br(newrq); return PBSE_NORELYMOM; } /* get connection to MOM */ unlock_node(node, __func__, "before svr_connect", LOGLEVEL); handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL, ToServerDIS); /* Unlock job here */ if (handle >= 0) { if ((rc = issue_Drequest(handle, newrq)) == PBSE_NONE) { stat_update(newrq, cntl); } } else rc = PBSE_CONNECT; if (rc == PBSE_SYSTEM) rc = PBSE_MEM_MALLOC; free_br(newrq); return rc; } /* END stat_to_mom() */
int relay_to_mom( job **pjob_ptr, struct batch_request *request, /* the request to send */ void (*func)(struct work_task *)) { int handle; /* a client style connection handle */ int rc; int local_errno = 0; pbs_net_t addr; unsigned short port; job *pjob = *pjob_ptr; char jobid[PBS_MAXSVRJOBID + 1]; char *job_momname = NULL; struct pbsnode *node; char log_buf[LOCAL_LOG_BUF_SIZE]; /* if MOM is down don't try to connect */ addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; port = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((node = tfind_addr(addr, port, job_momname)) == NULL) { free(job_momname); return(PBSE_NORELYMOM); } free(job_momname); if ((node != NULL) && (node->nd_state & INUSE_DOWN)) { unlock_node(node, __func__, "no rely mom", LOGLEVEL); return(PBSE_NORELYMOM); } if (LOGLEVEL >= 7) { char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr); sprintf(log_buf, "momaddr=%s",tmp); log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); free(tmp); } unlock_node(node, __func__, "after svr_connect", LOGLEVEL); handle = svr_connect( pjob->ji_qs.ji_un.ji_exect.ji_momaddr, pjob->ji_qs.ji_un.ji_exect.ji_momport, &local_errno, NULL, NULL, ToServerDIS); if (handle < 0) { log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom); return(PBSE_NORELYMOM); } strcpy(jobid, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); request->rq_orgconn = request->rq_conn; /* save client socket */ rc = issue_Drequest(handle, request); *pjob_ptr = svr_find_job(jobid, TRUE); return(rc); } /* END relay_to_mom() */
int issue_to_svr( char *servern, /* I */ struct batch_request *preq, /* I */ void (*replyfunc) (struct work_task *)) /* I */ { int rc = PBSE_NONE; int do_retry = 0; int handle; int my_err = 0; pbs_net_t svraddr; char *svrname; unsigned int port = pbs_server_port_dis; struct work_task *pwt; time_t time_now = time(NULL); snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern); preq->rq_fromsvr = 1; preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; svrname = parse_servername(servern, &port); svraddr = get_hostaddr(&my_err,svrname); free(svrname); if (svraddr == (pbs_net_t)0) { if (my_err == PBS_NET_RC_RETRY) { /* Non fatal error - retry */ do_retry = 1; } } else { handle = svr_connect(svraddr, port, &my_err, NULL, NULL, ToServerDIS); if (handle >= 0) { if (((rc = issue_Drequest(handle, preq)) == PBSE_NONE) && (handle != PBS_LOCAL_CONNECTION)) { /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a reply * has always been sent */ rc = preq->rq_reply.brp_code; } return(rc); } else if (handle == PBS_NET_RC_RETRY) { do_retry = 1; } } /* if reached here, it didn`t go, do we retry? */ if (do_retry) { if (preq->rq_id == NULL) get_batch_request_id(preq); pwt = set_task(WORK_Timed, (long)(time_now + PBS_NET_RETRY_TIME), reissue_to_svr, preq->rq_id, TRUE); pwt->wt_parmfunc = replyfunc; pthread_mutex_unlock(pwt->wt_mutex); return(PBSE_NONE); } /* FAILURE */ return(PBSE_INTERNAL); } /* END issue_to_svr() */
int relay_to_mom( job **pjob_ptr, struct batch_request *request, /* the request to send */ void (*func)(struct work_task *)) { int handle; /* a client style connection handle */ int rc; int local_errno = 0; pbs_net_t addr; unsigned short port; job *pjob = *pjob_ptr; char jobid[PBS_MAXSVRJOBID + 1]; char *job_momname = NULL; struct pbsnode *node; char log_buf[LOCAL_LOG_BUF_SIZE]; std::string node_name; if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL) { snprintf(log_buf, sizeof(log_buf), "attempting to send a request to %s's mom but no exec_host list?", pjob->ji_qs.ji_jobid); log_err(PBSE_BADSTATE, __func__, log_buf); return(PBSE_BADSTATE); } /* if MOM is down don't try to connect */ addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; port = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((node = tfind_addr(addr, port, job_momname)) == NULL) { free(job_momname); return(PBSE_NORELYMOM); } free(job_momname); if ((node != NULL) && ((node->nd_state & INUSE_NOT_READY)|| (node->nd_power_state != POWER_STATE_RUNNING))) { node->unlock_node(__func__, "no relay mom", LOGLEVEL); return(PBSE_NORELYMOM); } if (LOGLEVEL >= 7) { char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr); sprintf(log_buf, "momaddr=%s",tmp); log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); free(tmp); } node_name = node->get_name(); node->unlock_node(__func__, "after svr_connect", LOGLEVEL); strcpy(jobid, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); *pjob_ptr = NULL; handle = svr_connect(addr, port, &local_errno, NULL, NULL); if (handle < 0) { update_failure_counts(node_name.c_str(), -1); log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom); return(PBSE_NORELYMOM); } request->rq_orgconn = request->rq_conn; /* save client socket */ rc = issue_Drequest(handle, request, true); if (request->rq_reply.brp_code == 
PBSE_TIMEOUT) update_failure_counts(node_name.c_str(), PBSE_TIMEOUT); else update_failure_counts(node_name.c_str(), 0); *pjob_ptr = svr_find_job(jobid, TRUE); return(rc); } /* END relay_to_mom() */
int issue_to_svr( const char *servern, /* I */ struct batch_request **preq_ptr, /* I */ void (*replyfunc) (struct work_task *)) /* I */ { int rc = PBSE_NONE; bool do_retry = false; int handle; int my_err = 0; pbs_net_t svraddr; char *svrname; unsigned int port = pbs_server_port_dis; batch_request *preq = *preq_ptr; snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern); preq->rq_fromsvr = 1; preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; svrname = parse_servername(servern, &port); svraddr = get_hostaddr(&my_err,svrname); free(svrname); if (svraddr == (pbs_net_t)0) { if (my_err == PBS_NET_RC_RETRY) { /* Non fatal error - retry */ do_retry = true; } } else { handle = svr_connect(svraddr, port, &my_err, NULL, NULL); if (handle >= 0) { if (((rc = issue_Drequest(handle, preq, true)) == PBSE_NONE) && (handle != PBS_LOCAL_CONNECTION)) { /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a reply * has always been sent */ rc = preq->rq_reply.brp_code; } else if (handle == PBS_LOCAL_CONNECTION) *preq_ptr = NULL; return(rc); } else if (handle == PBS_NET_RC_RETRY) do_retry = true; } /* if reached here, it didn`t go, do we retry? */ if (do_retry) { queue_a_retry_task(preq, replyfunc); return(PBSE_NONE); } /* FAILURE */ return(PBSE_INTERNAL); } /* END issue_to_svr() */
int set_node_power_state( struct pbsnode **ppNode, unsigned short newState) { struct pbsnode *pNode = *ppNode; if (pNode->nd_addrs == NULL) { return PBSE_BAD_PARAMETER; } if (newState == POWER_STATE_RUNNING) { static std::string interface; static unsigned char mac_addr[6]; if (interface.length() == 0) { if (!getMacAddr(interface,mac_addr)) { return PBSE_SYSTEM; } } int sock; if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0) { return PBSE_SYSTEM; } unsigned char outpack[1000]; memcpy(outpack+6,mac_addr,6); memcpy(outpack,pNode->nd_mac_addr,6); outpack[12] = 0x08; outpack[13] = 0x42; int offset = 14; memset(outpack + offset,0xff,6); offset += 6; for (int i = 0;i < 16;i++) { memcpy(outpack + offset,pNode->nd_mac_addr,6); offset += 6; } int one = 1; if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0) { close(sock); return PBSE_SYSTEM; } struct sockaddr whereto; whereto.sa_family = 0; snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str()); if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0) { close(sock); return PBSE_SYSTEM; } close(sock); return PBSE_NONE; } if (pNode->nd_job_usages.size() != 0) { //Can't change the power state on a node with running jobs. 
return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING; } struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState); if (request == NULL) { return PBSE_SYSTEM; } request->rq_ind.rq_powerstate = newState; pNode->nd_power_state_change_time = time(NULL); snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name); std::string hostname(request->rq_host); int rc = PBSE_NONE; { int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if(handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); *ppNode = NULL; return local_errno; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); *ppNode = NULL; rc = issue_Drequest(handle, request,true); if(rc == PBSE_NONE) { rc = request->rq_reply.brp_code; if(rc < 0) rc = -rc; } } pNode = find_nodebyname(hostname.c_str()); *ppNode = pNode; if ((rc == PBSE_NONE)&&(pNode != NULL)) { pNode->nd_power_state = newState; } return(rc); }
/*
 * stat_to_mom()
 *
 * Builds a PBS_BATCH_StatusJob request for the MOM recorded on the job and
 * issues it with stat_update as the reply handler. The address of cntl is
 * stored in the request's rq_extra for later use, and the connection
 * handle is stored in cntl->sc_conn.
 *
 * @param pjob - (I) the job whose MOM should be queried
 * @param cntl - (I/O) status control block; when cntl->sc_type == 1 only
 *               this job is statted, otherwise the request id is left
 *               empty ("stat of all")
 * @return 0 when the request was issued; PBSE_SYSTEM when the request
 *         could not be allocated; PBSE_NORELYMOM when the MOM's node is
 *         deleted or down; otherwise the negative svr_connect() result or
 *         the issue_Drequest() error
 */
int stat_to_mom(
  job              *pjob,  /* I */
  struct stat_cntl *cntl)  /* I/O */
{
  struct batch_request *newrq;
  int                   rc;
  struct work_task     *pwt = 0;
  struct pbsnode       *node;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
  {
    return(PBSE_SYSTEM);
  }

  /* set up status request, save address of cntl in request for later */
  newrq->rq_extra = (void *)cntl;

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", pjob->ji_qs.ji_jobid);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  if (((node = tfind_addr(pjob->ji_qs.ji_un.ji_exect.ji_momaddr)) != NULL) &&
      (node->nd_state & (INUSE_DELETED | INUSE_DOWN)))
  {
    if (LOGLEVEL >= 6)
    {
      sprintf(log_buffer, "node '%s' is allocated to job but in state '%s'",
        node->nd_name,
        (node->nd_state & INUSE_DELETED) ? "deleted" : "down");

      log_event(
        PBSEVENT_SYSTEM,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
    }

    /* the request was never issued; release it like the failure path
     * at the end of this function does */
    free_br(newrq);

    return(PBSE_NORELYMOM);
  }

  /* get connection to MOM */
  cntl->sc_conn = svr_connect(
                    pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                    pbs_mom_port,
                    process_Dreply,
                    ToServerDIS);

  if ((rc = cntl->sc_conn) >= 0)
    rc = issue_Drequest(cntl->sc_conn, newrq, stat_update, &pwt);

  if (rc != 0)
  {
    /* request failed: undo everything that was set up */
    if (pwt)
      delete_task(pwt);

    free_br(newrq);

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);
  }

  return(rc);
} /* END stat_to_mom() */
void *check_if_orphaned( void *vp) { char *node_name = (char *)vp; char *rsv_id = NULL; std::string job_id; batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; char log_buf[LOCAL_LOG_BUF_SIZE]; if ((rsv_id = strchr(node_name, ':')) != NULL) { *rsv_id = '\0'; rsv_id++; } else { free(node_name); return(NULL); } if (alps_reservations.is_orphaned(rsv_id, job_id) == true) { // Make sure the node with the orphan is not available for jobs if ((pnode = find_nodebyname(node_name)) != NULL) { if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0) { snprintf(log_buf, sizeof(log_buf), "Node %s has an orphan but wasn't marked as busy. Marking as busy now.", node_name); log_err(-1, __func__, log_buf); update_node_state(pnode, INUSE_BUSY); } pnode->unlock_node(__func__, NULL, LOGLEVEL); } if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL) { free(node_name); alps_reservations.remove_from_orphaned_list(rsv_id); return(NULL); } preq->rq_extend = strdup(rsv_id); if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); snprintf(log_buf, sizeof(log_buf), "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it", rsv_id, job_id.c_str(), pnode->get_name()); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL); retries++; } /* unlock before the network transaction */ pnode->unlock_node(__func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq, true); free_br(preq); } alps_reservations.remove_from_orphaned_list(rsv_id); } free(node_name); return(NULL); } /* END check_if_orphaned() */
int issue_to_svr( char *servern, /* I */ struct batch_request *preq, /* I */ void (*replyfunc) (struct work_task *)) /* I */ { int do_retry = 0; int handle; pbs_net_t svraddr; char *svrname; unsigned int port = pbs_server_port_dis; struct work_task *pwt; strcpy(preq->rq_host, servern); preq->rq_fromsvr = 1; preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; svrname = parse_servername(servern, &port); svraddr = get_hostaddr(svrname); if (svraddr == (pbs_net_t)0) { if (pbs_errno == PBS_NET_RC_RETRY) { /* Non fatal error - retry */ do_retry = 1; } } else { handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS); if (handle >= 0) { return(issue_Drequest(handle, preq, replyfunc, NULL)); } else if (handle == PBS_NET_RC_RETRY) { do_retry = 1; } } /* if reached here, it didn`t go, do we retry? */ if (do_retry) { pwt = set_task( WORK_Timed, (long)(time_now + PBS_NET_RETRY_TIME), reissue_to_svr, (void *)preq); pwt->wt_parmfunc = replyfunc; return(0); } /* FAILURE */ return(-1); } /* END issue_to_svr() */
/*
 * relay_to_mom()
 *
 * Relays a batch request to the MOM recorded on a job. A MOM whose node
 * is known and flagged deleted or down is not contacted.
 *
 * @param pjob    - the job whose MOM receives the request
 * @param request - the request to send; its rq_orgconn is set to rq_conn
 * @param func    - reply handler passed on to issue_Drequest()
 * @return the issue_Drequest() result, or PBSE_NORELYMOM when the node is
 *         deleted/down or the connection could not be made
 */
int relay_to_mom(
  job                  *pjob,
  struct batch_request *request,  /* the request to send */
  void                (*func)(struct work_task *))
{
  char           *caller = "relay_to_mom";
  int             handle;  /* a client style connection handle */
  pbs_net_t       mom_addr;
  struct pbsnode *mom_node;

  mom_addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  mom_node = tfind_addr(mom_addr, pjob->ji_qs.ji_un.ji_exect.ji_momport, pjob);

  /* an unknown node does not stop the relay; a deleted or down one does */
  if (mom_node != NULL)
  {
    if (mom_node->nd_state & (INUSE_DELETED | INUSE_DOWN))
      return(PBSE_NORELYMOM);
  }

  if (LOGLEVEL >= 7)
  {
    sprintf(log_buffer, "momaddr=%s",
      netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr));

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, caller, log_buffer);
  }

  handle = svr_connect(
             pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
             pjob->ji_qs.ji_un.ji_exect.ji_momport,
             process_Dreply,
             ToServerDIS);

  if (handle < 0)
  {
    LOG_EVENT(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, "", msg_norelytomom);

    return(PBSE_NORELYMOM);
  }

  /* remember the client's socket before handing the request on */
  request->rq_orgconn = request->rq_conn;

  return(issue_Drequest(handle, request, func, NULL));
} /* END relay_to_mom() */