void *send_power_state_to_mom( void *arg) { struct batch_request *pRequest = (struct batch_request *)arg; struct pbsnode *pNode = find_nodebyname(pRequest->rq_host); if (pNode == NULL) { free_br(pRequest); return NULL; } int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if (handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); return NULL; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); issue_Drequest(handle, pRequest, true); return NULL; }
struct pbsnode *get_node_from_str( const char *str, /* I */ const char *orig_id, /* I */ struct pbsnode *np) /* M */ { /* this is a node reporting on another node as well */ const char *node_id = str + strlen("node="); struct pbsnode *next = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; /* don't do anything if the name is the same as this node's name */ if (strcmp(node_id, np->nd_name)) { unlock_node(np, __func__, "np not numa update", LOGLEVEL); next = find_nodebyname(node_id); if (next == NULL) { /* NYI: should we add logic here to attempt the canonical name if this * is the short name, and attempt the short name if this is the * canonical name? */ /* ERROR */ snprintf(log_buf,sizeof(log_buf), "Node %s is reporting on node %s, which pbs_server doesn't know about\n", orig_id, node_id); log_err(-1, __func__, log_buf); } else { if (LOGLEVEL >= 7) { snprintf(log_buf,sizeof(log_buf), "Node %s is reporting for node %s\n", orig_id, node_id); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf); } next->nd_lastupdate = time(NULL); } } else { next = np; next->nd_lastupdate = time(NULL); } /* next may be NULL */ return(next); } /* END get_node_from_str() */
END_TEST
#endif

/*
 * test_update_failure_counts
 *
 * Drives update_failure_counts() for the node "lihue" and checks the node's
 * nd_proximal_failures, nd_consecutive_successes and nd_state after each call.
 * As exercised here, a non-zero second argument (-1 or 1) is treated as a
 * failure and 0 as a success.
 *
 * NOTE(review): pnode is dereferenced without a NULL check, so the test
 * assumes the fixture already contains a node named "lihue".
 */
START_TEST(test_update_failure_counts)
  {
  const char     *name = "lihue";
  struct pbsnode *pnode = find_nodebyname(name);

  update_failure_counts(name, -1);
  update_failure_counts(name, -1);

  // Make sure the two failures are correctly counted
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One success shouldn't reset the failure counts
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 1);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // Two consecutive successes should reset the failure count
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One failure should reset the success count
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 1);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // State shouldn't change until there are 3 proximal failures
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // Third failure in a row: the node must leave the free state
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);

  // State shouldn't reset until there are 2 consecutive successes
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);
  fail_unless(pnode->nd_consecutive_successes == 1);

  // Second success in a row: state and failure count are both restored
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state == INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  }
/*
 * is_reporter_node()
 *
 * Looks up a node by name and reports its nd_is_alps_reporter flag.
 *
 * @param node_id - (I) name of the node to look up
 * @return the node's nd_is_alps_reporter value, or FALSE when
 *         find_nodebyname() does not find the node
 */
int is_reporter_node(

  const char *node_id)

{
  struct pbsnode *pnode = find_nodebyname(node_id);

  if (pnode == NULL)
    return(FALSE);

  /* copy the flag out before the node is released */
  int is_reporter = pnode->nd_is_alps_reporter;

  pnode->unlock_node(__func__, NULL, LOGLEVEL);

  return(is_reporter);
} /* END is_reporter_node() */
int get_mom_node_version( const char *job_id, int &version) { job *pjob; pbsnode *pnode; pjob = svr_find_job(job_id, TRUE); if (pjob == NULL) return(PBSE_UNKJOBID); mutex_mgr job_mutex(pjob->ji_mutex, true); pnode = find_nodebyname(pjob->ji_qs.ji_destin); if (pnode == NULL) return(PBSE_UNKNODE); mutex_mgr node_mutex(&pnode->nd_mutex, true); version = pnode->get_version(); return(PBSE_NONE); }
/*
 * process_alps_status()
 *
 * Walks the NUL-separated status strings an ALPS reporter node sent and
 * applies them to the nodes it reports on.
 *
 * Each "node=" string switches the node being updated (via
 * determine_node_from_str()); the strings collected for the previous node are
 * saved, together with a "node_index=<n>" entry, before switching.
 *
 * @param nd_name     - (I) name of the reporter (parent) node
 * @param status_info - (I) status strings, stored back to back in ->str and
 *                      terminated by an empty string
 * @return PBSE_NONE on success or when the parent node is unknown; otherwise
 *         the error returned by decode_arst()
 */
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

{
  char           *str;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
  {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
  }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
  {
    if (!strncmp(str, "node=", strlen("node=")))
    {
      /* save what was collected for the previous node; there is nothing to
       * save (and no node to save it to) before the first node is selected */
      if ((str != status_info->str) &&
          (current != NULL))
      {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);
        save_node_status(current, &temp);
      }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
    }

    /* status strings that arrive before any "node=" entry have no node to
     * apply to; skip them instead of handing NULL to the helpers below */
    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
    {
      process_gpu_status(current, &str);

      continue;
    }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
    {
      process_reservation_id(current, str);
    }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
    {
      free_arst(&temp);

      /* every other exit releases the nodes this function holds; do the
       * same here so a decode failure does not leave them locked */
      unlock_node(current, __func__, NULL, 0);
      unlock_node(parent, __func__, NULL, 0);

      return(rc);
    }

    /* perform any special processing */
    if (!strncmp(str, cproc_eq, cproc_eq_len))
    {
      set_ncpus(current, str);
    }
    else if (!strncmp(str, state, strlen(state)))
    {
      set_state(current, str);
    }
  } /* END processing the status update */

  /* flush the strings collected for the last node */
  if (current != NULL)
  {
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, 0);
  }

  unlock_node(parent, __func__, NULL, 0);

  return(PBSE_NONE);
} /* END process_alps_status() */
void *check_if_orphaned( void *vp) { char *node_name = (char *)vp; char *rsv_id = NULL; std::string job_id; batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; char log_buf[LOCAL_LOG_BUF_SIZE]; if ((rsv_id = strchr(node_name, ':')) != NULL) { *rsv_id = '\0'; rsv_id++; } else { free(node_name); return(NULL); } if (alps_reservations.is_orphaned(rsv_id, job_id) == true) { // Make sure the node with the orphan is not available for jobs if ((pnode = find_nodebyname(node_name)) != NULL) { if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0) { snprintf(log_buf, sizeof(log_buf), "Node %s has an orphan but wasn't marked as busy. Marking as busy now.", node_name); log_err(-1, __func__, log_buf); update_node_state(pnode, INUSE_BUSY); } pnode->unlock_node(__func__, NULL, LOGLEVEL); } if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL) { free(node_name); alps_reservations.remove_from_orphaned_list(rsv_id); return(NULL); } preq->rq_extend = strdup(rsv_id); if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); snprintf(log_buf, sizeof(log_buf), "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it", rsv_id, job_id.c_str(), pnode->get_name()); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL); retries++; } /* unlock before the network transaction */ pnode->unlock_node(__func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq, true); free_br(preq); } alps_reservations.remove_from_orphaned_list(rsv_id); } free(node_name); return(NULL); } /* END check_if_orphaned() */
int req_stat_node( struct batch_request *preq) { char *name; int rc = PBSE_NONE; int type = 0; int bad = 0; struct pbsnode *pnode = NULL; struct batch_reply *preply; struct prop props; svrattrl *pal; /* * first, check that the server indeed has a list of nodes * and if it does, validate the name of the requested object-- * either name is that of a specific node, or name[0] is null/@ * meaning request is for all nodes in the server's jurisdiction */ if (LOGLEVEL >= 6) { log_record( PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, "entered"); } if (svr_totnodes <= 0) { rc = PBSE_NONODES; req_reject(rc, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file"); return rc; } name = preq->rq_ind.rq_status.rq_id; if ((*name == '\0') || (*name == '@')) { type = 1; } else if ((*name == ':') && (*(name + 1) != '\0')) { if (!strcmp(name + 1, "ALL")) { type = 1; /* psuedo-group for all nodes */ } else { type = 2; props.name = name + 1; props.mark = 1; props.next = NULL; } } preply = &preq->rq_reply; preply->brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preply->brp_un.brp_status); if (type == 0) { /* get status of the named node */ pnode = find_nodebyname(name); if (pnode == NULL) { rc = PBSE_UNKNODE; req_reject(rc, 0, preq, NULL, "cannot locate specified node"); return(rc); } /* get the status on all of the numa nodes */ if (pnode->nd_is_alps_reporter == TRUE) rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status); else rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status); unlock_node(pnode, __func__, "type == 0", LOGLEVEL); } else { /* get status of all or several nodes */ all_nodes_iterator *iter = NULL; while ((pnode = next_host(&allnodes,&iter,NULL)) != NULL) { if ((type == 2) && (!hasprop(pnode, &props))) { unlock_node(pnode, __func__, "type != 0, next_host", LOGLEVEL); continue; } /* get the status on all of the numa nodes */ if (pnode->nd_is_alps_reporter == TRUE) rc = get_alps_statuses(pnode, preq, &bad, 
&preply->brp_un.brp_status); else rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status); if (rc != PBSE_NONE) { unlock_node(pnode, __func__, "type != 0, rc != 0, get_numa_statuses", LOGLEVEL); break; } unlock_node(pnode, __func__, "type != 0, rc == 0, get_numa_statuses", LOGLEVEL); } if (iter != NULL) delete iter; } if (rc == PBSE_NONE) { /* SUCCESS */ reply_send_svr(preq); } else { if (rc != PBSE_UNKNODEATR) { req_reject(rc, 0, preq, NULL, NULL); } else { pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); reply_badattr(rc, bad, pal, preq); } } return(rc); } /* END req_stat_node() */
/*
 * req_gpuctrl()
 *
 * Handles a GPU control request (mode change and/or reset of a GPU on a node).
 *
 * The requester must hold one of the manager/operator read/write permission
 * bits, otherwise the request is rejected with PBSE_PERM.
 *
 * When built with NVIDIA_GPUS the request is validated (node exists and is
 * available, node has real gpus, gpu id exists, some action is requested, the
 * mode is supported by the driver version) and then relayed to the node's MOM.
 * Without NVIDIA_GPUS the request is logged and rejected with PBSE_NOSUP.
 *
 * @param preq - (M) the decoded request; it is rejected here or handed to
 *               issue_Drequest()
 */
void req_gpuctrl(

  struct batch_request *preq)

  {
  char  *id = "req_gpuctrl";

  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;     /* -1 means "not requested" (see the no-action check below) */
  int    reset_perm = -1;  /* -1 means "not requested" */
  int    reset_vol = -1;   /* -1 means "not requested" */
#ifdef NVIDIA_GPUS

  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    rc = 0;
  int    conn;
#endif  /* NVIDIA_GPUS */

  /* only managers and operators may control gpus */
  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);
    return;
    }

  /* pull the request parameters out of the decoded request */
  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

#ifdef NVIDIA_GPUS

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buffer,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* validate mom node exists */
  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return;
    }

  /* validate that the node is up */
  if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))
    {
    sprintf(
      log_buffer,
      "Node %s is not available",
      pnode->nd_name);

    req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer);
    return;
    }

  /* validate that the node has real gpus not virtual */
  if (!pnode->nd_gpus_real)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus");
    return;
    }

  /* validate the gpuid exists */
  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node");
    return;
    }

  /* validate that we have a real request */
  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified");
    return;
    }

  /* for mode changes validate the mode with the driver_version */
  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3");
    return;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */
  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           process_Dreply,
           ToServerDIS);

  if (conn >= 0)
    {
    /* process_gpu_request_reply is passed as the reply handler */
    if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

#else

  /* built without NVIDIA gpu support: log (at LOGLEVEL >= 3) and refuse */
  sprintf(
    log_buffer,
    "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d",
    nodename,
    gpuid,
    gpumode,
    reset_perm,
    reset_vol);

  if (LOGLEVEL >= 3)
    {
    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  req_reject(PBSE_NOSUP, 0, preq, NULL, NULL);

#endif  /* NVIDIA_GPUS */

  return;
  }
int process_status_info( const char *nd_name, std::vector<std::string> &status_info) { const char *name = nd_name; pbsnode *current; bool mom_job_sync = true; bool auto_np = false; bool down_on_error = false; bool note_append_on_error = false; int dont_change_state = FALSE; int rc = PBSE_NONE; bool send_hello = false; std::string temp; #ifdef PENABLE_LINUX_CGROUPS bool force_layout_update = false; #endif get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync); get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np); get_svr_attr_b(SRV_ATR_NoteAppendOnError, ¬e_append_on_error); get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error); /* if original node cannot be found do not process the update */ if ((current = find_nodebyname(nd_name)) == NULL) return(PBSE_NONE); //A node we put to sleep is up and running. if (current->nd_power_state != POWER_STATE_RUNNING) { //Make sure we wait for a stray update that came after we changed the state to pass //by. if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL)) { current->nd_power_state = POWER_STATE_RUNNING; write_node_power_state(); } } /* loop over each string */ for (unsigned int i = 0; i != status_info.size(); i++) { const char *str = status_info[i].c_str(); /* these two options are for switching nodes */ if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_numa_from_str(str, current)) == NULL) break; else continue; } else if (!strncmp(str, "node=", strlen("node="))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_node_from_str(str, name, current)) == NULL) break; else { if (current->nd_mom_reported_down == TRUE) { /* There is a race condition if using a mom hierarchy and manually * shutting down a non-level 
1 mom: if its message that the mom is * shutting down gets there before its last status update, the node * can incorrectly be set as free again. For that reason, only set * a mom back up if its reporting for itself. */ if (strcmp(name, str + strlen("node=")) != 0) dont_change_state = TRUE; else current->nd_mom_reported_down = FALSE; } continue; } } /* add the info to the "temp" pbs_attribute */ else if (!strcmp(str, START_GPU_STATUS)) { is_gpustat_get(current, i, status_info); continue; } else if (!strcmp(str, START_MIC_STATUS)) { process_mic_status(current, i, status_info); continue; } #ifdef PENABLE_LINUX_CGROUPS else if (!strcmp(str, "force_layout_update")) { force_layout_update = true; continue; } else if (!strncmp(str, "layout", 6)) { // Add 7 to skip "layout=" update_layout_if_needed(current, str + 7, force_layout_update); // reset this to false in case we have a mom hierarchy in place force_layout_update = false; continue; } #endif else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN)) { current->capture_plugin_resources(str + PLUGIN_EQ_LEN); continue; } else if (!strncmp(str, "jobs=", 5)) { /* walk job list reported by mom */ sync_job_info *sji = new sync_job_info(); sji->node_name = current->get_name(); sji->job_info = str + 5; sji->sync_jobs = mom_job_sync; // sji is freed in sync_node_jobs() enqueue_threadpool_request(sync_node_jobs, sji, task_pool); continue; } else if (!strcmp(str, "first_update=true")) { /* mom is requesting that we send the mom hierarchy file to her */ //remove_hello(&hellos, current->nd_id); send_hello = true; /* reset gpu data in case mom reconnects with changed gpus */ clear_nvidia_gpus(current); continue; } else { // Save this string to our status line. 
if (temp.size() > 0) temp += ","; if (!strncmp(str, "message=", 8)) { std::string no_newlines(str); size_t pos = no_newlines.find('\n'); while (pos != std::string::npos) { no_newlines.replace(pos, 1, 1, ' '); pos = no_newlines.find('\n'); } temp += no_newlines; } else temp += str; if (!strncmp(str, "state", 5)) { if (dont_change_state == FALSE) process_state_str(current, str); } else if ((allow_any_mom == TRUE) && (!strncmp(str, "uname", 5))) { process_uname_str(current, str); } else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */ { if ((!strncmp(str, "message=ERROR", 13)) && (down_on_error == TRUE)) { update_node_state(current, INUSE_DOWN); dont_change_state = TRUE; if (note_append_on_error == true) { set_note_error(current, str); } } } else if (!strncmp(str,"macaddr=",8)) { update_node_mac_addr(current,str + 8); } else if ((mom_job_sync == true) && (!strncmp(str, "jobdata=", 8))) { /* update job attributes based on what the MOM gives us */ update_job_data(current, str + strlen("jobdata=")); } else if ((auto_np) && (!(strncmp(str, "ncpus=", 6)))) { handle_auto_np(current, str); } else if (!strncmp(str, "version=", 8)) { current->set_version(str + 8); } } } /* END processing strings */ if (current != NULL) { save_node_status(current, temp); current->unlock_node(__func__, NULL, LOGLEVEL); } if ((rc == PBSE_NONE) && (send_hello == true)) rc = SEND_HELLO; return(rc); } /* END process_status_info() */
int set_node_power_state( struct pbsnode **ppNode, unsigned short newState) { struct pbsnode *pNode = *ppNode; if (pNode->nd_addrs == NULL) { return PBSE_BAD_PARAMETER; } if (newState == POWER_STATE_RUNNING) { static std::string interface; static unsigned char mac_addr[6]; if (interface.length() == 0) { if (!getMacAddr(interface,mac_addr)) { return PBSE_SYSTEM; } } int sock; if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0) { return PBSE_SYSTEM; } unsigned char outpack[1000]; memcpy(outpack+6,mac_addr,6); memcpy(outpack,pNode->nd_mac_addr,6); outpack[12] = 0x08; outpack[13] = 0x42; int offset = 14; memset(outpack + offset,0xff,6); offset += 6; for (int i = 0;i < 16;i++) { memcpy(outpack + offset,pNode->nd_mac_addr,6); offset += 6; } int one = 1; if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0) { close(sock); return PBSE_SYSTEM; } struct sockaddr whereto; whereto.sa_family = 0; snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str()); if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0) { close(sock); return PBSE_SYSTEM; } close(sock); return PBSE_NONE; } if (pNode->nd_job_usages.size() != 0) { //Can't change the power state on a node with running jobs. 
return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING; } struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState); if (request == NULL) { return PBSE_SYSTEM; } request->rq_ind.rq_powerstate = newState; pNode->nd_power_state_change_time = time(NULL); snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name); std::string hostname(request->rq_host); int rc = PBSE_NONE; { int handle = 0; int local_errno = 0; handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL); if(handle < 0) { unlock_node(pNode, __func__, "Error connecting", LOGLEVEL); *ppNode = NULL; return local_errno; } unlock_node(pNode, __func__, "Done connecting", LOGLEVEL); *ppNode = NULL; rc = issue_Drequest(handle, request,true); if(rc == PBSE_NONE) { rc = request->rq_reply.brp_code; if(rc < 0) rc = -rc; } } pNode = find_nodebyname(hostname.c_str()); *ppNode = pNode; if ((rc == PBSE_NONE)&&(pNode != NULL)) { pNode->nd_power_state = newState; } return(rc); }
int process_alps_status( char *nd_name, boost::ptr_vector<std::string>& status_info) { char *current_node_id = NULL; char node_index_buf[MAXLINE]; int node_index = 0; struct pbsnode *parent; struct pbsnode *current = NULL; int rc; pbs_attribute temp; hash_table_t *rsv_ht; char log_buf[LOCAL_LOG_BUF_SIZE]; memset(&temp, 0, sizeof(temp)); if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE) { log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute"); return(rc); } /* if we can't find the parent node, ignore the update */ if ((parent = find_nodebyname(nd_name)) == NULL) return(PBSE_NONE); /* keep track of reservations so that they're only processed once per update */ rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE); /* loop over each string */ for(boost::ptr_vector<std::string>::iterator i = status_info.begin();i != status_info.end();i++) { const char *str = i->c_str(); if (!strncmp(str, "node=", strlen("node="))) { if (i != status_info.begin()) { snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++); decode_arst(&temp, NULL, NULL, node_index_buf, 0); save_node_status(current, &temp); } if ((current = determine_node_from_str(str, parent, current)) == NULL) break; else continue; } if(current == NULL) continue; /* process the gpu status information separately */ if (!strcmp(CRAY_GPU_STATUS_START, str)) { rc = process_gpu_status(current, i,status_info.end()); str = i->c_str(); continue; } else if (!strncmp(reservation_id, str, strlen(reservation_id))) { const char *just_rsv_id = str + strlen(reservation_id); if (get_value_hash(rsv_ht, just_rsv_id) == -1) { add_hash(rsv_ht, 1, strdup(just_rsv_id)); /* sub-functions will attempt to lock a job, so we must unlock the * reporter node */ unlock_node(parent, __func__, NULL, LOGLEVEL); process_reservation_id(current, str); current_node_id = strdup(current->nd_name); unlock_node(current, __func__, NULL, LOGLEVEL); /* re-lock the parent */ if ((parent = 
find_nodebyname(nd_name)) == NULL) { /* reporter node disappeared - this shouldn't be possible */ log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation"); free_arst(&temp); free_all_keys(rsv_ht); free_hash(rsv_ht); free(current_node_id); return(PBSE_NONE); } if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL) { /* current node disappeared, this shouldn't be possible either */ unlock_node(parent, __func__, NULL, LOGLEVEL); snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation", current_node_id); log_err(PBSE_UNKNODE, __func__, log_buf); free_arst(&temp); free_all_keys(rsv_ht); free_hash(rsv_ht); free(current_node_id); return(PBSE_NONE); } free(current_node_id); current_node_id = NULL; } } /* save this as is to the status strings */ else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE) { free_arst(&temp); free_all_keys(rsv_ht); free_hash(rsv_ht); return(rc); } /* perform any special processing */ if (!strncmp(str, cproc_eq, ac_cproc_eq_len)) { set_ncpus(current, parent, str); } else if (!strncmp(str, state, strlen(state))) { set_state(current, str); } } /* END processing the status update */ if (current != NULL) { snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++); decode_arst(&temp, NULL, NULL, node_index_buf, 0); save_node_status(current, &temp); unlock_node(current, __func__, NULL, LOGLEVEL); } unlock_node(parent, __func__, NULL, LOGLEVEL); free_all_keys(rsv_ht); free_hash(rsv_ht); return(PBSE_NONE); } /* END process_alps_status() */
void req_stat_node( struct batch_request *preq) /* ptr to the decoded request */ { char *name; struct pbsnode *pnode = NULL; struct batch_reply *preply; svrattrl *pal; int rc = 0; int type = 0; int i; struct prop props; char *id = "req_stat_node"; /* * first, check that the server indeed has a list of nodes * and if it does, validate the name of the requested object-- * either name is that of a specific node, or name[0] is null/@ * meaning request is for all nodes in the server's jurisdiction */ if (LOGLEVEL >= 6) { log_record( PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, id, "entered"); } if ((pbsndmast == NULL) || (svr_totnodes <= 0)) { req_reject(PBSE_NONODES, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file"); return; } name = preq->rq_ind.rq_status.rq_id; if ((*name == '\0') || (*name == '@')) { type = 1; } else if ((*name == ':') && (*(name + 1) != '\0')) { if (!strcmp(name + 1, "ALL")) { type = 1; /* psuedo-group for all nodes */ } else { type = 2; props.name = name + 1; props.mark = 1; props.next = NULL; } } else { pnode = find_nodebyname(name); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, "cannot locate specified node"); return; } } preply = &preq->rq_reply; preply->brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preply->brp_un.brp_status); if (type == 0) { /* get status of the named node */ rc = status_node(pnode, preq, &preply->brp_un.brp_status); } else { /* get status of all or several nodes */ for (i = 0;i < svr_totnodes;i++) { pnode = pbsndmast[i]; if ((type == 2) && !hasprop(pnode, &props)) continue; if ((rc = status_node(pnode, preq, &preply->brp_un.brp_status)) != 0) break; } } if (!rc) { /* SUCCESS */ reply_send(preq); } else { if (rc != PBSE_UNKNODEATR) { req_reject(rc, 0, preq, NULL, NULL); } else { pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); reply_badattr(rc, bad, pal, preq); } } return; } /* END req_stat_node() */
/************************************************* * svr_is_request * * Return: svr_is_request always returns a non-zero value * and it must call close_conn to close the connection * before returning. PBSE_SOCKET_CLOSE is the code * for a successful return. But which ever retun * code is iused it must terminate the while loop * in start_process_pbs_server_port. *************************************************/ int svr_is_request( struct tcp_chan *chan, int version) { int command = 0; int ret = DIS_SUCCESS; int i; int err; char nodename[PBS_MAXHOSTNAME]; int perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR; unsigned long ipaddr; unsigned short mom_port; unsigned short rm_port; unsigned long tmpaddr; struct sockaddr_in *addr = NULL; struct sockaddr s_addr; unsigned int len = sizeof(s_addr); struct pbsnode *node = NULL; char *node_name = NULL; char log_buf[LOCAL_LOG_BUF_SIZE+1]; command = disrsi(chan, &ret); if (ret != DIS_SUCCESS) goto err; if (LOGLEVEL >= 4) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from sock %d (version %d)", chan->sock, version); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } if (getpeername(chan->sock, &s_addr, &len) != 0) { close_conn(chan->sock, FALSE); log_err(errno,__func__, (char *)"Cannot get socket name using getpeername\n"); return(PBSE_SOCKET_CLOSE); } addr = (struct sockaddr_in *)&s_addr; if (version != IS_PROTOCOL_VER) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s", version, netaddr(addr)); log_err(-1, __func__, log_buf); close_conn(chan->sock, FALSE); return PBSE_SOCKET_DATA; } /* check that machine is known */ mom_port = disrsi(chan, &ret); rm_port = disrsi(chan, &ret); if (LOGLEVEL >= 3) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from addr %s: mom_port %d - rm_port %d", netaddr(addr), mom_port, rm_port); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } ipaddr = ntohl(addr->sin_addr.s_addr); if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != 
NULL) { lock_node(node, __func__, "AVL_find", LOGLEVEL); } /* END if AVL_find != NULL) */ else if (allow_any_mom) { char *name = get_cached_nameinfo(addr); if (name != NULL) snprintf(nodename, sizeof(nodename), "%s", name); else if (getnameinfo(&s_addr, len, nodename, sizeof(nodename)-1, NULL, 0, 0) != 0) { tmpaddr = ntohl(addr->sin_addr.s_addr); sprintf(nodename, "0x%lX", tmpaddr); } else insert_addr_name_info(nodename, NULL, addr); err = create_partial_pbs_node(nodename, ipaddr, perm); if (err == PBSE_NONE) { node = AVL_find(ipaddr, 0, ipaddrs); lock_node(node, __func__, "no error", LOGLEVEL); } } if (node == NULL) { /* node not listed in trusted ipaddrs list */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)", netaddr(addr)); if (LOGLEVEL >= 2) { log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); } else { log_err(-1, __func__, log_buf); } close_conn(chan->sock, FALSE); return PBSE_SOCKET_CLOSE; } if (LOGLEVEL >= 3) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message %s (%d) received from mom on host %s (%s) (sock %d)", PBSServerCmds2[command], command, node->nd_name, netaddr(addr), chan->sock); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } switch (command) { case IS_NULL: /* a ping from server */ DBPRT(("%s: IS_NULL\n", __func__)) break; case IS_UPDATE: DBPRT(("%s: IS_UPDATE\n", __func__)) i = disrui(chan, &ret); if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_UPDATE error %d on node %s\n", ret, node->nd_name); log_err(ret, __func__, log_buf); } goto err; } DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->nd_name, i)) update_node_state(node, i); if ((node->nd_state & INUSE_DOWN) != 0) { node->nd_mom_reported_down = TRUE; } break; case IS_STATUS: if (LOGLEVEL >= 2) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS received from %s", node->nd_name); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, 
__func__, log_buf); } if ((node_name = strdup(node->nd_name)) == NULL) goto err; unlock_node(node, __func__, "before is_stat_get", LOGLEVEL); ret = is_stat_get(node_name, chan); node = find_nodebyname(node_name); if (ret == SEND_HELLO) { struct hello_info *hi = (struct hello_info *)calloc(1, sizeof(struct hello_info)); write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS); hi->name = strdup(node_name); enqueue_threadpool_request(send_hierarchy_threadtask, hi); ret = DIS_SUCCESS; } else write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret); if(node != NULL) node->nd_stream = -1; if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS error %d on node %s", ret, node_name); log_err(ret, __func__, log_buf); } free(node_name); goto err; } free(node_name); break; default: snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unknown command %d sent from %s", command, node->nd_name); log_err(-1, __func__, log_buf); goto err; break; } /* END switch (command) */ /* must be closed because mom opens and closes this connection each time */ close_conn(chan->sock, FALSE); if(node != NULL) unlock_node(node, __func__, "close", LOGLEVEL); return PBSE_SOCKET_CLOSE; err: /* a DIS write error has occurred */ if (node != NULL) { if (LOGLEVEL >= 1) { DBPRT(("%s: error processing node %s\n", __func__, node->nd_name)) } sprintf(log_buf, "%s from %s(%s)", dis_emsg[ret], node->nd_name, netaddr(addr)); unlock_node(node, __func__, "err", LOGLEVEL); } else {
/*
 * process_alps_status()
 *
 * Applies a list of status strings, reported through the node named nd_name,
 * to the nodes those strings describe.
 *
 * The reporter ("parent") is looked up by name; if it cannot be found the
 * update is ignored. Each "node=" entry switches the node currently being
 * updated ("current", chosen by determine_node_from_str()); the strings that
 * follow are accumulated into a comma separated buffer and handed to
 * save_node_status() when the next "node=" entry (or the end of the list) is
 * reached. GPU sections and reservation ids are dispatched to their own
 * handlers and are not added to the accumulated buffer.
 *
 * @param nd_name     - name of the reporting node
 * @param status_info - the status strings to process
 * @return PBSE_NONE on every path visible here
 */
int process_alps_status(

  const char               *nd_name,
  std::vector<std::string> &status_info)

{
  const char     *ccu_p = NULL;              /* last compute-unit string seen for the current node */
  char           *current_node_id = NULL;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
#ifdef PENABLE_LINUX_CGROUPS
  int             numa_nodes = 0;
  int             sockets = 0;
#endif
  std::string     temp;                      /* accumulated status for the current node */
  /* reservation ids already handled during this update */
  container::item_container<const char *> rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (unsigned int i = 0; i < status_info.size(); i++)
  {
    const char *str = status_info[i].c_str();

    if (!strncmp(str, "node=", strlen("node=")))
    {
      /* a new node section starts: flush what was gathered for the previous one */
      if (i != 0)
      {
        if (current != NULL)
          save_node_status(current, temp);

        temp.clear();
      }

      /* stop processing entirely if the node cannot be determined */
      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
      {
#ifdef PENABLE_LINUX_CGROUPS
        /* topology counters are gathered per node */
        sockets = 0;
        numa_nodes = 0;
#endif
        continue;
      }
    }

    /* nothing to attach the string to until a node has been selected */
    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
    {
      /* i is passed along with the list - presumably so the helper can
       * advance past the gpu section; confirm against process_gpu_status() */
      process_gpu_status(current, i, status_info);

      continue;
    }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
    {
      const char *just_rsv_id = str + strlen(reservation_id);

      rsv_ht.lock();

      /* only handle each reservation id once per update */
      if (rsv_ht.find(just_rsv_id) == NULL)
      {
        rsv_ht.insert(just_rsv_id,just_rsv_id);
        rsv_ht.unlock();

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        parent->unlock_node(__func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        /* remember the name so the node can be found again after it is released */
        current_node_id = strdup(current->get_name());
        current->unlock_node(__func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
        {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free(current_node_id);
          return(PBSE_NONE);
        }

        if ((current = find_node_in_allnodes(parent->alps_subnodes, current_node_id)) == NULL)
        {
          /* current node disappeared, this shouldn't be possible either */
          parent->unlock_node(__func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free(current_node_id);
          return(PBSE_NONE);
        }

        free(current_node_id);
        current_node_id = NULL;
      }
      else
      {
        rsv_ht.unlock();
      }
    }
    /* save this as is to the status strings */
    else
    {
      if (temp.size() > 0)
        temp += ",";

      temp += str;
    }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
    {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cprocs_eq (CPROCS=) */
      /* for the node */
      ccu_p = str;
    }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
    {
      int  ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       * 0 - Let ALPS choose whether or not to use Hyper-Threaded cores
       *     (report all cores)
       * 1 - Do not use Hyper-Threaded cores
       *     (report only physical core (compute unit count)
       * 2 - Use Hyper-Threaded cores
       *     (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
      {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
      }
      else
      {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
      }

      set_ncpus(current, parent, ncpus);

#ifdef PENABLE_LINUX_CGROUPS
      /* fall back to one numa node when none was reported for this node */
      if (numa_nodes == 0)
        numa_nodes = 1;

      /* (re)build the layout when it is missing or its thread count no
       * longer matches the node's execution slots */
      if ((current->nd_layout.is_initialized() == false) ||
          (current->nd_layout.getTotalThreads() != current->nd_slots.get_total_execution_slots()))
      {
        Machine m(current->nd_slots.get_total_execution_slots(), numa_nodes, sockets);
        current->nd_layout = m;
      }
#endif
    }
    else if (!strncmp(str, state, strlen(state)))
    {
      set_state(current, str);
    }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "totmem", 6))
    {
      set_total_memory(current, str);
    }
    /* NOTE(review): only 10 characters are compared although the value is
     * read at offset 11 - confirm that numas is "numa_nodes=" and whether the
     * '=' should be part of the comparison */
    else if (!strncmp(str, numas, 10))
    {
      // 11 is strlen("numa_nodes=")
      numa_nodes = strtol(str + 11, NULL, 10);
    }
    else if (!strncmp(str, "socket", 6))
    {
      // 7 is strlen("socket=")
      sockets = strtol(str + 7, NULL, 10);
    }
#endif
  } /* END processing the status update */

  /* flush whatever was gathered for the last node and release it */
  if (current != NULL)
  {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
  }

  parent->unlock_node(__func__, NULL, LOGLEVEL);

  return(PBSE_NONE);
} /* END process_alps_status() */
/************************************************* * svr_is_request * * Return: svr_is_request always returns a non-zero value * and it must call close_conn to close the connection * before returning. PBSE_SOCKET_CLOSE is the code * for a successful return. But which ever retun * code is iused it must terminate the while loop * in start_process_pbs_server_port. *************************************************/ void *svr_is_request( void *v) { int command = 0; int ret = DIS_SUCCESS; int i; int err; char nodename[PBS_MAXHOSTNAME]; int perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR; unsigned long ipaddr; unsigned short mom_port; unsigned short rm_port; unsigned long tmpaddr; struct sockaddr_in addr; struct pbsnode *node = NULL; char log_buf[LOCAL_LOG_BUF_SIZE+1]; char msg_buf[80]; char tmp[80]; int version; struct tcp_chan *chan; long *args; is_request_info *isr = (is_request_info *)v; if (isr == NULL) return(NULL); chan = isr->chan; args = isr->args; version = disrsi(chan, &ret); if (ret != DIS_SUCCESS) { log_err(-1, __func__, "Cannot read version - skipping this request.\n"); close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } command = disrsi(chan, &ret); if (ret != DIS_SUCCESS) { snprintf(log_buf, sizeof(log_buf), "could not read command: %d", ret); log_err(-1, __func__, log_buf); close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } if (LOGLEVEL >= 4) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from sock %d (version %d)", chan->sock, version); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } /* Just a note to let us know we only do IPv4 for now */ addr.sin_family = AF_INET; memcpy(&addr.sin_addr, (void *)&args[1], sizeof(struct in_addr)); addr.sin_port = args[2]; if (version != IS_PROTOCOL_VER) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s", version, msg_buf); log_err(-1, __func__, log_buf); 
close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } /* check that machine is known */ mom_port = disrsi(chan, &ret); rm_port = disrsi(chan, &ret); if (LOGLEVEL >= 3) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from addr %s: mom_port %d - rm_port %d", msg_buf, mom_port, rm_port); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } ipaddr = args[1]; if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL) { node->lock_node(__func__, "AVL_find", LOGLEVEL); } /* END if AVL_find != NULL) */ else if (allow_any_mom) { const char *name = get_cached_nameinfo(&addr); if (name != NULL) snprintf(nodename, sizeof(nodename), "%s", name); else if (getnameinfo((struct sockaddr *)&addr, sizeof(addr), nodename, sizeof(nodename)-1, NULL, 0, 0) != 0) { tmpaddr = ntohl(addr.sin_addr.s_addr); sprintf(nodename, "0x%lX", tmpaddr); } else insert_addr_name_info(NULL, nodename); err = create_partial_pbs_node(nodename, ipaddr, perm); if (err == PBSE_NONE) { node = AVL_find(ipaddr, 0, ipaddrs); node->lock_node(__func__, "no error", LOGLEVEL); } } if (node == NULL) { /* node not listed in trusted ipaddrs list */ netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)", msg_buf); if (LOGLEVEL >= 2) { log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); } else { log_err(-1, __func__, log_buf); } close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } if (LOGLEVEL >= 3) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message %s (%d) received from mom on host %s (%s) (sock %d)", PBSServerCmds2[command], command, node->get_name(), msg_buf, chan->sock); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } mutex_mgr 
node_mutex(&node->nd_mutex, true); switch (command) { case IS_NULL: /* a ping from server */ DBPRT(("%s: IS_NULL\n", __func__)) break; case IS_UPDATE: DBPRT(("%s: IS_UPDATE\n", __func__)) i = disrui(chan, &ret); if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_UPDATE error %d on node %s\n", ret, node->get_name()); log_err(ret, __func__, log_buf); } goto err; } DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->get_name(), i)) update_node_state(node, i); if ((node->nd_state & INUSE_DOWN) != 0) { node->nd_mom_reported_down = TRUE; } break; case IS_STATUS: { std::string node_name = node->get_name(); if (LOGLEVEL >= 2) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS received from %s", node->get_name()); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } node_mutex.unlock(); ret = is_stat_get(node_name.c_str(), chan); node = find_nodebyname(node_name.c_str()); if (node != NULL) { node->nd_stream = -1; node_mutex.mark_as_locked(); if (ret == SEND_HELLO) { //struct hello_info *hi = new hello_info(node->nd_id); write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS); hierarchy_handler.sendHierarchyToANode(node); ret = DIS_SUCCESS; } else write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret); } if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS error %d on node %s", ret, node_name.c_str()); log_err(ret, __func__, log_buf); } goto err; } break; } default: snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unknown command %d sent from %s", command, node->get_name()); log_err(-1, __func__, log_buf); goto err; break; } /* END switch (command) */ /* must be closed because mom opens and closes this connection each time */ close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); err: /* a DIS write error has occurred */ if (node != NULL) { if (LOGLEVEL >= 1) { DBPRT(("%s: error processing node %s\n", __func__, node->get_name())) } 
netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); sprintf(log_buf, "%s from %s(%s)", dis_emsg[ret], node->get_name(), msg_buf); } else {
int req_gpuctrl_svr( struct batch_request *preq) { int rc = PBSE_NONE; char *nodename = NULL; char *gpuid = NULL; int gpumode = -1; int reset_perm = -1; int reset_vol = -1; char log_buf[LOCAL_LOG_BUF_SIZE+1]; int local_errno = 0; struct pbsnode *pnode = NULL; int gpuidx = -1; int conn; if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0) { rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); return rc; } nodename = preq->rq_ind.rq_gpuctrl.rq_momnode; gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid; gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode; reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm; reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol; if (LOGLEVEL >= 7) { sprintf( log_buf, "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); log_ext(-1, __func__, log_buf, LOG_INFO); } /* validate mom node exists */ pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return PBSE_UNKNODE; } /* validate that the node is up */ if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING)) { rc = PBSE_UNKREQ; sprintf(log_buf,"Node %s is not available",pnode->nd_name); req_reject(rc, 0, preq, NULL, log_buf); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that the node has real gpus not virtual */ if (!pnode->nd_gpus_real) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate the gpuid exists */ if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that we 
have a real request */ if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "No action specified"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* for mode changes validate the mode with the driver_version */ if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* we need to relay request to the mom for processing */ /* have MOM attempt to change the gpu mode */ preq->rq_orgconn = preq->rq_conn; /* restore client socket */ unlock_node(pnode, __func__, NULL, LOGLEVEL); conn = svr_connect( pnode->nd_addrs[0], pbs_mom_port, &local_errno, NULL, NULL); if (conn >= 0) { if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE) req_reject(rc, 0, preq, NULL, NULL); else process_gpu_request_reply(preq); } else { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom"); } return rc; }
void req_stat_node(struct batch_request *preq) { char *name; struct batch_reply *preply; svrattrl *pal; struct pbsnode *pnode = NULL; int rc = 0; int type = 0; int i; /* * first, check that the server indeed has a list of nodes * and if it does, validate the name of the requested object-- * either name is that of a spedific node, or name[0] is null/@ * meaning request is for all nodes in the server's jurisdiction */ if (pbsndlist == 0 || svr_totnodes <= 0) { req_reject(PBSE_NONODES, 0, preq); return; } resc_access_perm = preq->rq_perm; name = preq->rq_ind.rq_status.rq_id; if ((*name == '\0') || (*name =='@')) type = 1; else { pnode = find_nodebyname(name); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq); return; } } preply = &preq->rq_reply; preply->brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preply->brp_un.brp_status); if (type == 0) { /* get status of the named node */ rc = status_node(pnode, preq, &preply->brp_un.brp_status); } else { /* get status of all nodes */ for (i = 0; i < svr_totnodes; i++) { pnode = pbsndlist[i]; rc = status_node(pnode, preq, &preply->brp_un.brp_status); if (rc) break; } } if (!rc) { (void)reply_send(preq); } else { if (rc != PBSE_UNKNODEATR) req_reject(rc, 0, preq); else { pal = (svrattrl *)GET_NEXT(preq->rq_ind. rq_status.rq_attr); reply_badattr(rc, bad, pal, preq); } } }
void mgr_node_modify( struct batch_request *preq) /* I */ { int need_todo = 0; int rc; int bad = 0; const char *nodename = NULL; svrattrl *plist; node_check_info nci; struct pbsnode *pnode = NULL; nodename = preq->rq_ind.rq_manager.rq_objname; pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return; } plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr); save_characteristic(pnode,&nci); rc = mgr_modify_node( &pnode, node_attr_def, ND_ATR_LAST, plist, preq->rq_perm, &bad, ATR_ACTION_ALTER); if (rc != 0) { /* In the specific node case, reply w/ error and return*/ switch (rc) { case PBSE_INTERNAL: case PBSE_SYSTEM: req_reject(rc, bad, preq, NULL, NULL); break; case PBSE_NOATTR: case PBSE_ATTRRO: case PBSE_MUTUALEX: case PBSE_BADNDATVAL: reply_badattr(rc, bad, plist, preq); break; default: req_reject(rc, 0, preq, NULL, NULL); break; } if(pnode != NULL) { unlock_node(pnode, "mgr_node_set", (char *)"error", LOGLEVEL); pnode = NULL; } return; } /* END if (rc != 0) */ else { /* modifications succeeded for this node */ if(pnode != NULL) { chk_characteristic(pnode, &nci, &need_todo); } } if(pnode != NULL) { unlock_node(pnode, "mgr_node_set", (char *)"single_node", LOGLEVEL); pnode = NULL; } if (need_todo & WRITENODE_STATE) { /*some nodes set to "offline"*/ write_node_state(); need_todo &= ~(WRITENODE_STATE); } if (need_todo & WRITENODE_POWER_STATE) { /*some nodes changed power state*/ write_node_power_state(); need_todo &= ~(WRITENODE_POWER_STATE); } if (need_todo & WRITENODE_NOTE) { /*some nodes have new "note"s*/ write_node_note(); need_todo &= ~(WRITENODE_NOTE); } if (need_todo & WRITE_NEW_NODESFILE) { /*create/delete/prop/ntype change*/ if (!update_nodes_file(NULL)) need_todo &= ~(WRITE_NEW_NODESFILE); /*successful on update*/ } recompute_ntype_cnts(); reply_ack(preq); /*request completely successful*/ return; } /* END void mgr_node_set() */
/*
 * process_alps_status()
 *
 * Applies a status update, reported through the node named nd_name, to the
 * nodes the update describes.
 *
 * status_info->str holds consecutive NUL-terminated strings ended by an empty
 * string. Each "node=" string switches the node being updated ("current");
 * the strings that follow are collected in a temporary attribute and saved to
 * that node when the next "node=" string, or the end of the update, is
 * reached. GPU sections and reservation ids are dispatched to their own
 * handlers.
 *
 * @param nd_name     - name of the reporting node
 * @param status_info - the packed status strings
 * @return PBSE_NONE, or the decode_arst() error code when the temporary
 *         attribute cannot be initialized or a status string cannot be added
 */
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

{
  char           *str;
  char           *ccu_p = NULL;             /* last compute-unit string seen for the current node */
  char           *current_node_id = NULL;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;                     /* status gathered for the current node */
  hash_table_t   *rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
  {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
  }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* keep track of reservations so that they're only processed once per update */
  rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
  {
    if (!strncmp(str, "node=", strlen("node=")))
    {
      /* a new node section starts: flush what was gathered for the previous one */
      if (str != status_info->str)
      {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);

        if (current != NULL)
          save_node_status(current, &temp);
      }

      /* stop processing entirely if the node cannot be determined */
      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
    }

    /* nothing to attach the string to until a node has been selected */
    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
    {
      process_gpu_status(current, &str);

      continue;
    }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
    {
      char *just_rsv_id = str + strlen(reservation_id);

      if (get_value_hash(rsv_ht, just_rsv_id) == -1)
      {
        add_hash(rsv_ht, 1, strdup(just_rsv_id));

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        unlock_node(parent, __func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        /* remember the name so the node can be found again after it is released */
        current_node_id = strdup(current->nd_name);
        unlock_node(current, __func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
        {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
        }

        if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL)
        {
          /* current node disappeared, this shouldn't be possible either */
          unlock_node(parent, __func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
        }

        free(current_node_id);
        current_node_id = NULL;
      }
    }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
    {
      /* both nodes are still locked at this point; release them the same
       * way the normal exit does so the failure does not leave them locked */
      unlock_node(current, __func__, NULL, LOGLEVEL);
      unlock_node(parent, __func__, NULL, LOGLEVEL);

      free_arst(&temp);
      free_all_keys(rsv_ht);
      free_hash(rsv_ht);
      return(rc);
    }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
    {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cprocs_eq (CPROCS=) */
      /* for the node */
      ccu_p = str;
    }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
    {
      int  ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       * 0 - Let ALPS choose whether or not to use Hyper-Threaded cores
       *     (report all cores)
       * 1 - Do not use Hyper-Threaded cores
       *     (report only physical core (compute unit count)
       * 2 - Use Hyper-Threaded cores
       *     (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
      {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
      }
      else
      {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
      }

      set_ncpus(current, parent, ncpus);
    }
    else if (!strncmp(str, state, strlen(state)))
    {
      set_state(current, str);
    }
  } /* END processing the status update */

  /* flush whatever was gathered for the last node and release it */
  if (current != NULL)
  {
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
  }

  unlock_node(parent, __func__, NULL, LOGLEVEL);

  free_all_keys(rsv_ht);
  free_hash(rsv_ht);

  return(PBSE_NONE);
} /* END process_alps_status() */
/*
 * site_check_user_map()
 *
 * Decides whether the owner of pjob, submitting from the job's originating
 * host, may run the job as local user luser.
 *
 * Access is granted outright when the submitting host is acceptable (the
 * server host with an unchanged user name, a known compute node when node
 * submission is allowed, or a host listed in submit_hosts) and either no
 * proxy user is requested or proxy users are allowed. Otherwise the decision
 * is left to the authorized-users ACL (MUNGE_AUTH builds) or to ruserok().
 *
 * @param pjob  - the job being checked (I)
 * @param luser - local user name the job wants to run as (I)
 * @param EMsg  - optional buffer of at least 1024 bytes receiving an error
 *                description (O)
 * @return 0 when access is allowed, -1 when the originating host is missing
 *         or the ACL check fails, otherwise the ruserok() result
 */
int site_check_user_map(

  job  *pjob,  /* I */
  char *luser, /* I */
  char *EMsg)  /* O (optional,minsize=1024) */

{
  char *orighost;
  char  owner[PBS_MAXUSER + 1];
  char *p1;
  char *p2;
  int   rc;

  int   ProxyAllowed = 0;
  int   ProxyRequested = 0;
  int   HostAllowed = 0;

  char *dptr;

#ifdef MUNGE_AUTH
  char  uh[PBS_MAXUSER + PBS_MAXHOSTNAME + 2];
#endif

  if (EMsg != NULL)
    EMsg[0] = '\0';

  /* get just the owner name, without the "@host" */
  /* the copy is bounded so an over-long owner cannot overrun owner[] */
  p1 = pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str;
  p2 = owner;

  while ((*p1 != '@') &&
         (*p1 != '\0') &&
         (p2 < owner + sizeof(owner) - 1))
    *p2++ = *p1++;

  *p2 = '\0';

  orighost = get_variable(pjob, pbs_o_host);

  if (orighost == NULL)
  {
    /* access denied */
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      msg_orighost);

    if (EMsg != NULL)
      strcpy(EMsg, "source host not specified");

    return(-1);
  }

  if ((server.sv_attr[(int)SRV_ATR_AllowProxyUser].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[(int)SRV_ATR_AllowProxyUser].at_val.at_long == 1))
  {
    ProxyAllowed = 1;
  }

  /* running as a different user than the owner is a proxy request */
  if (strcmp(owner, luser) != 0)
  {
    ProxyRequested = 1;
  }

  if (!strcmp(orighost, server_host) && !strcmp(owner, luser))
  {
    /* submitting from server host, access allowed */
    if ((ProxyRequested == 0) || (ProxyAllowed == 1))
    {
      return(0);
    }

    /* host is fine, must validate proxy via ruserok() */
    HostAllowed = 1;
  }

  /* make short host name */
  /* orighost is shortened in place; every exit below restores the '.' */
  if ((dptr = strchr(orighost, '.')) != NULL)
  {
    *dptr = '\0';
  }

  /* NOTE(review): the node returned by find_nodebyname() is discarded here;
   * if this build returns it locked, it is never unlocked - confirm */
  if ((HostAllowed == 0) &&
      (server.sv_attr[SRV_ATR_AllowNodeSubmit].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_AllowNodeSubmit].at_val.at_long == 1) &&
      (find_nodebyname(orighost) != NULL))
  {
    /* job submitted from compute host, access allowed */
    if (dptr != NULL)
      *dptr = '.';

    if ((ProxyRequested == 0) || (ProxyAllowed == 1))
    {
      return(0);
    }

    /* host is fine, must validate proxy via ruserok() */
    HostAllowed = 1;
  }

  if ((HostAllowed == 0) &&
      (server.sv_attr[(int)SRV_ATR_SubmitHosts].at_flags & ATR_VFLAG_SET))
  {
    struct array_strings *submithosts = NULL;
    char                 *testhost;
    int                   hostnum = 0;

    submithosts = server.sv_attr[(int)SRV_ATR_SubmitHosts].at_val.at_arst;

    for (hostnum = 0;hostnum < submithosts->as_usedptr;hostnum++)
    {
      testhost = submithosts->as_string[hostnum];

      if (!strcasecmp(testhost, orighost))
      {
        /* job submitted from host found in trusted submit host list, access allowed */
        if (dptr != NULL)
          *dptr = '.';

        if ((ProxyRequested == 0) || (ProxyAllowed == 1))
        {
          return(0);
        }

        /* host is fine, must validate proxy via ruserok() */
        HostAllowed = 1;

        break;
      }
    } /* END for (hostnum) */
  } /* END if (SRV_ATR_SubmitHosts) */

  if (dptr != NULL)
    *dptr = '.';

#ifdef MUNGE_AUTH
  /* bounded: orighost comes from the job and may exceed PBS_MAXHOSTNAME */
  snprintf(uh, sizeof(uh), "%s@%s", owner, orighost);

  rc = acl_check(&server.sv_attr[SRV_ATR_authusers], uh, ACL_User_Host);

  if (rc <= 0)
  {
    /* rc == 0 means we did not find a match.
       this is a failure */
    if (EMsg != NULL)
    {
      snprintf(EMsg, 1024, "could not authorize user %s from %s",
        owner, orighost);
    }

    rc = -1; /* -1 is what set_jobexid is expecting for a failure*/
  }
  else
  {
    /*SUCCESS*/
    rc = 0; /* the call to ruserok below was in the code first. ruserok returns
               0 on success but acl_check returns a positive value on success.
               We set rc to 0 to be consistent with the original ruserok functionality */
  }

#else
  rc = ruserok(orighost, 0, owner, luser);

  if (rc != 0 && EMsg != NULL)
  {
    /* Test rc so as to not fill this message in the case of success, since other
     * callers might not fill this message in the case of their errors and
     * very misleading error message will go into the logs.
     */
    snprintf(EMsg, 1024, "ruserok failed validating %s/%s from %s",
      owner, luser, orighost);
  }

#endif

#ifdef sun
  /* broken Sun ruserok() sets process so it appears to be owned */
  /* by the luser, change it back for cosmetic reasons */
  setuid(0);

#endif /* sun */

  return(rc);
} /* END site_check_user_map() */
/*
 * process_status_info()
 *
 * Applies a list of status strings, sent by the node named nd_name, to that
 * node and to any other node or numa node the strings switch to.
 *
 * Strings are collected in a temporary attribute and saved to the node being
 * updated ("current") whenever a NUMA_KEYWORD or "node=" string switches to
 * another node and once more after the last string. Selected strings (state,
 * uname, message, macaddr, jobdata, jobs, ncpus) additionally trigger
 * dedicated handlers, some of them depending on server attributes read at
 * the top of the function.
 *
 * @param nd_name     - name of the node that sent the update
 * @param status_info - the status strings to process
 * @return the decode_arst() error code when the temporary attribute cannot be
 *         initialized or a string cannot be added; PBSE_NONE when the node is
 *         unknown; SEND_HELLO when processing succeeded and the update
 *         contained "first_update=true"; otherwise PBSE_NONE
 */
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

{
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;  /* set to suppress "state" handling for the current node */
  pbs_attribute   temp;                       /* status gathered for the current node */
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
  {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");

    return(rc);
  }

  /* if original node cannot be found do not process the update */
  /* NOTE(review): temp is not released on this path - confirm that the
   * initializing decode_arst() call above allocates nothing */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
  {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
    {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
    }
  }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
  {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
    {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      /* stop processing entirely if the numa node cannot be determined */
      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
    }
    else if (!strncmp(str, "node=", strlen("node=")))
    {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      /* stop processing entirely if the node cannot be determined */
      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
      {
        if (current->nd_mom_reported_down == TRUE)
        {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
        }

        continue;
      }
    }
    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
    {
      is_gpustat_get(current, i, status_info);
      /* str is re-read at index i - presumably the helper advances i past
       * the gpu section; confirm against is_gpustat_get() */
      str = status_info[i].c_str();
    }
    else if (!strcmp(str, START_MIC_STATUS))
    {
      process_mic_status(current, i, status_info);
      /* same re-read as for the gpu section above */
      str = status_info[i].c_str();
    }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
    {
      /* only build the layout the first time one is reported */
      if (current->nd_layout == NULL)
      {
        current->nd_layout = new Machine(status_info[i]);
      }

      continue;
    }
#endif
    else if (!strcmp(str, "first_update=true"))
    {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;

      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
    }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
    {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      /* NOTE(review): temp is freed here, yet after the break it is still
       * passed to save_node_status() below - confirm that is safe */
      free_arst(&temp);

      break;
    }

    /* strings with dedicated handling in addition to the collection above */
    if (!strncmp(str, "state", 5))
    {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
    }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5)))
    {
      process_uname_str(current, str);
    }
    else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */
    {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
      {
        update_node_state(current, INUSE_DOWN);
        /* keep a later "state" string from undoing the down state */
        dont_change_state = TRUE;

        set_note_error(current, str);
      }
    }
    else if (!strncmp(str,"macaddr=",8))
    {
      update_node_mac_addr(current,str + 8);
    }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
    {
      /* update job attributes based on what the MOM gives us */
      update_job_data(current, str + strlen("jobdata="));
    }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
    {
      /* walk job list reported by mom */
      /* room for "<node name>:<job list>": the unused "jobs=" prefix in
       * strlen(str) more than covers the ':' and the terminator */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
      {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
      }
      else
      {
        /* one of the allocations failed: release whichever succeeded */
        if (jobstr != NULL)
        {
          free(jobstr);
        }

        if (sji != NULL)
        {
          free(sji);
        }
      }
    }
    else if (auto_np)
    {
      if (!(strncmp(str, "ncpus=", 6)))
      {
        handle_auto_np(current, str);
      }
    }
  } /* END processing strings */

  /* flush whatever was gathered for the last node and release it */
  if (current != NULL)
  {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
  }

  /* tell the caller the mom asked for a hello, unless an error occurred */
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;

  return(rc);
} /* END process_status_info() */
END_TEST START_TEST(find_nodebyname_test) { struct pbsnode node1; struct pbsnode node2; struct pbsnode reporter; struct pbsnode *pnode; alps_reporter = &reporter; memset(&node1, 0, sizeof(node1)); memset(&node2, 0, sizeof(node2)); node1.nd_name = (char *)"bob"; node2.nd_name = (char *)"tom"; alps_reporter->alps_subnodes = new all_nodes(); initialize_allnodes(&allnodes, &node1, &node2); initialize_allnodes(alps_reporter->alps_subnodes, &node1, &node2); cray_enabled = FALSE; pnode = find_nodebyname(NULL); fail_unless(pnode == NULL, "NULL nodename input fail"); pnode = find_nodebyname("george"); fail_unless(pnode == NULL, "george found but doesn't exist"); pnode = find_nodebyname("bob"); fail_unless(pnode == &node1, "couldn't find bob?"); pnode = find_nodebyname("tom"); fail_unless(pnode == &node2, "couldn't find tom?"); pnode = find_nodebyname(strdup("tom-0")); fail_unless(!strcmp(pnode->nd_name, "0"), "found an incorrect node name"); pnode = find_nodebyname(strdup("tom-1")); fail_unless(!strcmp(pnode->nd_name, "1"), "found an incorrect node name"); pnode = find_nodebyname(strdup("tom-10")); fail_unless(pnode == NULL, "found an incorrect node name"); pnode = find_nodebyname(strdup("bob/0")); fail_unless(pnode == &node1, "couldn't find bob with the exec_host format"); allnodes.lock(); allnodes.clear(); allnodes.unlock(); cray_enabled = TRUE; pnode = find_nodebyname("tom"); fail_unless(pnode == &node2, "couldn't find tom?"); cray_enabled = TRUE; pnode = find_nodebyname("bob"); fail_unless(pnode == &node1, "couldn't find bob?"); cray_enabled = TRUE; pnode = find_nodebyname("george"); fail_unless(pnode == NULL, "george found but doesn't exist"); alps_reporter = NULL; }
END_TEST START_TEST(find_nodebyname_test) { struct pbsnode node1; struct pbsnode node2; struct pbsnode reporter; struct pbsnode *pnode; alps_reporter = &reporter; node1.change_name("bob"); node2.change_name("tom"); alps_reporter->alps_subnodes = new all_nodes(); initialize_allnodes(&allnodes, &node1, &node2); initialize_allnodes(alps_reporter->alps_subnodes, &node1, &node2); cray_enabled = false; pnode = find_nodebyname(NULL); fail_unless(pnode == NULL, "NULL nodename input fail"); pnode = find_nodebyname("george"); fail_unless(pnode == NULL, "george found but doesn't exist"); pnode = find_nodebyname("bob"); fail_unless(pnode == &node1, "couldn't find bob?"); pnode->unlock_node("a", "b", 0); pnode = find_nodebyname("tom"); fail_unless(pnode == &node2, "couldn't find tom?"); pnode->unlock_node("a", "b", 0); pnode = find_nodebyname(strdup("tom-0")); fail_unless(!strcmp(pnode->get_name(), "0"), "found an incorrect node name"); pnode->unlock_node("a", "b", 0); pnode = find_nodebyname(strdup("tom-1")); fail_unless(!strcmp(pnode->get_name(), "1"), "found an incorrect node name"); pnode->unlock_node("a", "b", 0); pnode = find_nodebyname(strdup("tom-10")); fail_unless(pnode == NULL, "found an incorrect node name"); pnode = find_nodebyname(strdup("bob/0")); fail_unless(pnode == &node1, "couldn't find bob with the exec_host format"); pnode->unlock_node("a", "b", 0); allnodes.lock(); allnodes.clear(); allnodes.unlock(); cray_enabled = true; pnode = find_nodebyname("tom"); fail_unless(pnode == &node2, "couldn't find tom?"); cray_enabled = true; pnode = find_nodebyname("bob"); fail_unless(pnode == &node1, "couldn't find bob?"); cray_enabled = true; pnode = find_nodebyname("george"); fail_unless(pnode == NULL, "george found but doesn't exist"); alps_reporter = NULL; }