static void wtr_state_change(struct aps_controller *aps) { // to idle state // in the last version, wtr to idle occurs in wtr sub state machine // but now i remove it here if (is_rcv_nr_idle_both_sides(aps)) { fill_nr_idle(aps); exit_cur_sw_state(aps); wtr_state_exit(aps); set_state(aps, PRIM_IDLE); update_node_state(aps, IDLE); } else if (is_to_pass(aps)) { exit_cur_sw_state(aps); wtr_state_exit(aps); set_state(aps, PRIM_PASS); update_node_state(aps, PASS); } else if (is_to_wtr(aps)) {// keep in wtr process set_state(aps, PRIM_WTR); wtr_state_run(aps); } else if (is_to_switch(aps)) { drop_switch_if_occupy(aps); set_state(aps, PRIM_SWITCH); wtr_state_exit(aps); //exit wtr state firstly sw_state_run(aps); // go to switch state } else if (is_to_k_pass(aps)) { exit_cur_sw_state(aps); wtr_state_exit(aps); set_state(aps, PRIM_K_PASS); update_node_state(aps, k_PASS); } else { assert(0); } }
static void switch_state_change(struct aps_controller *aps) { // to idle state if (is_only_rcv_brq_it_sourcing_both_sides(aps)) { fill_nr_idle(aps); exit_cur_sw_state(aps); // exit switch and it's sub states set_state(aps, PRIM_IDLE); update_node_state(aps, IDLE); } else if (is_to_pass(aps)) { exit_cur_sw_state(aps); // exit switch and it's sub states set_state(aps, PRIM_PASS); update_node_state(aps, PASS); } else if (is_to_wtr(aps)) { // note: here need not exit switch and it's sub states // because switch state should be reserved for returning from wtr some time. // goto next layer of wtr sm set_state(aps, PRIM_WTR); wtr_state_run(aps); } else if (is_to_switch(aps)) { drop_switch_if_occupy(aps); set_state(aps, PRIM_SWITCH); sw_state_run(aps); // go to next layer } else if (is_to_k_pass(aps)) { exit_cur_sw_state(aps); // exit switch and it's sub states set_state(aps, PRIM_K_PASS); update_node_state(aps, k_PASS); } else { assert(0); } }
static void k_pass_state_change(struct aps_controller *aps) { // to idle state if (is_rcv_nr_idle_both_sides(aps)) { fill_nr_idle(aps); set_state(aps, PRIM_IDLE); update_node_state(aps, IDLE); } else if (is_to_switch(aps)) { drop_switch_if_occupy(aps); set_state(aps, PRIM_SWITCH); sw_state_run(aps); // go to next layer } else if (is_to_pass(aps)) { set_state(aps, PRIM_PASS); update_node_state(aps, PASS); } else { assert(0); } }
/*
 * set_state()
 *
 * Applies a "state=<VALUE>" string to a node.
 *
 * @param pnode - node whose state is updated
 * @param str   - string of the form "state=UP", "state=DOWN" or "state=BUSY"
 *
 * "UP" maps to INUSE_FREE, "DOWN" to INUSE_DOWN and "BUSY" to INUSE_BUSY.
 * Any other value, a NULL string, or a string that does not begin with
 * "state=" leaves the node untouched.
 *
 * @return PBSE_NONE in every case
 */

int set_state(

  struct pbsnode *pnode,
  char           *str)

  {
  static const char  prefix[] = "state=";
  const size_t       prefix_len = sizeof(prefix) - 1;
  const char        *state_str;

  /* Never step past the end of a short string: strncmp() stops at the
   * terminating NUL, so this check is safe for any NUL-terminated input. */
  if ((str == NULL) ||
      (strncmp(str, prefix, prefix_len) != 0))
    return(PBSE_NONE);

  state_str = str + prefix_len;

  if (!strcmp(state_str, "UP"))
    update_node_state(pnode, INUSE_FREE);
  else if (!strcmp(state_str, "DOWN"))
    update_node_state(pnode, INUSE_DOWN);
  else if (!strcmp(state_str, "BUSY"))
    update_node_state(pnode, INUSE_BUSY);

  return(PBSE_NONE);
  } /* END set_state() */
/*
 * _gtk_check_menu_item_set_active:
 * @check_menu_item: a #GtkCheckMenuItem
 * @is_active: whether the action is active or not
 *
 * Stores @is_active straight into the private "active" field and then
 * refreshes the node state. No signal is emitted and no property
 * notification is sent from here; the caller is responsible for both.
 */
void
_gtk_check_menu_item_set_active (GtkCheckMenuItem *check_menu_item,
                                 gboolean          is_active)
{
  check_menu_item->priv->active = is_active;

  update_node_state (check_menu_item);
}
int process_state_str( struct pbsnode *np, const char *str) { char log_buf[LOCAL_LOG_BUF_SIZE]; int rc = PBSE_NONE; if (!strncmp(str, "state=down", 10)) { update_node_state(np, INUSE_DOWN); } else if (!strncmp(str, "state=busy", 10)) { update_node_state(np, INUSE_BUSY); } else if (!strncmp(str, "state=free", 10)) { update_node_state(np, INUSE_FREE); } else { sprintf(log_buf, "unknown %s from node %s", str, (np->nd_name != NULL) ? np->nd_name : "NULL"); log_err(-1, __func__, log_buf); update_node_state(np, INUSE_UNKNOWN); } if (LOGLEVEL >= 9) { sprintf(log_buf, "node '%s' is at state '0x%x'\n", np->nd_name, np->nd_state); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } return(rc); } /* END process_state_str() */
/*
 * GtkWidgetClass::state_flags_changed handler: refreshes the node state of
 * the check menu item, then chains up to the parent class implementation.
 */
static void
gtk_check_menu_item_state_flags_changed (GtkWidget     *widget,
                                         GtkStateFlags  previous_state)
{
  GtkWidgetClass *parent_class = GTK_WIDGET_CLASS (gtk_check_menu_item_parent_class);

  update_node_state (GTK_CHECK_MENU_ITEM (widget));

  parent_class->state_flags_changed (widget, previous_state);
}
static void pass_state_change(struct aps_controller *aps) { // to idle state if (NR == get_highest_brq_for_me(aps) && is_rcv_nr_idle_both_sides(aps)) { fill_nr_idle(aps); set_state(aps, PRIM_IDLE); update_node_state(aps, IDLE); } else if (is_recv_same_pri_long_brq_to_me(aps) || is_recv_long_brq_and_nr_from_same_neib(aps) || is_to_pass(aps)) { // keep pass set_state(aps, PRIM_PASS); update_node_state(aps, PASS); } else if (is_to_switch(aps)) { drop_switch_if_occupy(aps); set_state(aps, PRIM_SWITCH); sw_state_run(aps); // go to next layer } else if (is_to_k_pass(aps)) { set_state(aps, PRIM_K_PASS); update_node_state(aps, k_PASS); } else { assert(0); } }
static void start_up_state_change(struct aps_controller *aps) { if (!IS_NE_READY) { tx_default_kbytes(aps); return; } if (is_to_idle(aps)) { fill_nr_idle(aps); set_state(aps, PRIM_IDLE); update_node_state(aps, IDLE); } else if (is_to_switch(aps)) { drop_switch_if_occupy(aps); set_state(aps, PRIM_SWITCH); sw_state_run(aps); // go to next layer } else if (is_to_pass(aps)) { set_state(aps, PRIM_PASS); update_node_state(aps, PASS); } else if (is_to_k_pass(aps)) { set_state(aps, PRIM_K_PASS); update_node_state(aps, k_PASS); } else { assert(0); } }
bool pbsnode::update_internal_failure_counts( int rc) { bool held = false; char log_buf[2048]; if (rc == PBSE_NONE) { this->nd_consecutive_successes++; if (this->nd_consecutive_successes > 1) { this->nd_proximal_failures = 0; if (this->nd_state & INUSE_NETWORK_FAIL) { snprintf(log_buf, sizeof(log_buf), "Node '%s' has had two or more consecutive network successes, marking online.", this->nd_name.c_str()); log_record(1, 2, __func__, log_buf); this->remove_node_state_flag(INUSE_NETWORK_FAIL); } } } else { this->nd_proximal_failures++; this->nd_consecutive_successes = 0; if ((this->nd_proximal_failures > 2) && ((this->nd_state & INUSE_NETWORK_FAIL) == 0)) { snprintf(log_buf, sizeof(log_buf), "Node '%s' has had %d failures in close proximity, marking offline.", this->nd_name.c_str(), this->nd_proximal_failures); log_record(1, 2, __func__, log_buf); update_node_state(this, INUSE_NETWORK_FAIL); held = true; } } return(held); }
/*
 * GtkMenuItemClass::activate handler.
 *
 * Flips the private "active" flag, then, in this order: calls
 * gtk_check_menu_item_toggled(), refreshes the node state, queues a
 * redraw, chains up to the parent class activate handler and finally
 * notifies the "active" property.
 */
static void
gtk_check_menu_item_activate (GtkMenuItem *menu_item)
{
  GtkCheckMenuItem *item = GTK_CHECK_MENU_ITEM (menu_item);
  GtkCheckMenuItemPrivate *item_priv = item->priv;

  item_priv->active = !item_priv->active;

  gtk_check_menu_item_toggled (item);
  update_node_state (item);
  gtk_widget_queue_draw (GTK_WIDGET (item));

  /* chain up before announcing the property change */
  GTK_MENU_ITEM_CLASS (gtk_check_menu_item_parent_class)->activate (menu_item);

  g_object_notify (G_OBJECT (item), "active");
}
/**
 * gtk_check_menu_item_set_inconsistent:
 * @check_menu_item: a #GtkCheckMenuItem
 * @setting: %TRUE to display an “inconsistent” third state check
 *
 * If the user has selected a range of elements (such as some text or
 * spreadsheet cells) that are affected by a boolean setting, and the
 * current values in that range are inconsistent, you may want to
 * display the check in an “in between” state. This function turns on
 * “in between” display. Normally you would turn off the inconsistent
 * state again if the user explicitly selects a setting. This has to be
 * done manually, gtk_check_menu_item_set_inconsistent() only affects
 * visual appearance, it doesn’t affect the semantics of the widget.
 *
 * Nothing happens when the (normalised) @setting already equals the
 * stored value; otherwise the value is stored, the node state is
 * refreshed, a redraw is queued and “inconsistent” is notified.
 **/
void
gtk_check_menu_item_set_inconsistent (GtkCheckMenuItem *check_menu_item,
                                      gboolean          setting)
{
  GtkCheckMenuItemPrivate *item_priv;
  gboolean want_inconsistent;

  g_return_if_fail (GTK_IS_CHECK_MENU_ITEM (check_menu_item));

  item_priv = check_menu_item->priv;

  /* collapse any non-zero value to TRUE before comparing */
  want_inconsistent = (setting != FALSE);

  if (want_inconsistent == item_priv->inconsistent)
    return;

  item_priv->inconsistent = want_inconsistent;

  update_node_state (check_menu_item);
  gtk_widget_queue_draw (GTK_WIDGET (check_menu_item));
  g_object_notify (G_OBJECT (check_menu_item), "inconsistent");
}
/*
 * process_status_info()
 *
 * Walks the status strings reported for a node and applies each one.
 *
 * @param nd_name     - name of the node the report was received from
 * @param status_info - the individual status strings, in the order received
 *
 * Strings starting with NUMA_KEYWORD or "node=" switch the node that the
 * following strings apply to; the status accumulated so far is saved to the
 * previous node first. Every other string is either handled specially
 * (GPU / MIC status sections, "layout", "first_update=true") or decoded into
 * the temporary attribute, and is then dispatched on its prefix ("state",
 * "uname", "message=ERROR", "macaddr=", "jobdata=", "jobs=", "ncpus=").
 *
 * @return the decode_arst() error if initialising or filling the temporary
 *         attribute fails, PBSE_NONE if nd_name is unknown, SEND_HELLO if a
 *         "first_update=true" string was seen and no error occurred,
 *         otherwise PBSE_NONE.
 */

int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  /* server attributes that gate the optional handling below */
  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if original node cannot be found do not process the update */
  /* NOTE(review): the unlock_node() at the end suggests find_nodebyname()
   * returns the node locked - confirm */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      /* the new node starts with state changes allowed again */
      dont_change_state = FALSE;

      /* a NULL result ends processing; current stays NULL for the epilogue */
      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }
    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      /* str is re-read from status_info[i] afterwards; presumably the
       * callee advances i past the GPU section - TODO confirm */
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      /* same pattern as the GPU section above */
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      /* the layout is only built once per node */
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;

      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      /* could not append this string to temp: give up on the whole report */
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    /* per-string side effects, chosen by prefix */
    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5)))
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);

        /* keep later "state" strings in this report from undoing the DOWN */
        dont_change_state = TRUE;

        set_note_error(current, str);
        }
      }
    else if (!strncmp(str,"macaddr=",8))
      {
      update_node_mac_addr(current,str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      /* len covers "<node name>:<job list>" plus the terminator (with slack,
       * since strlen(str) still includes the "jobs=" prefix) */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        /* one of the allocations failed: release whichever succeeded */
        if (jobstr != NULL)
          {
          free(jobstr);
          }

        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    } /* END processing strings */

  /* current is NULL only when a node switch above failed */
  if (current != NULL)
    {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  /* a hello request is reported through the return code, but never masks an error */
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;

  return(rc);
  } /* END process_status_info() */
int process_status_info( const char *nd_name, std::vector<std::string> &status_info) { const char *name = nd_name; pbsnode *current; bool mom_job_sync = true; bool auto_np = false; bool down_on_error = false; bool note_append_on_error = false; int dont_change_state = FALSE; int rc = PBSE_NONE; bool send_hello = false; std::string temp; #ifdef PENABLE_LINUX_CGROUPS bool force_layout_update = false; #endif get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync); get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np); get_svr_attr_b(SRV_ATR_NoteAppendOnError, ¬e_append_on_error); get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error); /* if original node cannot be found do not process the update */ if ((current = find_nodebyname(nd_name)) == NULL) return(PBSE_NONE); //A node we put to sleep is up and running. if (current->nd_power_state != POWER_STATE_RUNNING) { //Make sure we wait for a stray update that came after we changed the state to pass //by. if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL)) { current->nd_power_state = POWER_STATE_RUNNING; write_node_power_state(); } } /* loop over each string */ for (unsigned int i = 0; i != status_info.size(); i++) { const char *str = status_info[i].c_str(); /* these two options are for switching nodes */ if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_numa_from_str(str, current)) == NULL) break; else continue; } else if (!strncmp(str, "node=", strlen("node="))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_node_from_str(str, name, current)) == NULL) break; else { if (current->nd_mom_reported_down == TRUE) { /* There is a race condition if using a mom hierarchy and manually * shutting down a non-level 
1 mom: if its message that the mom is * shutting down gets there before its last status update, the node * can incorrectly be set as free again. For that reason, only set * a mom back up if its reporting for itself. */ if (strcmp(name, str + strlen("node=")) != 0) dont_change_state = TRUE; else current->nd_mom_reported_down = FALSE; } continue; } } /* add the info to the "temp" pbs_attribute */ else if (!strcmp(str, START_GPU_STATUS)) { is_gpustat_get(current, i, status_info); continue; } else if (!strcmp(str, START_MIC_STATUS)) { process_mic_status(current, i, status_info); continue; } #ifdef PENABLE_LINUX_CGROUPS else if (!strcmp(str, "force_layout_update")) { force_layout_update = true; continue; } else if (!strncmp(str, "layout", 6)) { // Add 7 to skip "layout=" update_layout_if_needed(current, str + 7, force_layout_update); // reset this to false in case we have a mom hierarchy in place force_layout_update = false; continue; } #endif else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN)) { current->capture_plugin_resources(str + PLUGIN_EQ_LEN); continue; } else if (!strncmp(str, "jobs=", 5)) { /* walk job list reported by mom */ sync_job_info *sji = new sync_job_info(); sji->node_name = current->get_name(); sji->job_info = str + 5; sji->sync_jobs = mom_job_sync; // sji is freed in sync_node_jobs() enqueue_threadpool_request(sync_node_jobs, sji, task_pool); continue; } else if (!strcmp(str, "first_update=true")) { /* mom is requesting that we send the mom hierarchy file to her */ //remove_hello(&hellos, current->nd_id); send_hello = true; /* reset gpu data in case mom reconnects with changed gpus */ clear_nvidia_gpus(current); continue; } else { // Save this string to our status line. 
if (temp.size() > 0) temp += ","; if (!strncmp(str, "message=", 8)) { std::string no_newlines(str); size_t pos = no_newlines.find('\n'); while (pos != std::string::npos) { no_newlines.replace(pos, 1, 1, ' '); pos = no_newlines.find('\n'); } temp += no_newlines; } else temp += str; if (!strncmp(str, "state", 5)) { if (dont_change_state == FALSE) process_state_str(current, str); } else if ((allow_any_mom == TRUE) && (!strncmp(str, "uname", 5))) { process_uname_str(current, str); } else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */ { if ((!strncmp(str, "message=ERROR", 13)) && (down_on_error == TRUE)) { update_node_state(current, INUSE_DOWN); dont_change_state = TRUE; if (note_append_on_error == true) { set_note_error(current, str); } } } else if (!strncmp(str,"macaddr=",8)) { update_node_mac_addr(current,str + 8); } else if ((mom_job_sync == true) && (!strncmp(str, "jobdata=", 8))) { /* update job attributes based on what the MOM gives us */ update_job_data(current, str + strlen("jobdata=")); } else if ((auto_np) && (!(strncmp(str, "ncpus=", 6)))) { handle_auto_np(current, str); } else if (!strncmp(str, "version=", 8)) { current->set_version(str + 8); } } } /* END processing strings */ if (current != NULL) { save_node_status(current, temp); current->unlock_node(__func__, NULL, LOGLEVEL); } if ((rc == PBSE_NONE) && (send_hello == true)) rc = SEND_HELLO; return(rc); } /* END process_status_info() */
/*
 * process_state_str()
 *
 * Applies a "state=..." status string reported for a node.
 *
 * @param np  - node whose state is updated
 * @param str - status string; only the first 10 characters are matched
 *              against "state=down", "state=busy" and "state=free"
 *
 * A node that still carries INUSE_NOHIERARCHY is left untouched and an
 * error is logged. When the node's state is exactly INUSE_DOWN and it
 * reports busy or free, restore_note() is called before the state change.
 * An unrecognised string is logged and the node is set to INUSE_UNKNOWN.
 * When LOGLEVEL >= 9 the resulting state is logged.
 *
 * All log text is formatted with snprintf() so an over-long status string
 * is truncated instead of overrunning log_buf.
 *
 * @return PBSE_HIERARCHY_NOT_SENT if the node has no hierarchy yet,
 *         PBSE_NONE otherwise
 */

int process_state_str(

  pbsnode    *np,
  const char *str)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  rc = PBSE_NONE;

  if (np->nd_state & INUSE_NOHIERARCHY)
    {
    snprintf(log_buf, sizeof(log_buf), "node %s has not received its hierarchy yet.",
      np->get_name());

    log_err(-1, __func__, log_buf);

    return(PBSE_HIERARCHY_NOT_SENT);
    }

  if (!strncmp(str, "state=down", 10))
    {
    update_node_state(np, INUSE_DOWN);
    }
  else if (!strncmp(str, "state=busy", 10))
    {
    /* coming back from a plain DOWN state: put the previous note back */
    if (np->nd_state == INUSE_DOWN)
      {
      restore_note(np);
      }

    update_node_state(np, INUSE_BUSY);
    }
  else if (!strncmp(str, "state=free", 10))
    {
    if (np->nd_state == INUSE_DOWN)
      {
      restore_note(np);
      }

    update_node_state(np, INUSE_FREE);
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf), "unknown %s from node %s",
      str,
      np->get_name());

    log_err(-1, __func__, log_buf);

    update_node_state(np, INUSE_UNKNOWN);
    }

  if (LOGLEVEL >= 9)
    {
    snprintf(log_buf, sizeof(log_buf), "node '%s' is at state '0x%x'\n",
      np->get_name(),
      np->nd_state);

    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }

  return(rc);
  } /* END process_state_str() */
void *check_if_orphaned( void *vp) { char *node_name = (char *)vp; char *rsv_id = NULL; std::string job_id; batch_request *preq; int handle = -1; int retries = 0; struct pbsnode *pnode; char log_buf[LOCAL_LOG_BUF_SIZE]; if ((rsv_id = strchr(node_name, ':')) != NULL) { *rsv_id = '\0'; rsv_id++; } else { free(node_name); return(NULL); } if (alps_reservations.is_orphaned(rsv_id, job_id) == true) { // Make sure the node with the orphan is not available for jobs if ((pnode = find_nodebyname(node_name)) != NULL) { if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0) { snprintf(log_buf, sizeof(log_buf), "Node %s has an orphan but wasn't marked as busy. Marking as busy now.", node_name); log_err(-1, __func__, log_buf); update_node_state(pnode, INUSE_BUSY); } pnode->unlock_node(__func__, NULL, LOGLEVEL); } if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL) { free(node_name); alps_reservations.remove_from_orphaned_list(rsv_id); return(NULL); } preq->rq_extend = strdup(rsv_id); if ((pnode = get_next_login_node(NULL)) != NULL) { struct in_addr hostaddr; int local_errno; pbs_net_t momaddr; memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr)); momaddr = ntohl(hostaddr.s_addr); snprintf(log_buf, sizeof(log_buf), "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it", rsv_id, job_id.c_str(), pnode->get_name()); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf); while ((handle < 0) && (retries < 3)) { handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL); retries++; } /* unlock before the network transaction */ pnode->unlock_node(__func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq, true); free_br(preq); } alps_reservations.remove_from_orphaned_list(rsv_id); } free(node_name); return(NULL); } /* END check_if_orphaned() */
/************************************************* * svr_is_request * * Return: svr_is_request always returns a non-zero value * and it must call close_conn to close the connection * before returning. PBSE_SOCKET_CLOSE is the code * for a successful return. But which ever retun * code is iused it must terminate the while loop * in start_process_pbs_server_port. *************************************************/ int svr_is_request( struct tcp_chan *chan, int version) { int command = 0; int ret = DIS_SUCCESS; int i; int err; char nodename[PBS_MAXHOSTNAME]; int perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR; unsigned long ipaddr; unsigned short mom_port; unsigned short rm_port; unsigned long tmpaddr; struct sockaddr_in *addr = NULL; struct sockaddr s_addr; unsigned int len = sizeof(s_addr); struct pbsnode *node = NULL; char *node_name = NULL; char log_buf[LOCAL_LOG_BUF_SIZE+1]; command = disrsi(chan, &ret); if (ret != DIS_SUCCESS) goto err; if (LOGLEVEL >= 4) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from sock %d (version %d)", chan->sock, version); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } if (getpeername(chan->sock, &s_addr, &len) != 0) { close_conn(chan->sock, FALSE); log_err(errno,__func__, (char *)"Cannot get socket name using getpeername\n"); return(PBSE_SOCKET_CLOSE); } addr = (struct sockaddr_in *)&s_addr; if (version != IS_PROTOCOL_VER) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s", version, netaddr(addr)); log_err(-1, __func__, log_buf); close_conn(chan->sock, FALSE); return PBSE_SOCKET_DATA; } /* check that machine is known */ mom_port = disrsi(chan, &ret); rm_port = disrsi(chan, &ret); if (LOGLEVEL >= 3) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from addr %s: mom_port %d - rm_port %d", netaddr(addr), mom_port, rm_port); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } ipaddr = ntohl(addr->sin_addr.s_addr); if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != 
NULL) { lock_node(node, __func__, "AVL_find", LOGLEVEL); } /* END if AVL_find != NULL) */ else if (allow_any_mom) { char *name = get_cached_nameinfo(addr); if (name != NULL) snprintf(nodename, sizeof(nodename), "%s", name); else if (getnameinfo(&s_addr, len, nodename, sizeof(nodename)-1, NULL, 0, 0) != 0) { tmpaddr = ntohl(addr->sin_addr.s_addr); sprintf(nodename, "0x%lX", tmpaddr); } else insert_addr_name_info(nodename, NULL, addr); err = create_partial_pbs_node(nodename, ipaddr, perm); if (err == PBSE_NONE) { node = AVL_find(ipaddr, 0, ipaddrs); lock_node(node, __func__, "no error", LOGLEVEL); } } if (node == NULL) { /* node not listed in trusted ipaddrs list */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)", netaddr(addr)); if (LOGLEVEL >= 2) { log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); } else { log_err(-1, __func__, log_buf); } close_conn(chan->sock, FALSE); return PBSE_SOCKET_CLOSE; } if (LOGLEVEL >= 3) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message %s (%d) received from mom on host %s (%s) (sock %d)", PBSServerCmds2[command], command, node->nd_name, netaddr(addr), chan->sock); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } switch (command) { case IS_NULL: /* a ping from server */ DBPRT(("%s: IS_NULL\n", __func__)) break; case IS_UPDATE: DBPRT(("%s: IS_UPDATE\n", __func__)) i = disrui(chan, &ret); if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_UPDATE error %d on node %s\n", ret, node->nd_name); log_err(ret, __func__, log_buf); } goto err; } DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->nd_name, i)) update_node_state(node, i); if ((node->nd_state & INUSE_DOWN) != 0) { node->nd_mom_reported_down = TRUE; } break; case IS_STATUS: if (LOGLEVEL >= 2) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS received from %s", node->nd_name); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, 
__func__, log_buf); } if ((node_name = strdup(node->nd_name)) == NULL) goto err; unlock_node(node, __func__, "before is_stat_get", LOGLEVEL); ret = is_stat_get(node_name, chan); node = find_nodebyname(node_name); if (ret == SEND_HELLO) { struct hello_info *hi = (struct hello_info *)calloc(1, sizeof(struct hello_info)); write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS); hi->name = strdup(node_name); enqueue_threadpool_request(send_hierarchy_threadtask, hi); ret = DIS_SUCCESS; } else write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret); if(node != NULL) node->nd_stream = -1; if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS error %d on node %s", ret, node_name); log_err(ret, __func__, log_buf); } free(node_name); goto err; } free(node_name); break; default: snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unknown command %d sent from %s", command, node->nd_name); log_err(-1, __func__, log_buf); goto err; break; } /* END switch (command) */ /* must be closed because mom opens and closes this connection each time */ close_conn(chan->sock, FALSE); if(node != NULL) unlock_node(node, __func__, "close", LOGLEVEL); return PBSE_SOCKET_CLOSE; err: /* a DIS write error has occurred */ if (node != NULL) { if (LOGLEVEL >= 1) { DBPRT(("%s: error processing node %s\n", __func__, node->nd_name)) } sprintf(log_buf, "%s from %s(%s)", dis_emsg[ret], node->nd_name, netaddr(addr)); unlock_node(node, __func__, "err", LOGLEVEL); } else {
/************************************************* * svr_is_request * * Return: svr_is_request always returns a non-zero value * and it must call close_conn to close the connection * before returning. PBSE_SOCKET_CLOSE is the code * for a successful return. But which ever retun * code is iused it must terminate the while loop * in start_process_pbs_server_port. *************************************************/ void *svr_is_request( void *v) { int command = 0; int ret = DIS_SUCCESS; int i; int err; char nodename[PBS_MAXHOSTNAME]; int perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR; unsigned long ipaddr; unsigned short mom_port; unsigned short rm_port; unsigned long tmpaddr; struct sockaddr_in addr; struct pbsnode *node = NULL; char log_buf[LOCAL_LOG_BUF_SIZE+1]; char msg_buf[80]; char tmp[80]; int version; struct tcp_chan *chan; long *args; is_request_info *isr = (is_request_info *)v; if (isr == NULL) return(NULL); chan = isr->chan; args = isr->args; version = disrsi(chan, &ret); if (ret != DIS_SUCCESS) { log_err(-1, __func__, "Cannot read version - skipping this request.\n"); close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } command = disrsi(chan, &ret); if (ret != DIS_SUCCESS) { snprintf(log_buf, sizeof(log_buf), "could not read command: %d", ret); log_err(-1, __func__, log_buf); close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } if (LOGLEVEL >= 4) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from sock %d (version %d)", chan->sock, version); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } /* Just a note to let us know we only do IPv4 for now */ addr.sin_family = AF_INET; memcpy(&addr.sin_addr, (void *)&args[1], sizeof(struct in_addr)); addr.sin_port = args[2]; if (version != IS_PROTOCOL_VER) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s", version, msg_buf); log_err(-1, __func__, log_buf); 
close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } /* check that machine is known */ mom_port = disrsi(chan, &ret); rm_port = disrsi(chan, &ret); if (LOGLEVEL >= 3) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message received from addr %s: mom_port %d - rm_port %d", msg_buf, mom_port, rm_port); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } ipaddr = args[1]; if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL) { node->lock_node(__func__, "AVL_find", LOGLEVEL); } /* END if AVL_find != NULL) */ else if (allow_any_mom) { const char *name = get_cached_nameinfo(&addr); if (name != NULL) snprintf(nodename, sizeof(nodename), "%s", name); else if (getnameinfo((struct sockaddr *)&addr, sizeof(addr), nodename, sizeof(nodename)-1, NULL, 0, 0) != 0) { tmpaddr = ntohl(addr.sin_addr.s_addr); sprintf(nodename, "0x%lX", tmpaddr); } else insert_addr_name_info(NULL, nodename); err = create_partial_pbs_node(nodename, ipaddr, perm); if (err == PBSE_NONE) { node = AVL_find(ipaddr, 0, ipaddrs); node->lock_node(__func__, "no error", LOGLEVEL); } } if (node == NULL) { /* node not listed in trusted ipaddrs list */ netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)", msg_buf); if (LOGLEVEL >= 2) { log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); } else { log_err(-1, __func__, log_buf); } close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); } if (LOGLEVEL >= 3) { netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "message %s (%d) received from mom on host %s (%s) (sock %d)", PBSServerCmds2[command], command, node->get_name(), msg_buf, chan->sock); log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf); } mutex_mgr 
node_mutex(&node->nd_mutex, true); switch (command) { case IS_NULL: /* a ping from server */ DBPRT(("%s: IS_NULL\n", __func__)) break; case IS_UPDATE: DBPRT(("%s: IS_UPDATE\n", __func__)) i = disrui(chan, &ret); if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_UPDATE error %d on node %s\n", ret, node->get_name()); log_err(ret, __func__, log_buf); } goto err; } DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->get_name(), i)) update_node_state(node, i); if ((node->nd_state & INUSE_DOWN) != 0) { node->nd_mom_reported_down = TRUE; } break; case IS_STATUS: { std::string node_name = node->get_name(); if (LOGLEVEL >= 2) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS received from %s", node->get_name()); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } node_mutex.unlock(); ret = is_stat_get(node_name.c_str(), chan); node = find_nodebyname(node_name.c_str()); if (node != NULL) { node->nd_stream = -1; node_mutex.mark_as_locked(); if (ret == SEND_HELLO) { //struct hello_info *hi = new hello_info(node->nd_id); write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS); hierarchy_handler.sendHierarchyToANode(node); ret = DIS_SUCCESS; } else write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret); } if (ret != DIS_SUCCESS) { if (LOGLEVEL >= 1) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "IS_STATUS error %d on node %s", ret, node_name.c_str()); log_err(ret, __func__, log_buf); } goto err; } break; } default: snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unknown command %d sent from %s", command, node->get_name()); log_err(-1, __func__, log_buf); goto err; break; } /* END switch (command) */ /* must be closed because mom opens and closes this connection each time */ close_conn(chan->sock, FALSE); DIS_tcp_cleanup(chan); return(NULL); err: /* a DIS write error has occurred */ if (node != NULL) { if (LOGLEVEL >= 1) { DBPRT(("%s: error processing node %s\n", __func__, node->get_name())) } 
netaddr_long(args[1], tmp); sprintf(msg_buf, "%s:%ld", tmp, args[2]); sprintf(log_buf, "%s from %s(%s)", dis_emsg[ret], node->get_name(), msg_buf); } else {
int process_state_str( struct pbsnode *np, char *str) { char log_buf[LOCAL_LOG_BUF_SIZE]; struct pbssubn *sp = NULL; int rc = PBSE_NONE; if (!strncmp(str, "state=down", 10)) { update_node_state(np, INUSE_DOWN); } else if (!strncmp(str, "state=busy", 10)) { update_node_state(np, INUSE_BUSY); } else if (!strncmp(str, "state=free", 10)) { update_node_state(np, INUSE_FREE); } else { sprintf(log_buf, "unknown %s from node %s", str, (np->nd_name != NULL) ? np->nd_name : "NULL"); log_err(-1, __func__, log_buf); update_node_state(np, INUSE_UNKNOWN); } if (LOGLEVEL >= 9) { sprintf(log_buf, "node '%s' is at state '0x%x'\n", np->nd_name, np->nd_state); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } for (sp = np->nd_psn; sp != NULL; sp = sp->next) { if ((!(np->nd_state & INUSE_OFFLINE)) && (sp->inuse & INUSE_OFFLINE)) { /* this doesn't seem to ever happen */ if (LOGLEVEL >= 2) { sprintf(log_buf, "sync'ing subnode state '%s' with node state on node %s\n", "offline", np->nd_name); log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf); } sp->inuse &= ~INUSE_OFFLINE; } sp->inuse &= ~INUSE_DOWN; } return(rc); } /* END process_state_str() */