int process_status_info( const char *nd_name, std::vector<std::string> &status_info) { const char *name = nd_name; struct pbsnode *current; long mom_job_sync = FALSE; long auto_np = FALSE; long down_on_error = FALSE; int dont_change_state = FALSE; pbs_attribute temp; int rc = PBSE_NONE; bool send_hello = false; get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync); get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np); get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error); /* Before filling the "temp" pbs_attribute, initialize it. * The second and third parameter to decode_arst are never * used, so just leave them empty. (GBS) */ memset(&temp, 0, sizeof(temp)); if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE) { log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute"); return(rc); } /* if original node cannot be found do not process the update */ if ((current = find_nodebyname(nd_name)) == NULL) return(PBSE_NONE); //A node we put to sleep is up and running. if (current->nd_power_state != POWER_STATE_RUNNING) { //Make sure we wait for a stray update that came after we changed the state to pass //by. if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL)) { current->nd_power_state = POWER_STATE_RUNNING; write_node_power_state(); } } /* loop over each string */ for (unsigned int i = 0; i != status_info.size(); i++) { const char *str = status_info[i].c_str(); /* these two options are for switching nodes */ if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD))) { /* if we've already processed some, save this before moving on */ if (i != 0) save_node_status(current, &temp); dont_change_state = FALSE; if ((current = get_numa_from_str(str, current)) == NULL) break; else continue; } else if (!strncmp(str, "node=", strlen("node="))) { /* if we've already processed some, save this before moving on */ if (i != 0) save_node_status(current, &temp); dont_change_state = FALSE; if ((current = get_node_from_str(str, name, current)) == NULL) break; else { if (current->nd_mom_reported_down == TRUE) { /* There is a race condition if using a mom hierarchy and manually * shutting down a non-level 1 mom: if its message that the mom is * shutting down gets there before its last status update, the node * can incorrectly be set as free again. For that reason, only set * a mom back up if its reporting for itself. */ if (strcmp(name, str + strlen("node=")) != 0) dont_change_state = TRUE; else current->nd_mom_reported_down = FALSE; } continue; } } /* add the info to the "temp" pbs_attribute */ else if (!strcmp(str, START_GPU_STATUS)) { is_gpustat_get(current, i, status_info); str = status_info[i].c_str(); } else if (!strcmp(str, START_MIC_STATUS)) { process_mic_status(current, i, status_info); str = status_info[i].c_str(); } #ifdef PENABLE_LINUX_CGROUPS else if (!strncmp(str, "layout", 6)) { if (current->nd_layout == NULL) { current->nd_layout = new Machine(status_info[i]); } continue; } #endif else if (!strcmp(str, "first_update=true")) { /* mom is requesting that we send the mom hierarchy file to her */ //remove_hello(&hellos, current->nd_id); send_hello = true; /* reset gpu data in case mom reconnects with changed gpus */ clear_nvidia_gpus(current); } else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE) { DBPRT(("is_stat_get: cannot add attributes\n")); free_arst(&temp); break; } if (!strncmp(str, "state", 5)) { if (dont_change_state == FALSE) process_state_str(current, str); } else if ((allow_any_mom == TRUE) && (!strncmp(str, "uname", 5))) { process_uname_str(current, str); } else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */ { if ((!strncmp(str, "message=ERROR", 13)) && (down_on_error == TRUE)) { update_node_state(current, INUSE_DOWN); dont_change_state = TRUE; set_note_error(current, str); } } else if (!strncmp(str,"macaddr=",8)) { update_node_mac_addr(current,str + 8); } else if ((mom_job_sync == TRUE) && (!strncmp(str, "jobdata=", 8))) { /* update job attributes based on what the MOM gives us */ update_job_data(current, str + strlen("jobdata=")); } else if ((mom_job_sync == TRUE) && (!strncmp(str, "jobs=", 5))) { /* walk job list reported by mom */ size_t len = strlen(str) + strlen(current->nd_name) + 2; char *jobstr = (char *)calloc(1, len); sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info)); if ((jobstr != NULL) && (sji != NULL)) { sprintf(jobstr, "%s:%s", current->nd_name, str+5); sji->input = jobstr; sji->timestamp = time(NULL); /* sji must be freed in sync_node_jobs */ enqueue_threadpool_request(sync_node_jobs, sji, task_pool); } else { if (jobstr != NULL) { free(jobstr); } if (sji != NULL) { free(sji); } } } else if (auto_np) { if (!(strncmp(str, "ncpus=", 6))) { handle_auto_np(current, str); } } } /* END processing strings */ if (current != NULL) { save_node_status(current, &temp); unlock_node(current, __func__, NULL, LOGLEVEL); } if ((rc == PBSE_NONE) && (send_hello == true)) rc = SEND_HELLO; return(rc); } /* END process_status_info() */
int process_status_info( const char *nd_name, std::vector<std::string> &status_info) { const char *name = nd_name; pbsnode *current; bool mom_job_sync = true; bool auto_np = false; bool down_on_error = false; bool note_append_on_error = false; int dont_change_state = FALSE; int rc = PBSE_NONE; bool send_hello = false; std::string temp; #ifdef PENABLE_LINUX_CGROUPS bool force_layout_update = false; #endif get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync); get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np); get_svr_attr_b(SRV_ATR_NoteAppendOnError, ¬e_append_on_error); get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error); /* if original node cannot be found do not process the update */ if ((current = find_nodebyname(nd_name)) == NULL) return(PBSE_NONE); //A node we put to sleep is up and running. if (current->nd_power_state != POWER_STATE_RUNNING) { //Make sure we wait for a stray update that came after we changed the state to pass //by. if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL)) { current->nd_power_state = POWER_STATE_RUNNING; write_node_power_state(); } } /* loop over each string */ for (unsigned int i = 0; i != status_info.size(); i++) { const char *str = status_info[i].c_str(); /* these two options are for switching nodes */ if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_numa_from_str(str, current)) == NULL) break; else continue; } else if (!strncmp(str, "node=", strlen("node="))) { /* if we've already processed some, save this before moving on */ if (i != 0) { save_node_status(current, temp); temp.clear(); } dont_change_state = FALSE; if ((current = get_node_from_str(str, name, current)) == NULL) break; else { if (current->nd_mom_reported_down == TRUE) { /* There is a race condition if using a mom hierarchy and manually * shutting down a non-level 1 mom: if its message that the mom is * shutting down gets there before its last status update, the node * can incorrectly be set as free again. For that reason, only set * a mom back up if its reporting for itself. */ if (strcmp(name, str + strlen("node=")) != 0) dont_change_state = TRUE; else current->nd_mom_reported_down = FALSE; } continue; } } /* add the info to the "temp" pbs_attribute */ else if (!strcmp(str, START_GPU_STATUS)) { is_gpustat_get(current, i, status_info); continue; } else if (!strcmp(str, START_MIC_STATUS)) { process_mic_status(current, i, status_info); continue; } #ifdef PENABLE_LINUX_CGROUPS else if (!strcmp(str, "force_layout_update")) { force_layout_update = true; continue; } else if (!strncmp(str, "layout", 6)) { // Add 7 to skip "layout=" update_layout_if_needed(current, str + 7, force_layout_update); // reset this to false in case we have a mom hierarchy in place force_layout_update = false; continue; } #endif else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN)) { current->capture_plugin_resources(str + PLUGIN_EQ_LEN); continue; } else if (!strncmp(str, "jobs=", 5)) { /* walk job list reported by mom */ sync_job_info *sji = new sync_job_info(); sji->node_name = current->get_name(); sji->job_info = str + 5; sji->sync_jobs = mom_job_sync; // sji is freed in sync_node_jobs() enqueue_threadpool_request(sync_node_jobs, sji, task_pool); continue; } else if (!strcmp(str, "first_update=true")) { /* mom is requesting that we send the mom hierarchy file to her */ //remove_hello(&hellos, current->nd_id); send_hello = true; /* reset gpu data in case mom reconnects with changed gpus */ clear_nvidia_gpus(current); continue; } else { // Save this string to our status line. if (temp.size() > 0) temp += ","; if (!strncmp(str, "message=", 8)) { std::string no_newlines(str); size_t pos = no_newlines.find('\n'); while (pos != std::string::npos) { no_newlines.replace(pos, 1, 1, ' '); pos = no_newlines.find('\n'); } temp += no_newlines; } else temp += str; if (!strncmp(str, "state", 5)) { if (dont_change_state == FALSE) process_state_str(current, str); } else if ((allow_any_mom == TRUE) && (!strncmp(str, "uname", 5))) { process_uname_str(current, str); } else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */ { if ((!strncmp(str, "message=ERROR", 13)) && (down_on_error == TRUE)) { update_node_state(current, INUSE_DOWN); dont_change_state = TRUE; if (note_append_on_error == true) { set_note_error(current, str); } } } else if (!strncmp(str,"macaddr=",8)) { update_node_mac_addr(current,str + 8); } else if ((mom_job_sync == true) && (!strncmp(str, "jobdata=", 8))) { /* update job attributes based on what the MOM gives us */ update_job_data(current, str + strlen("jobdata=")); } else if ((auto_np) && (!(strncmp(str, "ncpus=", 6)))) { handle_auto_np(current, str); } else if (!strncmp(str, "version=", 8)) { current->set_version(str + 8); } } } /* END processing strings */ if (current != NULL) { save_node_status(current, temp); current->unlock_node(__func__, NULL, LOGLEVEL); } if ((rc == PBSE_NONE) && (send_hello == true)) rc = SEND_HELLO; return(rc); } /* END process_status_info() */