예제 #1
0
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();
    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5))) 
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);
        dont_change_state = TRUE;
        set_note_error(current, str);
        }
      }
    else if (!strncmp(str,"macaddr=",8))
      {
      update_node_mac_addr(current,str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */      
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        if (jobstr != NULL)
          {
          free(jobstr);
          }
        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
예제 #2
0
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  pbsnode        *current;
  bool            mom_job_sync = true;
  bool            auto_np = false;
  bool            down_on_error = false;
  bool            note_append_on_error = false;
  int             dont_change_state = FALSE;
  int             rc = PBSE_NONE;
  bool            send_hello = false;
  std::string     temp;
#ifdef PENABLE_LINUX_CGROUPS
  bool            force_layout_update = false;
#endif

  get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_b(SRV_ATR_NoteAppendOnError, &note_append_on_error);
  get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error);

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {

      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      continue;
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      continue;
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strcmp(str, "force_layout_update"))
      {
      force_layout_update = true;
      continue;
      }
    else if (!strncmp(str, "layout", 6))
      {
      // Add 7 to skip "layout="
      update_layout_if_needed(current, str + 7, force_layout_update);

      // reset this to false in case we have a mom hierarchy in place
      force_layout_update = false;

      continue;
      }
#endif
    else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN))
      {
      current->capture_plugin_resources(str + PLUGIN_EQ_LEN);
      continue;
      }
    else if (!strncmp(str, "jobs=", 5))
      {
      /* walk job list reported by mom */
      sync_job_info *sji = new sync_job_info();
      sji->node_name = current->get_name();
      sji->job_info = str + 5;
      sji->sync_jobs = mom_job_sync;
        
      // sji is freed in sync_node_jobs()
      enqueue_threadpool_request(sync_node_jobs, sji, task_pool);

      continue;
      }
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);

      continue;
      }
    else 
      {
      // Save this string to our status line.
      if (temp.size() > 0)
        temp += ",";

      if (!strncmp(str, "message=", 8))
        {
        std::string no_newlines(str);
        size_t pos = no_newlines.find('\n');
        
        while (pos != std::string::npos)
          {
          no_newlines.replace(pos, 1, 1, ' ');
          pos = no_newlines.find('\n');
          }

        temp += no_newlines;
        }
      else
        temp += str;
    
      if (!strncmp(str, "state", 5))
        {
        if (dont_change_state == FALSE)
          process_state_str(current, str);
        }
      else if ((allow_any_mom == TRUE) &&
               (!strncmp(str, "uname", 5))) 
        {
        process_uname_str(current, str);
        }
      else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
        {
        if ((!strncmp(str, "message=ERROR", 13)) &&
            (down_on_error == TRUE))
          {
          update_node_state(current, INUSE_DOWN);
          dont_change_state = TRUE;

          if (note_append_on_error == true)
            {
            set_note_error(current, str);
            }
          }
        }
      else if (!strncmp(str,"macaddr=",8))
        {
        update_node_mac_addr(current,str + 8);
        }
      else if ((mom_job_sync == true) &&
               (!strncmp(str, "jobdata=", 8)))
        {
        /* update job attributes based on what the MOM gives us */      
        update_job_data(current, str + strlen("jobdata="));
        }
      else if ((auto_np) &&
               (!(strncmp(str, "ncpus=", 6))))

        {
        handle_auto_np(current, str);
        }
      else if (!strncmp(str, "version=", 8))
        {
        current->set_version(str + 8);
        }
      }

    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */