示例#1
0
static void wtr_state_change(struct aps_controller *aps) {
    // to idle state
    // in the last version, wtr to idle occurs in wtr sub state machine
    // but now i remove it here
    if (is_rcv_nr_idle_both_sides(aps)) {
        fill_nr_idle(aps);
        exit_cur_sw_state(aps);
        wtr_state_exit(aps);
        set_state(aps, PRIM_IDLE);
        update_node_state(aps, IDLE);
    } else if (is_to_pass(aps)) {
        exit_cur_sw_state(aps);
        wtr_state_exit(aps);
        set_state(aps, PRIM_PASS);
        update_node_state(aps, PASS);
    } else if (is_to_wtr(aps)) {// keep in wtr process
        set_state(aps, PRIM_WTR);
        wtr_state_run(aps);
    } else if (is_to_switch(aps)) {
        drop_switch_if_occupy(aps);
        set_state(aps, PRIM_SWITCH);
        wtr_state_exit(aps);	//exit wtr state firstly
        sw_state_run(aps);	// go to switch state
    } else if (is_to_k_pass(aps)) {
        exit_cur_sw_state(aps);
        wtr_state_exit(aps);
        set_state(aps, PRIM_K_PASS);
        update_node_state(aps, k_PASS);
    } else {
        assert(0);
    }
}
示例#2
0
static void switch_state_change(struct aps_controller *aps) {
    // to idle state
    if (is_only_rcv_brq_it_sourcing_both_sides(aps)) {
        fill_nr_idle(aps);
        exit_cur_sw_state(aps);	// exit switch and it's sub states
        set_state(aps, PRIM_IDLE);
        update_node_state(aps, IDLE);
    } else if (is_to_pass(aps)) {
        exit_cur_sw_state(aps);	// exit switch and it's sub states
        set_state(aps, PRIM_PASS);
        update_node_state(aps, PASS);
    } else if (is_to_wtr(aps)) {
        // note: here need not exit switch and it's sub states
        // because switch state should be reserved for returning from wtr some time.
        // goto next layer of wtr sm
        set_state(aps, PRIM_WTR);
        wtr_state_run(aps);
    } else if (is_to_switch(aps)) {
        drop_switch_if_occupy(aps);
        set_state(aps, PRIM_SWITCH);
        sw_state_run(aps);	// go to next layer
    } else if (is_to_k_pass(aps)) {
        exit_cur_sw_state(aps);	// exit switch and it's sub states
        set_state(aps, PRIM_K_PASS);
        update_node_state(aps, k_PASS);
    } else {
        assert(0);
    }
}
示例#3
0
static void k_pass_state_change(struct aps_controller *aps) {
    // to idle state
    if (is_rcv_nr_idle_both_sides(aps)) {
        fill_nr_idle(aps);
        set_state(aps, PRIM_IDLE);
        update_node_state(aps, IDLE);
    } else if (is_to_switch(aps)) {
        drop_switch_if_occupy(aps);
        set_state(aps, PRIM_SWITCH);
        sw_state_run(aps);	// go to next layer
    } else if (is_to_pass(aps)) {
        set_state(aps, PRIM_PASS);
        update_node_state(aps, PASS);
    } else {
        assert(0);
    }
}
示例#4
0
int set_state(

  struct pbsnode *pnode,
  char           *str)

  {
  char *state_str = str + strlen("state=");

  if (!strcmp(state_str, "UP"))
    update_node_state(pnode, INUSE_FREE);
  else if (!strcmp(state_str, "DOWN"))
    update_node_state(pnode, INUSE_DOWN);
  else if (!strcmp(state_str, "BUSY"))
    update_node_state(pnode, INUSE_BUSY);

  return(PBSE_NONE);
  } /* END set_state() */
示例#5
0
/*
 * _gtk_check_menu_item_set_active:
 * @check_menu_item: a #GtkCheckMenuItem
 * @is_active: whether the action is active or not
 *
 * Sets the #GtkCheckMenuItem:active property directly. This function does
 * not emit signals or notifications: it is left to the caller to do so.
 */
void
_gtk_check_menu_item_set_active (GtkCheckMenuItem *check_menu_item,
                                 gboolean          is_active)
{
  GtkCheckMenuItemPrivate *priv = check_menu_item->priv;

  priv->active = is_active;
  update_node_state (check_menu_item);
}
示例#6
0
int process_state_str(

  struct pbsnode *np,
  const char    *str)

  {
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  int             rc = PBSE_NONE;

  if (!strncmp(str, "state=down", 10))
    {
    update_node_state(np, INUSE_DOWN);
    }
  else if (!strncmp(str, "state=busy", 10))
    {
    update_node_state(np, INUSE_BUSY);
    }
  else if (!strncmp(str, "state=free", 10))
    {
    update_node_state(np, INUSE_FREE);
    }
  else
    {
    sprintf(log_buf, "unknown %s from node %s",
      str,
      (np->nd_name != NULL) ? np->nd_name : "NULL");
    
    log_err(-1, __func__, log_buf);
    
    update_node_state(np, INUSE_UNKNOWN);
    }
  
  if (LOGLEVEL >= 9)
    {
    sprintf(log_buf, "node '%s' is at state '0x%x'\n",
      np->nd_name,
      np->nd_state);
    
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }
  
  return(rc);
  } /* END process_state_str() */
示例#7
0
static void
gtk_check_menu_item_state_flags_changed (GtkWidget     *widget,
                                         GtkStateFlags  previous_state)

{
  GtkCheckMenuItem *check_menu_item = GTK_CHECK_MENU_ITEM (widget);

  update_node_state (check_menu_item);

  GTK_WIDGET_CLASS (gtk_check_menu_item_parent_class)->state_flags_changed (widget, previous_state);
}
示例#8
0
static void pass_state_change(struct aps_controller *aps) {
    // to idle state
    if (NR == get_highest_brq_for_me(aps) && is_rcv_nr_idle_both_sides(aps)) {
        fill_nr_idle(aps);
        set_state(aps, PRIM_IDLE);
        update_node_state(aps, IDLE);
    } else if (is_recv_same_pri_long_brq_to_me(aps) || is_recv_long_brq_and_nr_from_same_neib(aps) || is_to_pass(aps)) {
        // keep pass
        set_state(aps, PRIM_PASS);
        update_node_state(aps, PASS);
    } else if (is_to_switch(aps)) {
        drop_switch_if_occupy(aps);
        set_state(aps, PRIM_SWITCH);
        sw_state_run(aps);	// go to next layer
    } else if (is_to_k_pass(aps)) {
        set_state(aps, PRIM_K_PASS);
        update_node_state(aps, k_PASS);
    } else {
        assert(0);
    }
}
示例#9
0
static void start_up_state_change(struct aps_controller *aps) {
    if (!IS_NE_READY) {
        tx_default_kbytes(aps);
        return;
    }
    if (is_to_idle(aps)) {
        fill_nr_idle(aps);
        set_state(aps, PRIM_IDLE);
        update_node_state(aps, IDLE);
    } else if (is_to_switch(aps)) {
        drop_switch_if_occupy(aps);
        set_state(aps, PRIM_SWITCH);
        sw_state_run(aps);	// go to next layer
    } else if (is_to_pass(aps)) {
        set_state(aps, PRIM_PASS);
        update_node_state(aps, PASS);
    } else if (is_to_k_pass(aps)) {
        set_state(aps, PRIM_K_PASS);
        update_node_state(aps, k_PASS);
    } else {
        assert(0);
    }
}
示例#10
0
bool pbsnode::update_internal_failure_counts(

  int rc)

  {
  bool held = false;
  char log_buf[2048];

  if (rc == PBSE_NONE)
    {
    this->nd_consecutive_successes++;

    if (this->nd_consecutive_successes > 1)
      {
      this->nd_proximal_failures = 0;

      if (this->nd_state & INUSE_NETWORK_FAIL)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node '%s' has had two or more consecutive network successes, marking online.",
          this->nd_name.c_str());
        log_record(1, 2, __func__, log_buf);
        this->remove_node_state_flag(INUSE_NETWORK_FAIL);
        }
      }
    }
  else
    {
    this->nd_proximal_failures++;
    this->nd_consecutive_successes = 0;

    if ((this->nd_proximal_failures > 2) &&
        ((this->nd_state & INUSE_NETWORK_FAIL) == 0))
      {
      snprintf(log_buf, sizeof(log_buf),
        "Node '%s' has had %d failures in close proximity, marking offline.",
        this->nd_name.c_str(), this->nd_proximal_failures);
      log_record(1, 2, __func__, log_buf);

      update_node_state(this, INUSE_NETWORK_FAIL);
      held = true;
      }
    }

  return(held);
  }
示例#11
0
static void
gtk_check_menu_item_activate (GtkMenuItem *menu_item)
{
  GtkCheckMenuItemPrivate *priv;

  GtkCheckMenuItem *check_menu_item = GTK_CHECK_MENU_ITEM (menu_item);
  priv = check_menu_item->priv;

  priv->active = !priv->active;

  gtk_check_menu_item_toggled (check_menu_item);
  update_node_state (check_menu_item);
  gtk_widget_queue_draw (GTK_WIDGET (check_menu_item));

  GTK_MENU_ITEM_CLASS (gtk_check_menu_item_parent_class)->activate (menu_item);

  g_object_notify (G_OBJECT (check_menu_item), "active");
}
示例#12
0
/**
 * gtk_check_menu_item_set_inconsistent:
 * @check_menu_item: a #GtkCheckMenuItem
 * @setting: %TRUE to display an “inconsistent” third state check
 *
 * If the user has selected a range of elements (such as some text or
 * spreadsheet cells) that are affected by a boolean setting, and the
 * current values in that range are inconsistent, you may want to
 * display the check in an “in between” state. This function turns on
 * “in between” display.  Normally you would turn off the inconsistent
 * state again if the user explicitly selects a setting. This has to be
 * done manually, gtk_check_menu_item_set_inconsistent() only affects
 * visual appearance, it doesn’t affect the semantics of the widget.
 * 
 **/
void
gtk_check_menu_item_set_inconsistent (GtkCheckMenuItem *check_menu_item,
                                      gboolean          setting)
{
  GtkCheckMenuItemPrivate *priv;

  g_return_if_fail (GTK_IS_CHECK_MENU_ITEM (check_menu_item));

  priv = check_menu_item->priv;
  
  setting = setting != FALSE;

  if (setting != priv->inconsistent)
    {
      priv->inconsistent = setting;
      update_node_state (check_menu_item);
      gtk_widget_queue_draw (GTK_WIDGET (check_menu_item));
      g_object_notify (G_OBJECT (check_menu_item), "inconsistent");
    }
}
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();
    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5))) 
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);
        dont_change_state = TRUE;
        set_note_error(current, str);
        }
      }
    else if (!strncmp(str,"macaddr=",8))
      {
      update_node_mac_addr(current,str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */      
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        if (jobstr != NULL)
          {
          free(jobstr);
          }
        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
示例#14
0
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  pbsnode        *current;
  bool            mom_job_sync = true;
  bool            auto_np = false;
  bool            down_on_error = false;
  bool            note_append_on_error = false;
  int             dont_change_state = FALSE;
  int             rc = PBSE_NONE;
  bool            send_hello = false;
  std::string     temp;
#ifdef PENABLE_LINUX_CGROUPS
  bool            force_layout_update = false;
#endif

  get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_b(SRV_ATR_NoteAppendOnError, &note_append_on_error);
  get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error);

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {

      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      continue;
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      continue;
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strcmp(str, "force_layout_update"))
      {
      force_layout_update = true;
      continue;
      }
    else if (!strncmp(str, "layout", 6))
      {
      // Add 7 to skip "layout="
      update_layout_if_needed(current, str + 7, force_layout_update);

      // reset this to false in case we have a mom hierarchy in place
      force_layout_update = false;

      continue;
      }
#endif
    else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN))
      {
      current->capture_plugin_resources(str + PLUGIN_EQ_LEN);
      continue;
      }
    else if (!strncmp(str, "jobs=", 5))
      {
      /* walk job list reported by mom */
      sync_job_info *sji = new sync_job_info();
      sji->node_name = current->get_name();
      sji->job_info = str + 5;
      sji->sync_jobs = mom_job_sync;
        
      // sji is freed in sync_node_jobs()
      enqueue_threadpool_request(sync_node_jobs, sji, task_pool);

      continue;
      }
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);

      continue;
      }
    else 
      {
      // Save this string to our status line.
      if (temp.size() > 0)
        temp += ",";

      if (!strncmp(str, "message=", 8))
        {
        std::string no_newlines(str);
        size_t pos = no_newlines.find('\n');
        
        while (pos != std::string::npos)
          {
          no_newlines.replace(pos, 1, 1, ' ');
          pos = no_newlines.find('\n');
          }

        temp += no_newlines;
        }
      else
        temp += str;
    
      if (!strncmp(str, "state", 5))
        {
        if (dont_change_state == FALSE)
          process_state_str(current, str);
        }
      else if ((allow_any_mom == TRUE) &&
               (!strncmp(str, "uname", 5))) 
        {
        process_uname_str(current, str);
        }
      else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
        {
        if ((!strncmp(str, "message=ERROR", 13)) &&
            (down_on_error == TRUE))
          {
          update_node_state(current, INUSE_DOWN);
          dont_change_state = TRUE;

          if (note_append_on_error == true)
            {
            set_note_error(current, str);
            }
          }
        }
      else if (!strncmp(str,"macaddr=",8))
        {
        update_node_mac_addr(current,str + 8);
        }
      else if ((mom_job_sync == true) &&
               (!strncmp(str, "jobdata=", 8)))
        {
        /* update job attributes based on what the MOM gives us */      
        update_job_data(current, str + strlen("jobdata="));
        }
      else if ((auto_np) &&
               (!(strncmp(str, "ncpus=", 6))))

        {
        handle_auto_np(current, str);
        }
      else if (!strncmp(str, "version=", 8))
        {
        current->set_version(str + 8);
        }
      }

    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
示例#15
0
int process_state_str(

  pbsnode    *np,
  const char *str)

  {
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  int             rc = PBSE_NONE;

  if (np->nd_state & INUSE_NOHIERARCHY)
    {
    sprintf(log_buf, "node %s has not received its hierarchy yet.",
      np->get_name());

    log_err(-1, __func__, log_buf);
    return(PBSE_HIERARCHY_NOT_SENT);
    }
  if (!strncmp(str, "state=down", 10))
    {
    update_node_state(np, INUSE_DOWN);
    }
  else if (!strncmp(str, "state=busy", 10))
    {
    if (np->nd_state == INUSE_DOWN)
      {
      restore_note(np);
      }

    update_node_state(np, INUSE_BUSY);
    }
  else if (!strncmp(str, "state=free", 10))
    {
    if (np->nd_state == INUSE_DOWN)
      {
      restore_note(np);
      }

    update_node_state(np, INUSE_FREE);
    }
  else
    {
    sprintf(log_buf, "unknown %s from node %s",
      str,
      np->get_name());
    
    log_err(-1, __func__, log_buf);
    
    update_node_state(np, INUSE_UNKNOWN);
    }
  
  if (LOGLEVEL >= 9)
    {
    sprintf(log_buf, "node '%s' is at state '0x%x'\n",
      np->get_name(),
      np->nd_state);
    
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }
  
  return(rc);
  } /* END process_state_str() */
void *check_if_orphaned(

  void *vp)

  {
  char           *node_name = (char *)vp;
  char           *rsv_id = NULL;
  std::string     job_id;
  batch_request  *preq;
  int             handle = -1;
  int             retries = 0;
  struct pbsnode *pnode;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  if ((rsv_id = strchr(node_name, ':')) != NULL)
    {
    *rsv_id = '\0';
    rsv_id++;
    }
  else
    {
    free(node_name);
    return(NULL);
    }

  if (alps_reservations.is_orphaned(rsv_id, job_id) == true)
    {
    // Make sure the node with the orphan is not available for jobs
    if ((pnode = find_nodebyname(node_name)) != NULL)
      {
      if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node %s has an orphan but wasn't marked as busy. Marking as busy now.",
          node_name);
        log_err(-1, __func__, log_buf);

        update_node_state(pnode, INUSE_BUSY);
        }

      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      }

    if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL)
      {
      free(node_name);
      alps_reservations.remove_from_orphaned_list(rsv_id);
      return(NULL);
      }

    preq->rq_extend = strdup(rsv_id);

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id.c_str(),
        pnode->get_name());
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL);
        retries++;
        }

      /* unlock before the network transaction */
      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
        
      free_br(preq);
      }

    alps_reservations.remove_from_orphaned_list(rsv_id);
    }

  free(node_name);

  return(NULL);
  } /* END check_if_orphaned() */
/*************************************************
 * svr_is_request
 *
 * Return: svr_is_request always returns a non-zero value
 *         and it must call close_conn to close the connection
 *         before returning. PBSE_SOCKET_CLOSE is the code
 *         for a successful return. But which ever retun 
 *         code is iused it must terminate the while loop
 *         in start_process_pbs_server_port.
 *************************************************/
int svr_is_request(
    
  struct tcp_chan *chan,
  int              version)

  {
  int                 command = 0;
  int                 ret = DIS_SUCCESS;
  int                 i;
  int                 err;
  char                nodename[PBS_MAXHOSTNAME];
  int                 perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  unsigned long       ipaddr;
  unsigned short      mom_port;
  unsigned short      rm_port;
  unsigned long       tmpaddr;

  struct sockaddr_in *addr = NULL;
  struct sockaddr     s_addr;
  unsigned int        len = sizeof(s_addr);

  struct pbsnode     *node = NULL;
  char               *node_name = NULL;

  char                log_buf[LOCAL_LOG_BUF_SIZE+1];

  command = disrsi(chan, &ret);

  if (ret != DIS_SUCCESS)
    goto err;

  if (LOGLEVEL >= 4)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "message received from sock %d (version %d)",
        chan->sock,
        version);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  if (getpeername(chan->sock, &s_addr, &len) != 0)
    {
    close_conn(chan->sock, FALSE);
    log_err(errno,__func__, (char *)"Cannot get socket name using getpeername\n");
    return(PBSE_SOCKET_CLOSE);
    }

  addr = (struct sockaddr_in *)&s_addr;

  if (version != IS_PROTOCOL_VER)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s",
      version,
      netaddr(addr));

    log_err(-1, __func__, log_buf);
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_DATA;
    }

  /* check that machine is known */
  mom_port = disrsi(chan, &ret);
  rm_port = disrsi(chan, &ret);

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message received from addr %s: mom_port %d  - rm_port %d",
      netaddr(addr),
      mom_port,
      rm_port);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  ipaddr = ntohl(addr->sin_addr.s_addr);
  
  if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL)
    {
    lock_node(node, __func__, "AVL_find", LOGLEVEL);
    } /* END if AVL_find != NULL) */
  else if (allow_any_mom)
    {
    char *name = get_cached_nameinfo(addr);

    if (name != NULL)
      snprintf(nodename, sizeof(nodename), "%s", name);
    else if (getnameinfo(&s_addr, len, nodename, sizeof(nodename)-1, NULL, 0, 0) != 0)
      {
      tmpaddr = ntohl(addr->sin_addr.s_addr);
      sprintf(nodename, "0x%lX", tmpaddr);
      }
    else
      insert_addr_name_info(nodename, NULL, addr);

    err = create_partial_pbs_node(nodename, ipaddr, perm);

    if (err == PBSE_NONE)
      {
      node = AVL_find(ipaddr, 0, ipaddrs);
       
      lock_node(node, __func__, "no error", LOGLEVEL);
      }                                                         
    }
    
  if (node == NULL)
    {
    /* node not listed in trusted ipaddrs list */
    
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)",
      netaddr(addr));
    
    if (LOGLEVEL >= 2)
      {
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }
    else
      {
      log_err(-1, __func__, log_buf);
      }
    
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_CLOSE;
    }

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message %s (%d) received from mom on host %s (%s) (sock %d)",
      PBSServerCmds2[command],
      command,
      node->nd_name,
      netaddr(addr),
      chan->sock);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  switch (command)
    {
    case IS_NULL:  /* a ping from server */

      DBPRT(("%s: IS_NULL\n", __func__))

      break;

    case IS_UPDATE:

      DBPRT(("%s: IS_UPDATE\n", __func__))

      i = disrui(chan, &ret);

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_UPDATE error %d on node %s\n", ret, node->nd_name);

          log_err(ret, __func__, log_buf);
          }

        goto err;
        }

      DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->nd_name, i))

      update_node_state(node, i);

      if ((node->nd_state & INUSE_DOWN) != 0)
        {
        node->nd_mom_reported_down = TRUE;
        }

      break;

    case IS_STATUS:

      if (LOGLEVEL >= 2)
        {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
            "IS_STATUS received from %s", node->nd_name);

        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }

      if ((node_name = strdup(node->nd_name)) == NULL)
        goto err;
      unlock_node(node, __func__, "before is_stat_get", LOGLEVEL);

      ret = is_stat_get(node_name, chan);

      node = find_nodebyname(node_name);

      if (ret == SEND_HELLO)
        {
        struct hello_info *hi = (struct hello_info *)calloc(1, sizeof(struct hello_info));
        write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS);

        hi->name = strdup(node_name);
        enqueue_threadpool_request(send_hierarchy_threadtask, hi);
        ret = DIS_SUCCESS;
        }
      else
        write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret);

      if(node != NULL)
        node->nd_stream = -1;

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_STATUS error %d on node %s", ret, node_name);

          log_err(ret, __func__, log_buf);
          }
        free(node_name);

        goto err;
        }
      free(node_name);

      break;

    default:

      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "unknown command %d sent from %s",
        command,
        node->nd_name);

      log_err(-1, __func__, log_buf);

      goto err;

      break;
    }  /* END switch (command) */

  /* must be closed because mom opens and closes this connection each time */
  close_conn(chan->sock, FALSE);

  if(node != NULL)
    unlock_node(node, __func__, "close", LOGLEVEL);
  
  return PBSE_SOCKET_CLOSE;

err:

  /* a DIS write error has occurred */

  if (node != NULL)
    {
    if (LOGLEVEL >= 1)
      {
      DBPRT(("%s: error processing node %s\n",
            __func__,
            node->nd_name))
      }

    sprintf(log_buf, "%s from %s(%s)",
      dis_emsg[ret],
      node->nd_name,
      netaddr(addr));
    
    unlock_node(node, __func__, "err", LOGLEVEL);
    }
  else
    {
/*************************************************
 * svr_is_request
 *
 * Return: svr_is_request always returns a non-zero value
 *         and it must call close_conn to close the connection
 *         before returning. PBSE_SOCKET_CLOSE is the code
 *         for a successful return. But which ever retun
 *         code is iused it must terminate the while loop
 *         in start_process_pbs_server_port.
 *************************************************/
void *svr_is_request(

    void *v)

{
    int                 command = 0;
    int                 ret = DIS_SUCCESS;
    int                 i;
    int                 err;
    char                nodename[PBS_MAXHOSTNAME];
    int                 perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

    unsigned long       ipaddr;
    unsigned short      mom_port;
    unsigned short      rm_port;
    unsigned long       tmpaddr;
    struct sockaddr_in  addr;
    struct pbsnode     *node = NULL;
    char                log_buf[LOCAL_LOG_BUF_SIZE+1];
    char                msg_buf[80];
    char                tmp[80];
    int                 version;
    struct tcp_chan    *chan;
    long               *args;
    is_request_info    *isr = (is_request_info *)v;

    if (isr == NULL)
        return(NULL);

    chan = isr->chan;
    args = isr->args;

    version = disrsi(chan, &ret);

    if (ret != DIS_SUCCESS)
    {
        log_err(-1,  __func__, "Cannot read version - skipping this request.\n");
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    command = disrsi(chan, &ret);

    if (ret != DIS_SUCCESS)
    {
        snprintf(log_buf, sizeof(log_buf), "could not read command: %d", ret);
        log_err(-1, __func__, log_buf);
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    if (LOGLEVEL >= 4)
    {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message received from sock %d (version %d)",
                 chan->sock,
                 version);

        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }

    /* Just a note to let us know we only do IPv4 for now */
    addr.sin_family = AF_INET;
    memcpy(&addr.sin_addr, (void *)&args[1], sizeof(struct in_addr));
    addr.sin_port = args[2];

    if (version != IS_PROTOCOL_VER)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s",
                 version,
                 msg_buf);

        log_err(-1, __func__, log_buf);
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    /* check that machine is known */
    mom_port = disrsi(chan, &ret);
    rm_port = disrsi(chan, &ret);

    if (LOGLEVEL >= 3)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message received from addr %s: mom_port %d  - rm_port %d",
                 msg_buf,
                 mom_port,
                 rm_port);

        log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

    ipaddr = args[1];

    if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL)
    {
        node->lock_node(__func__, "AVL_find", LOGLEVEL);
    } /* END if AVL_find != NULL) */
    else if (allow_any_mom)
    {
        const char *name = get_cached_nameinfo(&addr);

        if (name != NULL)
            snprintf(nodename, sizeof(nodename), "%s", name);
        else if (getnameinfo((struct sockaddr *)&addr, sizeof(addr), nodename, sizeof(nodename)-1, NULL, 0, 0) != 0)
        {
            tmpaddr = ntohl(addr.sin_addr.s_addr);
            sprintf(nodename, "0x%lX", tmpaddr);
        }
        else
            insert_addr_name_info(NULL, nodename);

        err = create_partial_pbs_node(nodename, ipaddr, perm);

        if (err == PBSE_NONE)
        {
            node = AVL_find(ipaddr, 0, ipaddrs);

            node->lock_node(__func__, "no error", LOGLEVEL);
        }
    }

    if (node == NULL)
    {
        /* node not listed in trusted ipaddrs list */
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)",
                 msg_buf);

        if (LOGLEVEL >= 2)
        {
            log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
        }
        else
        {
            log_err(-1, __func__, log_buf);
        }

        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    if (LOGLEVEL >= 3)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message %s (%d) received from mom on host %s (%s) (sock %d)",
                 PBSServerCmds2[command],
                 command,
                 node->get_name(),
                 msg_buf,
                 chan->sock);

        log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

    mutex_mgr node_mutex(&node->nd_mutex, true);

    switch (command)
    {
    case IS_NULL:  /* a ping from server */

        DBPRT(("%s: IS_NULL\n", __func__))

        break;

    case IS_UPDATE:

        DBPRT(("%s: IS_UPDATE\n", __func__))

        i = disrui(chan, &ret);

        if (ret != DIS_SUCCESS)
        {
            if (LOGLEVEL >= 1)
            {
                snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                         "IS_UPDATE error %d on node %s\n", ret, node->get_name());

                log_err(ret, __func__, log_buf);
            }

            goto err;
        }

        DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->get_name(), i))

        update_node_state(node, i);

        if ((node->nd_state & INUSE_DOWN) != 0)
        {
            node->nd_mom_reported_down = TRUE;
        }

        break;

    case IS_STATUS:

    {
        std::string node_name = node->get_name();

        if (LOGLEVEL >= 2)
        {
            snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                     "IS_STATUS received from %s", node->get_name());

            log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }

        node_mutex.unlock();

        ret = is_stat_get(node_name.c_str(), chan);

        node = find_nodebyname(node_name.c_str());

        if (node != NULL)
        {
            node->nd_stream = -1;
            node_mutex.mark_as_locked();

            if (ret == SEND_HELLO)
            {
                //struct hello_info *hi = new hello_info(node->nd_id);
                write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS);

                hierarchy_handler.sendHierarchyToANode(node);
                ret = DIS_SUCCESS;
            }
            else
                write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret);
        }

        if (ret != DIS_SUCCESS)
        {
            if (LOGLEVEL >= 1)
            {
                snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                         "IS_STATUS error %d on node %s", ret, node_name.c_str());

                log_err(ret, __func__, log_buf);
            }

            goto err;
        }

        break;
    }

    default:

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "unknown command %d sent from %s",
                 command,
                 node->get_name());

        log_err(-1, __func__, log_buf);

        goto err;

        break;
    }  /* END switch (command) */

    /* must be closed because mom opens and closes this connection each time */
    close_conn(chan->sock, FALSE);
    DIS_tcp_cleanup(chan);

    return(NULL);

err:

    /* a DIS write error has occurred */

    if (node != NULL)
    {
        if (LOGLEVEL >= 1)
        {
            DBPRT(("%s: error processing node %s\n",
                   __func__,
                   node->get_name()))
        }

        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        sprintf(log_buf, "%s from %s(%s)",
                dis_emsg[ret],
                node->get_name(),
                msg_buf);
    }
    else
    {
示例#19
0
int process_state_str(

  struct pbsnode *np,
  char           *str)

  {
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  struct pbssubn *sp = NULL;
  int             rc = PBSE_NONE;

  if (!strncmp(str, "state=down", 10))
    {
    update_node_state(np, INUSE_DOWN);
    }
  else if (!strncmp(str, "state=busy", 10))
    {
    update_node_state(np, INUSE_BUSY);
    }
  else if (!strncmp(str, "state=free", 10))
    {
    update_node_state(np, INUSE_FREE);
    }
  else
    {
    sprintf(log_buf, "unknown %s from node %s",
      str,
      (np->nd_name != NULL) ? np->nd_name : "NULL");
    
    log_err(-1, __func__, log_buf);
    
    update_node_state(np, INUSE_UNKNOWN);
    }
  
  if (LOGLEVEL >= 9)
    {
    sprintf(log_buf, "node '%s' is at state '0x%x'\n",
      np->nd_name,
      np->nd_state);
    
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }
  
  for (sp = np->nd_psn; sp != NULL; sp = sp->next)
    {
    if ((!(np->nd_state & INUSE_OFFLINE)) &&
        (sp->inuse & INUSE_OFFLINE))
      {
      /* this doesn't seem to ever happen */
      if (LOGLEVEL >= 2)
        {
        sprintf(log_buf, "sync'ing subnode state '%s' with node state on node %s\n",
          "offline",
          np->nd_name);
        
        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }
      
      sp->inuse &= ~INUSE_OFFLINE;
      }
    
    sp->inuse &= ~INUSE_DOWN;
    }

  return(rc);
  } /* END process_state_str() */