Example #1
0
/*
 * send_power_state_to_mom()
 *
 * Thread-style entry point: looks up the node named in the request's
 * rq_host, connects to its mom and issues the batch request over that
 * connection.
 *
 * @param arg - a struct batch_request *. This function owns the request and
 *              releases it with free_br() on every exit path.
 * @return always NULL
 */
void *send_power_state_to_mom(
    
  void *arg)

  {
  struct batch_request  *pRequest = (struct batch_request *)arg;
  struct pbsnode        *pNode = find_nodebyname(pRequest->rq_host);

  if (pNode == NULL)
    {
    free_br(pRequest);
    return NULL;
    }

  int local_errno = 0;
  int handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);

  if (handle < 0)
    {
    unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);

    /* the request was never handed to anyone, so release it here */
    free_br(pRequest);
    return NULL;
    }

  /* release the node before the network transaction */
  unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
  issue_Drequest(handle, pRequest, true);

  /* the caller of issue_Drequest() still owns the request afterwards
   * (check_if_orphaned() frees its request the same way) */
  free_br(pRequest);

  return NULL;
  }
struct pbsnode *get_node_from_str(

  const char     *str,     /* I */
  const char     *orig_id, /* I */
  struct pbsnode *np)      /* M */

  {
  /* this is a node reporting on another node as well */
  const char     *node_id = str + strlen("node=");
  struct pbsnode *next = NULL;
  char            log_buf[LOCAL_LOG_BUF_SIZE];
 
  /* don't do anything if the name is the same as this node's name */
  if (strcmp(node_id, np->nd_name))
    {
    unlock_node(np, __func__, "np not numa update", LOGLEVEL);
    
    next = find_nodebyname(node_id);
    
    if (next == NULL)
      {
      /* NYI: should we add logic here to attempt the canonical name if this 
       * is the short name, and attempt the short name if this is the 
       * canonical name? */
      
      /* ERROR */
      snprintf(log_buf,sizeof(log_buf),
        "Node %s is reporting on node %s, which pbs_server doesn't know about\n",
        orig_id,
        node_id);
      log_err(-1, __func__, log_buf);
      }
    else
      {
      if (LOGLEVEL >= 7)
        {
        snprintf(log_buf,sizeof(log_buf),
          "Node %s is reporting for node %s\n",
          orig_id,
          node_id);
        
        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }
      
      next->nd_lastupdate = time(NULL);
      }
    }
  else
    {
    next = np;
    next->nd_lastupdate = time(NULL);
    }

  /* next may be NULL */

  return(next);
  } /* END get_node_from_str() */
Example #3
0
END_TEST
#endif


/*
 * Exercises update_failure_counts() on node "lihue": checks how the
 * proximal-failure and consecutive-success counters move, and when the
 * node's state leaves and returns to INUSE_FREE.
 */
START_TEST(test_update_failure_counts)
  {
  const char *name = "lihue";
  struct pbsnode *pnode = find_nodebyname(name);

  // Every check below dereferences pnode, so fail cleanly if the fixture is missing
  fail_unless(pnode != NULL);

  update_failure_counts(name, -1);
  update_failure_counts(name, -1);

  // Make sure the two failures are correctly counted
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One success shouldn't reset the failure counts
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 1);
  fail_unless(pnode->nd_state == INUSE_FREE);
  
  // Two should
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One failure should reset the success count
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 1);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // State shouldn't change until there are 3 proximal failures
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);
  
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);

  // State shouldn't reset until there are 2 consecutive successes
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);
  fail_unless(pnode->nd_consecutive_successes == 1);
  
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state == INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  }
/*
 * is_reporter_node()
 *
 * Looks up node_id and reports the value of its nd_is_alps_reporter flag.
 * The node returned by find_nodebyname() is unlocked again before returning.
 *
 * @param node_id - name of the node to check
 * @return the node's nd_is_alps_reporter value, or FALSE if the node is
 *         not found
 */
int is_reporter_node(

    const char *node_id)

{
    struct pbsnode *candidate = find_nodebyname(node_id);

    if (candidate == NULL)
    {
        return(FALSE);
    }

    int reporter_flag = candidate->nd_is_alps_reporter;

    candidate->unlock_node(__func__, NULL, LOGLEVEL);

    return(reporter_flag);
} /* END is_reporter_node() */
Example #5
0
int get_mom_node_version(
  
  const char *job_id, 
  int        &version)

  {
  job *pjob;
  pbsnode *pnode;

  pjob = svr_find_job(job_id, TRUE);
  if (pjob == NULL)
    return(PBSE_UNKJOBID);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pnode = find_nodebyname(pjob->ji_qs.ji_destin);
  if (pnode == NULL)
    return(PBSE_UNKNODE);

  mutex_mgr node_mutex(&pnode->nd_mutex, true);
  version = pnode->get_version();

  return(PBSE_NONE);
  }
Example #6
0
/*
 * process_alps_status()
 *
 * Walks the NUL-separated strings in status_info->str (terminated by an
 * empty string) and applies them to the nodes reported by nd_name.
 *
 * Each "node=" string switches the node being updated (via
 * determine_node_from_str()); before switching, the strings collected for
 * the previous node are saved with save_node_status() together with a
 * "node_index=<n>" entry. GPU status and reservation ids are handed to
 * their own helpers; everything else is appended to the temporary
 * attribute.
 *
 * @param nd_name     - name of the node that sent the update
 * @param status_info - the status strings
 * @return PBSE_NONE on success or when nd_name is unknown (update ignored);
 *         otherwise the error returned by decode_arst()
 */
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

  {
  char           *str;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
    {
    if (!strncmp(str, "node=", strlen("node=")))
      {
      /* a new node starts here: flush what was gathered for the previous one */
      if (str != status_info->str)
        {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);
        save_node_status(current, &temp);
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
      }

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      process_gpu_status(current, &str);
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      process_reservation_id(current, str);
      }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      free_arst(&temp);

      /* release the same locks the normal exit path releases, so a bad
       * status string does not leave the nodes locked */
      if (current != NULL)
        unlock_node(current, __func__, NULL, 0);

      unlock_node(parent, __func__, NULL, 0);

      return(rc);
      }

    /* perform any special processing */
    if (!strncmp(str, cproc_eq, cproc_eq_len))
      {
      set_ncpus(current, str);
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }

    } /* END processing the status update */

  if (current != NULL)
    {
    /* flush the strings gathered for the last node */
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, 0);
    }

  unlock_node(parent, __func__, NULL, 0);

  return(PBSE_NONE);
  } /* END process_alps_status() */
/*
 * check_if_orphaned()
 *
 * Thread-style entry point. vp is a heap-allocated string of the form
 * "<node name>:<reservation id>"; it is split in place and freed before
 * returning.
 *
 * If alps_reservations reports the reservation as orphaned:
 *   - the named node is marked INUSE_BUSY unless it is already busy or down
 *   - a PBS_BATCH_DeleteReservation request carrying the reservation id is
 *     sent to the node returned by get_next_login_node() (up to 3 connect
 *     attempts)
 *   - the reservation is removed from the orphaned list
 *
 * @param vp - the "<node>:<rsv id>" string; ownership passes to this function
 * @return always NULL
 */
void *check_if_orphaned(

  void *vp)

  {
  char           *node_name = (char *)vp;
  char           *rsv_id = NULL;
  std::string     job_id;
  batch_request  *preq;
  int             handle = -1;
  int             retries = 0;
  struct pbsnode *pnode;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* split "<node>:<rsv id>" in place; rsv_id points into node_name's buffer */
  if ((rsv_id = strchr(node_name, ':')) != NULL)
    {
    *rsv_id = '\0';
    rsv_id++;
    }
  else
    {
    free(node_name);
    return(NULL);
    }

  if (alps_reservations.is_orphaned(rsv_id, job_id) == true)
    {
    // Make sure the node with the orphan is not available for jobs
    if ((pnode = find_nodebyname(node_name)) != NULL)
      {
      if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node %s has an orphan but wasn't marked as busy. Marking as busy now.",
          node_name);
        log_err(-1, __func__, log_buf);

        update_node_state(pnode, INUSE_BUSY);
        }

      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      }

    if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL)
      {
      free(node_name);
      alps_reservations.remove_from_orphaned_list(rsv_id);
      return(NULL);
      }

    preq->rq_extend = strdup(rsv_id);

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno = 0;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id.c_str(),
        pnode->get_name());
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL);
        retries++;
        }

      /* unlock before the network transaction */
      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
      }

    /* release the request whether or not a login node was available;
     * freeing it only inside the branch above leaked it when
     * get_next_login_node() returned NULL */
    free_br(preq);

    alps_reservations.remove_from_orphaned_list(rsv_id);
    }

  free(node_name);

  return(NULL);
  } /* END check_if_orphaned() */
Example #8
0
/*
 * req_stat_node()
 *
 * Services a node status request. The requested id selects the scope:
 *   - "" or a leading '@'  : every node
 *   - ":ALL"               : every node
 *   - ":<property>"        : every node having that property
 *   - anything else        : the single node with that name
 *
 * Status entries are gathered into preq's reply; the reply is then sent, or
 * the request is rejected with the error that stopped the gathering.
 *
 * @param preq - the decoded status request
 * @return PBSE_NONE on success, PBSE_NONODES if the server has no nodes,
 *         PBSE_UNKNODE if the named node is not found, otherwise the error
 *         returned while gathering statuses
 */
int req_stat_node(

  struct batch_request *preq)

  {
  char                 *name;

  int                   rc    = PBSE_NONE;
  int                   scope = 0;   /* 0 = one node, 1 = all nodes, 2 = by property */
  int                   bad   = 0;

  struct pbsnode       *pnode = NULL;
  struct batch_reply   *preply;
  struct prop           props;
  svrattrl             *pal;

  if (LOGLEVEL >= 6)
    {
    log_record( PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, "entered");
    }

  /* nothing to report on without a node list */
  if (svr_totnodes <= 0)
    {
    rc = PBSE_NONODES;
    req_reject(rc, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file");

    return rc;
    }

  /* work out what the requested id refers to */
  name = preq->rq_ind.rq_status.rq_id;

  if ((name[0] == '\0') || (name[0] == '@'))
    {
    scope = 1;
    }
  else if ((name[0] == ':') && (name[1] != '\0'))
    {
    if (strcmp(name + 1, "ALL") == 0)
      {
      scope = 1;  /* pseudo-group for all nodes */
      }
    else
      {
      scope = 2;
      props.name = name + 1;
      props.mark = 1;
      props.next = NULL;
      }
    }

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;
  CLEAR_HEAD(preply->brp_un.brp_status);

  if (scope == 0)
    {
    /* a single, named node */
    pnode = find_nodebyname(name);

    if (pnode == NULL)
      {
      rc = PBSE_UNKNODE;
      req_reject(rc, 0, preq, NULL, "cannot locate specified node");
      return(rc);
      }

    /* alps reporters and ordinary (numa) nodes are gathered differently */
    if (pnode->nd_is_alps_reporter == TRUE)
      {
      rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
      }
    else
      {
      rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
      }

    unlock_node(pnode, __func__, "type == 0", LOGLEVEL);
    }
  else
    {
    /* every node, optionally filtered by property */
    all_nodes_iterator *iter = NULL;

    for (pnode = next_host(&allnodes,&iter,NULL);
         pnode != NULL;
         pnode = next_host(&allnodes,&iter,NULL))
      {
      if ((scope != 2) ||
          (hasprop(pnode, &props)))
        {
        if (pnode->nd_is_alps_reporter == TRUE)
          {
          rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
          }
        else
          {
          rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
          }

        if (rc != PBSE_NONE)
          {
          unlock_node(pnode, __func__, "type != 0, rc != 0, get_numa_statuses", LOGLEVEL);
          break;
          }

        unlock_node(pnode, __func__, "type != 0, rc == 0, get_numa_statuses", LOGLEVEL);
        }
      else
        {
        /* property filter did not match: skip this node */
        unlock_node(pnode, __func__, "type != 0, next_host", LOGLEVEL);
        }
      }

    if (iter != NULL)
      delete iter;
    }

  if (rc == PBSE_NONE)
    {
    /* SUCCESS */
    reply_send_svr(preq);
    }
  else if (rc != PBSE_UNKNODEATR)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    }
  else
    {
    /* an unknown attribute was requested: report which one */
    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    reply_badattr(rc, bad, pal, preq);
    }

  return(rc);
  }  /* END req_stat_node() */
Example #9
0
/*
 * req_gpuctrl()
 *
 * Handles a GPU control request (mode change and/or reset flags for one GPU
 * on one node).
 *
 * The requester must hold at least one of the manager/operator read or
 * write permission bits, otherwise the request is rejected with PBSE_PERM.
 *
 * When built with NVIDIA_GPUS the request is validated (node exists, node
 * is usable, node has real GPUs, GPU id exists, some action was requested,
 * mode is supported by driver version 260) and then relayed to the node's
 * mom. Without NVIDIA_GPUS the request is rejected with PBSE_NOSUP.
 *
 * @param preq - the decoded GPU control request; it is answered through
 *               req_reject() on every failure path seen here
 */
void req_gpuctrl(

  struct batch_request *preq)

  {
  char   *id = "req_gpuctrl";

  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
#ifdef NVIDIA_GPUS
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    rc = 0;
  int    conn;
#endif  /* NVIDIA_GPUS */

  /* only managers and operators may control GPUs */
  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);
    return;
    }

  /* pull the request parameters out of the decoded request */
  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

#ifdef NVIDIA_GPUS

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buffer,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return;
    }

  /* validate that the node is up */

  if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))
    {
    sprintf(
      log_buffer,
      "Node %s is not available",
      pnode->nd_name);
    req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer);
    return;
    }


  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus");
    return;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node");
    return;
    }

  /* validate that we have a real request */

  /* -1 in all three fields means the client asked for nothing */
  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified");
    return;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3");
    return;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           process_Dreply,
           ToServerDIS);

  if (conn >= 0)
    {
    /* process_gpu_request_reply is passed as the reply handler */
    if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

#else

    /* NOTE(review): the message is formatted even when LOGLEVEL < 3 and it
     * is never logged; the sprintf() calls in this function are also
     * unbounded -- confirm log_buffer is large enough */
    sprintf(
      log_buffer,
      "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

  if (LOGLEVEL >= 3)
    {
      log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* GPU control is not compiled in */
  req_reject(PBSE_NOSUP, 0, preq, NULL, NULL);

#endif  /* NVIDIA_GPUS */

  return;
  }
Example #10
0
/*
 * process_status_info()
 *
 * Applies a status update sent by node nd_name. The update is a list of
 * strings; NUMA_KEYWORD and "node=" strings switch which node the following
 * strings apply to, a handful of keywords are dispatched to dedicated
 * helpers, and everything else is accumulated into a comma-separated status
 * line that is saved with save_node_status() whenever the node switches and
 * once more at the end.
 *
 * @param nd_name     - name of the node that sent the update
 * @param status_info - the status strings, in the order received
 * @return SEND_HELLO if the update contained "first_update=true",
 *         otherwise PBSE_NONE (also when nd_name is unknown and the update
 *         is ignored)
 */
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  pbsnode        *current;
  bool            mom_job_sync = true;
  bool            auto_np = false;
  bool            down_on_error = false;
  bool            note_append_on_error = false;
  int             dont_change_state = FALSE;
  int             rc = PBSE_NONE;
  bool            send_hello = false;
  std::string     temp;   /* status line being built for the current node */
#ifdef PENABLE_LINUX_CGROUPS
  bool            force_layout_update = false;
#endif

  /* server attributes that steer the processing below */
  get_svr_attr_b(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_b(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_b(SRV_ATR_NoteAppendOnError, &note_append_on_error);
  get_svr_attr_b(SRV_ATR_DownOnError, &down_on_error);

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {

      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }
      
      dont_change_state = FALSE;

      /* a NULL result ends processing; current is then skipped at the end */
      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        {
        save_node_status(current, temp);
        temp.clear();
        }

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      /* NOTE(review): i is passed along with the whole vector, presumably so
       * the helper can consume the gpu section and advance i -- confirm */
      is_gpustat_get(current, i, status_info);
      continue;
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      continue;
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strcmp(str, "force_layout_update"))
      {
      force_layout_update = true;
      continue;
      }
    else if (!strncmp(str, "layout", 6))
      {
      /* NOTE(review): only 6 characters are matched but 7 are skipped; a
       * string that is exactly "layout" would be read past its terminator */
      // Add 7 to skip "layout="
      update_layout_if_needed(current, str + 7, force_layout_update);

      // reset this to false in case we have a mom hierarchy in place
      force_layout_update = false;

      continue;
      }
#endif
    else if (!strncmp(str, PLUGIN_EQUALS, PLUGIN_EQ_LEN))
      {
      current->capture_plugin_resources(str + PLUGIN_EQ_LEN);
      continue;
      }
    else if (!strncmp(str, "jobs=", 5))
      {
      /* walk job list reported by mom */
      sync_job_info *sji = new sync_job_info();
      sji->node_name = current->get_name();
      sji->job_info = str + 5;
      sji->sync_jobs = mom_job_sync;
        
      // sji is freed in sync_node_jobs()
      enqueue_threadpool_request(sync_node_jobs, sji, task_pool);

      continue;
      }
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);

      continue;
      }
    else 
      {
      // Save this string to our status line.
      if (temp.size() > 0)
        temp += ",";

      if (!strncmp(str, "message=", 8))
        {
        /* messages are stored with every newline replaced by a space */
        std::string no_newlines(str);
        size_t pos = no_newlines.find('\n');
        
        while (pos != std::string::npos)
          {
          no_newlines.replace(pos, 1, 1, ' ');
          pos = no_newlines.find('\n');
          }

        temp += no_newlines;
        }
      else
        temp += str;
    
      /* some strings also trigger an action besides being recorded */
      if (!strncmp(str, "state", 5))
        {
        if (dont_change_state == FALSE)
          process_state_str(current, str);
        }
      else if ((allow_any_mom == TRUE) &&
               (!strncmp(str, "uname", 5))) 
        {
        process_uname_str(current, str);
        }
      else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
        {
        if ((!strncmp(str, "message=ERROR", 13)) &&
            (down_on_error == TRUE))
          {
          /* mark the node down and keep later "state" strings from undoing it */
          update_node_state(current, INUSE_DOWN);
          dont_change_state = TRUE;

          if (note_append_on_error == true)
            {
            set_note_error(current, str);
            }
          }
        }
      else if (!strncmp(str,"macaddr=",8))
        {
        update_node_mac_addr(current,str + 8);
        }
      else if ((mom_job_sync == true) &&
               (!strncmp(str, "jobdata=", 8)))
        {
        /* update job attributes based on what the MOM gives us */      
        update_job_data(current, str + strlen("jobdata="));
        }
      else if ((auto_np) &&
               (!(strncmp(str, "ncpus=", 6))))

        {
        handle_auto_np(current, str);
        }
      else if (!strncmp(str, "version=", 8))
        {
        current->set_version(str + 8);
        }
      }

    } /* END processing strings */

  /* save what was gathered for the last node and release it */
  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
Example #11
0
int set_node_power_state(
    
  struct pbsnode **ppNode,
  unsigned short   newState)

  {
  struct pbsnode *pNode = *ppNode;
  if (pNode->nd_addrs == NULL)
    {
    return PBSE_BAD_PARAMETER;
    }

  if (newState == POWER_STATE_RUNNING)
    {
    static std::string interface;
    static unsigned char mac_addr[6];
    if (interface.length() == 0)
      {
      if (!getMacAddr(interface,mac_addr))
        {
        return PBSE_SYSTEM;
        }
      }

    int sock;
    if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0)
      {
      return PBSE_SYSTEM;
      }

    unsigned char outpack[1000];

    memcpy(outpack+6,mac_addr,6);
    memcpy(outpack,pNode->nd_mac_addr,6);
    outpack[12] = 0x08;
    outpack[13] = 0x42;
    int offset = 14;
    memset(outpack + offset,0xff,6);
    offset += 6;

    for (int i = 0;i < 16;i++)
      {
      memcpy(outpack + offset,pNode->nd_mac_addr,6);
      offset += 6;
      }

    int one = 1;
    if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    struct sockaddr whereto;
    whereto.sa_family = 0;
    snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str());

    if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    close(sock);
    return PBSE_NONE;
    }

  if (pNode->nd_job_usages.size() != 0)
    {
    //Can't change the power state on a node with running jobs.
    return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING;
    }
  struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState);
  if (request == NULL)
    {
    return PBSE_SYSTEM;
    }

  request->rq_ind.rq_powerstate = newState;
  pNode->nd_power_state_change_time = time(NULL);

  snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name);
  std::string hostname(request->rq_host);
  int rc = PBSE_NONE;

  {
    int handle = 0;
    int local_errno = 0;
    handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);
    if(handle < 0)
      {
      unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);
      *ppNode = NULL;
      return local_errno;
      }
    unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
    *ppNode = NULL;
    rc = issue_Drequest(handle, request,true);
    if(rc == PBSE_NONE)
      {
      rc = request->rq_reply.brp_code;
      if(rc < 0) rc = -rc;
      }
  }
  pNode = find_nodebyname(hostname.c_str());
  *ppNode = pNode;
  if ((rc == PBSE_NONE)&&(pNode != NULL))
    {
    pNode->nd_power_state = newState;
    }

  return(rc);
  }
Example #12
0
/*
 * process_alps_status()
 *
 * Applies a status update sent by the alps reporter node nd_name.
 *
 * Each "node=" string switches the node being updated (via
 * determine_node_from_str()); before switching, the strings collected for
 * the previous node are saved with save_node_status() together with a
 * "node_index=<n>" entry. GPU status is handed to process_gpu_status().
 * Reservation ids are processed at most once per update (tracked in a
 * hash); while one is processed the parent and current nodes are released
 * and then looked up again. Everything else is appended to the temporary
 * attribute.
 *
 * @param nd_name     - name of the reporter node that sent the update
 * @param status_info - the status strings, in the order received
 * @return PBSE_NONE on success, when nd_name is unknown, or when a node
 *         disappears while a reservation is recorded; otherwise the error
 *         returned by decode_arst()
 */
int process_alps_status(

  char           *nd_name,
  boost::ptr_vector<std::string>& status_info)

  {
  char           *current_node_id = NULL;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;
  hash_table_t   *rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* keep track of reservations so that they're only processed once per update */
  rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE);

  /* loop over each string */
  for(boost::ptr_vector<std::string>::iterator i = status_info.begin();i != status_info.end();i++)
    {
    const char *str = i->c_str();
    if (!strncmp(str, "node=", strlen("node=")))
      {
      /* a new node starts here: flush what was gathered for the previous one */
      if (i != status_info.begin())
        {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);
        save_node_status(current, &temp);
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
      }

    /* nothing can be applied until a "node=" string has selected a node */
    if(current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      /* str is not re-read here: it is re-assigned at the top of the next
       * iteration, and i may have been moved by the helper */
      rc = process_gpu_status(current, i,status_info.end());
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      const char *just_rsv_id = str + strlen(reservation_id);

      if (get_value_hash(rsv_ht, just_rsv_id) == -1)
        {
        add_hash(rsv_ht, 1, strdup(just_rsv_id));

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        unlock_node(parent, __func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        /* remember the name so the node can be found again after unlocking */
        current_node_id = strdup(current->nd_name);
        unlock_node(current, __func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          unlock_node(parent, __func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      free_arst(&temp);
      free_all_keys(rsv_ht);
      free_hash(rsv_ht);

      /* release the same locks the normal exit path releases, so a bad
       * status string does not leave the nodes locked (current is known to
       * be non-NULL here) */
      unlock_node(current, __func__, NULL, LOGLEVEL);
      unlock_node(parent, __func__, NULL, LOGLEVEL);

      return(rc);
      }

    /* perform any special processing */
    if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      set_ncpus(current, parent, str);
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }

    } /* END processing the status update */

  if (current != NULL)
    {
    /* flush the strings gathered for the last node */
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  unlock_node(parent, __func__, NULL, LOGLEVEL);

  free_all_keys(rsv_ht);
  free_hash(rsv_ht);

  return(PBSE_NONE);
  } /* END process_alps_status() */
Example #13
0
/*
 * req_stat_node()
 *
 * Services a node status request. The requested id selects the scope:
 *   - "" or a leading '@'  : every node
 *   - ":ALL"               : every node
 *   - ":<property>"        : every node having that property
 *   - anything else        : the single node with that name
 *
 * Status entries are gathered with status_node() into preq's reply; the
 * reply is then sent, or the request is rejected with the error that
 * stopped the gathering.
 */
void req_stat_node(

  struct batch_request *preq) /* ptr to the decoded request   */

  {
  char     *id = "req_stat_node";

  char               *name;
  struct pbsnode     *pnode = NULL;
  struct batch_reply *preply;
  svrattrl           *pal;
  struct prop         props;

  int     rc    = 0;
  int     scope = 0;   /* 0 = one node, 1 = all nodes, 2 = by property */
  int     idx;

  if (LOGLEVEL >= 6)
    {
    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, id, "entered");
    }

  /* nothing to report on without a node list */
  if ((pbsndmast == NULL) || (svr_totnodes <= 0))
    {
    req_reject(PBSE_NONODES, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file");

    return;
    }

  /* work out what the requested id refers to */
  name = preq->rq_ind.rq_status.rq_id;

  if ((name[0] == '\0') || (name[0] == '@'))
    {
    scope = 1;
    }
  else if ((name[0] == ':') && (name[1] != '\0'))
    {
    if (strcmp(name + 1, "ALL") == 0)
      {
      scope = 1;  /* pseudo-group for all nodes */
      }
    else
      {
      scope = 2;
      props.name = name + 1;
      props.mark = 1;
      props.next = NULL;
      }
    }
  else
    {
    /* a single, named node: it must exist */
    pnode = find_nodebyname(name);

    if (pnode == NULL)
      {
      req_reject(PBSE_UNKNODE, 0, preq, NULL, "cannot locate specified node");

      return;
      }
    }

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;
  CLEAR_HEAD(preply->brp_un.brp_status);

  if (scope == 0)
    {
    /* get status of the named node */
    rc = status_node(pnode, preq, &preply->brp_un.brp_status);
    }
  else
    {
    /* get status of all or several nodes; stop at the first failure */
    for (idx = 0; idx < svr_totnodes; idx++)
      {
      pnode = pbsndmast[idx];

      if ((scope != 2) || hasprop(pnode, &props))
        {
        rc = status_node(pnode, preq, &preply->brp_un.brp_status);

        if (rc != 0)
          break;
        }
      }
    }

  if (rc == 0)
    {
    /* SUCCESS */
    reply_send(preq);
    }
  else if (rc != PBSE_UNKNODEATR)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    }
  else
    {
    /* an unknown attribute was requested: report which one */
    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    reply_badattr(rc, bad, pal, preq);
    }

  return;
  }  /* END req_stat_node() */
/*************************************************
 * svr_is_request
 *
 * Return: svr_is_request always returns a non-zero value
 *         and it must call close_conn to close the connection
 *         before returning. PBSE_SOCKET_CLOSE is the code
 *         for a successful return. But whichever return
 *         code is used it must terminate the while loop
 *         in start_process_pbs_server_port.
 *************************************************/
int svr_is_request(
    
  struct tcp_chan *chan,
  int              version)

  {
  int                 command = 0;
  int                 ret = DIS_SUCCESS;
  int                 i;
  int                 err;
  char                nodename[PBS_MAXHOSTNAME];
  int                 perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  unsigned long       ipaddr;
  unsigned short      mom_port;
  unsigned short      rm_port;
  unsigned long       tmpaddr;

  struct sockaddr_in *addr = NULL;
  struct sockaddr     s_addr;
  unsigned int        len = sizeof(s_addr);

  struct pbsnode     *node = NULL;
  char               *node_name = NULL;

  char                log_buf[LOCAL_LOG_BUF_SIZE+1];

  command = disrsi(chan, &ret);

  if (ret != DIS_SUCCESS)
    goto err;

  if (LOGLEVEL >= 4)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "message received from sock %d (version %d)",
        chan->sock,
        version);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  if (getpeername(chan->sock, &s_addr, &len) != 0)
    {
    close_conn(chan->sock, FALSE);
    log_err(errno,__func__, (char *)"Cannot get socket name using getpeername\n");
    return(PBSE_SOCKET_CLOSE);
    }

  addr = (struct sockaddr_in *)&s_addr;

  if (version != IS_PROTOCOL_VER)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s",
      version,
      netaddr(addr));

    log_err(-1, __func__, log_buf);
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_DATA;
    }

  /* check that machine is known */
  mom_port = disrsi(chan, &ret);
  rm_port = disrsi(chan, &ret);

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message received from addr %s: mom_port %d  - rm_port %d",
      netaddr(addr),
      mom_port,
      rm_port);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  ipaddr = ntohl(addr->sin_addr.s_addr);
  
  if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL)
    {
    lock_node(node, __func__, "AVL_find", LOGLEVEL);
    } /* END if AVL_find != NULL) */
  else if (allow_any_mom)
    {
    char *name = get_cached_nameinfo(addr);

    if (name != NULL)
      snprintf(nodename, sizeof(nodename), "%s", name);
    else if (getnameinfo(&s_addr, len, nodename, sizeof(nodename)-1, NULL, 0, 0) != 0)
      {
      tmpaddr = ntohl(addr->sin_addr.s_addr);
      sprintf(nodename, "0x%lX", tmpaddr);
      }
    else
      insert_addr_name_info(nodename, NULL, addr);

    err = create_partial_pbs_node(nodename, ipaddr, perm);

    if (err == PBSE_NONE)
      {
      node = AVL_find(ipaddr, 0, ipaddrs);
       
      lock_node(node, __func__, "no error", LOGLEVEL);
      }                                                         
    }
    
  if (node == NULL)
    {
    /* node not listed in trusted ipaddrs list */
    
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)",
      netaddr(addr));
    
    if (LOGLEVEL >= 2)
      {
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }
    else
      {
      log_err(-1, __func__, log_buf);
      }
    
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_CLOSE;
    }

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message %s (%d) received from mom on host %s (%s) (sock %d)",
      PBSServerCmds2[command],
      command,
      node->nd_name,
      netaddr(addr),
      chan->sock);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  switch (command)
    {
    case IS_NULL:  /* a ping from server */

      DBPRT(("%s: IS_NULL\n", __func__))

      break;

    case IS_UPDATE:

      DBPRT(("%s: IS_UPDATE\n", __func__))

      i = disrui(chan, &ret);

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_UPDATE error %d on node %s\n", ret, node->nd_name);

          log_err(ret, __func__, log_buf);
          }

        goto err;
        }

      DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->nd_name, i))

      update_node_state(node, i);

      if ((node->nd_state & INUSE_DOWN) != 0)
        {
        node->nd_mom_reported_down = TRUE;
        }

      break;

    case IS_STATUS:

      if (LOGLEVEL >= 2)
        {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
            "IS_STATUS received from %s", node->nd_name);

        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }

      if ((node_name = strdup(node->nd_name)) == NULL)
        goto err;
      unlock_node(node, __func__, "before is_stat_get", LOGLEVEL);

      ret = is_stat_get(node_name, chan);

      node = find_nodebyname(node_name);

      if (ret == SEND_HELLO)
        {
        struct hello_info *hi = (struct hello_info *)calloc(1, sizeof(struct hello_info));
        write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS);

        hi->name = strdup(node_name);
        enqueue_threadpool_request(send_hierarchy_threadtask, hi);
        ret = DIS_SUCCESS;
        }
      else
        write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret);

      if(node != NULL)
        node->nd_stream = -1;

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_STATUS error %d on node %s", ret, node_name);

          log_err(ret, __func__, log_buf);
          }
        free(node_name);

        goto err;
        }
      free(node_name);

      break;

    default:

      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "unknown command %d sent from %s",
        command,
        node->nd_name);

      log_err(-1, __func__, log_buf);

      goto err;

      break;
    }  /* END switch (command) */

  /* must be closed because mom opens and closes this connection each time */
  close_conn(chan->sock, FALSE);

  if(node != NULL)
    unlock_node(node, __func__, "close", LOGLEVEL);
  
  return PBSE_SOCKET_CLOSE;

err:

  /* a DIS write error has occurred */

  if (node != NULL)
    {
    if (LOGLEVEL >= 1)
      {
      DBPRT(("%s: error processing node %s\n",
            __func__,
            node->nd_name))
      }

    sprintf(log_buf, "%s from %s(%s)",
      dis_emsg[ret],
      node->nd_name,
      netaddr(addr));
    
    unlock_node(node, __func__, "err", LOGLEVEL);
    }
  else
    {
/*
 * process_alps_status()
 *
 * Walks the status strings reported by the node named nd_name and applies
 * them to the node each group of strings describes. A string starting with
 * "node=" begins a new group: the strings accumulated so far are saved to the
 * previous node and determine_node_from_str() selects the next one.
 *
 * @param nd_name     - (I) name of the reporting (parent) node
 * @param status_info - (I) the status strings, one entry per attribute
 * @return PBSE_NONE on every path, including when the parent node is unknown
 *         or a node disappears part way through the update
 */
int process_alps_status(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char    *ccu_p = NULL;
  char           *current_node_id = NULL;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
#ifdef PENABLE_LINUX_CGROUPS
  int             numa_nodes = 0;
  int             sockets = 0;
#endif
  /* comma-separated status strings accumulated for the current node */
  std::string     temp;
  /* reservation ids already handled during this call */
  container::item_container<const char *> rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if we can't find the parent node, ignore the update */
  /* NOTE(review): parent is unlocked on every later exit path, so
   * find_nodebyname() presumably returns it locked - confirm */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (unsigned int i = 0; i < status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    if (!strncmp(str, "node=", strlen("node=")))
      {
      /* a new node record starts here: flush what was gathered for the
       * previous node (there is none before the very first string) */
      if (i != 0)
        {
        if (current != NULL)
          save_node_status(current, temp);
      
        temp.clear();
        }

      /* NOTE(review): the old current is never unlocked in this function
       * before being replaced, so determine_node_from_str() presumably
       * releases it and returns the new node locked - confirm */
      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        {
#ifdef PENABLE_LINUX_CGROUPS
        /* topology counters are per node */
        sockets = 0;
        numa_nodes = 0;
#endif

        continue;
        }
      }

    /* nothing to apply the string to until a node has been selected */
    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      /* i is handed to process_gpu_status() along with the whole vector;
       * NOTE(review): whether it advances i past the gpu strings depends on
       * its signature, which is not visible here */
      process_gpu_status(current, i, status_info);
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      const char *just_rsv_id = str + strlen(reservation_id);

      /* only the first occurrence of a reservation id is processed */
      rsv_ht.lock();
      if (rsv_ht.find(just_rsv_id) == NULL)
        {
        /* the stored pointer refers into status_info, which outlives rsv_ht */
        rsv_ht.insert(just_rsv_id,just_rsv_id);
        rsv_ht.unlock();

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        parent->unlock_node(__func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        /* remember the node's name so it can be looked up again once the
         * parent has been re-acquired */
        /* NOTE(review): the strdup() result is not checked for NULL */
        current_node_id = strdup(current->get_name());
        current->unlock_node(__func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free(current_node_id);
          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          parent->unlock_node(__func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free(current_node_id);
          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      else
        {
        rsv_ht.unlock();
        }
      }
    /* save this as is to the status strings */
    else
      {
      if (temp.size() > 0)
        temp += ",";
      temp += str;
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cprocs_eq (CPROCS=) */
      /*  for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       *  0 - Let ALPS choose whether or not to use Hyper-Threaded cores 
       *      (report all cores)
       *  1 - Do not use Hyper-Threaded cores
       *      (report only physical core (compute unit count)
       *  2 - Use Hyper-Threaded cores
       *      (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
        {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);

#ifdef PENABLE_LINUX_CGROUPS
      /* no numa count reported for this node yet: assume a single numa node */
      if (numa_nodes == 0)
        numa_nodes = 1;

      /* (re)build the layout when there is none or its thread count no
       * longer matches the node's execution slots */
      if ((current->nd_layout.is_initialized() == false) ||
          (current->nd_layout.getTotalThreads() != current->nd_slots.get_total_execution_slots()))
        {
        Machine m(current->nd_slots.get_total_execution_slots(), numa_nodes, sockets);
        current->nd_layout = m;
        }
#endif
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "totmem", 6))
      {
      set_total_memory(current, str);
      }
    /* NOTE(review): only the first 10 characters are compared although the
     * value is read at offset 11; numas is defined outside this function -
     * confirm the two lengths are meant to differ */
    else if (!strncmp(str, numas, 10))
      {
      // 11 is strlen("numa_nodes=")
      numa_nodes = strtol(str + 11, NULL, 10);
      }
    else if (!strncmp(str, "socket", 6))
      {
      // 7 is strlen("socket=")
      sockets = strtol(str + 7, NULL, 10);
      }
#endif

    } /* END processing the status update */

  /* flush the strings gathered for the last node and release it */
  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }

  parent->unlock_node(__func__, NULL, LOGLEVEL);

  return(PBSE_NONE);
  } /* END process_alps_status() */
/*************************************************
 * svr_is_request
 *
 * Return: svr_is_request always returns a non-zero value
 *         and it must call close_conn to close the connection
 *         before returning. PBSE_SOCKET_CLOSE is the code
 *         for a successful return. But whichever return
 *         code is used, it must terminate the while loop
 *         in start_process_pbs_server_port.
 *************************************************/
void *svr_is_request(

    void *v)

{
    int                 command = 0;
    int                 ret = DIS_SUCCESS;
    int                 i;
    int                 err;
    char                nodename[PBS_MAXHOSTNAME];
    int                 perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

    unsigned long       ipaddr;
    unsigned short      mom_port;
    unsigned short      rm_port;
    unsigned long       tmpaddr;
    struct sockaddr_in  addr;
    struct pbsnode     *node = NULL;
    char                log_buf[LOCAL_LOG_BUF_SIZE+1];
    char                msg_buf[80];
    char                tmp[80];
    int                 version;
    struct tcp_chan    *chan;
    long               *args;
    is_request_info    *isr = (is_request_info *)v;

    if (isr == NULL)
        return(NULL);

    chan = isr->chan;
    args = isr->args;

    version = disrsi(chan, &ret);

    if (ret != DIS_SUCCESS)
    {
        log_err(-1,  __func__, "Cannot read version - skipping this request.\n");
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    command = disrsi(chan, &ret);

    if (ret != DIS_SUCCESS)
    {
        snprintf(log_buf, sizeof(log_buf), "could not read command: %d", ret);
        log_err(-1, __func__, log_buf);
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    if (LOGLEVEL >= 4)
    {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message received from sock %d (version %d)",
                 chan->sock,
                 version);

        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }

    /* Just a note to let us know we only do IPv4 for now */
    addr.sin_family = AF_INET;
    memcpy(&addr.sin_addr, (void *)&args[1], sizeof(struct in_addr));
    addr.sin_port = args[2];

    if (version != IS_PROTOCOL_VER)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s",
                 version,
                 msg_buf);

        log_err(-1, __func__, log_buf);
        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    /* check that machine is known */
    mom_port = disrsi(chan, &ret);
    rm_port = disrsi(chan, &ret);

    if (LOGLEVEL >= 3)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message received from addr %s: mom_port %d  - rm_port %d",
                 msg_buf,
                 mom_port,
                 rm_port);

        log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

    ipaddr = args[1];

    if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL)
    {
        node->lock_node(__func__, "AVL_find", LOGLEVEL);
    } /* END if AVL_find != NULL) */
    else if (allow_any_mom)
    {
        const char *name = get_cached_nameinfo(&addr);

        if (name != NULL)
            snprintf(nodename, sizeof(nodename), "%s", name);
        else if (getnameinfo((struct sockaddr *)&addr, sizeof(addr), nodename, sizeof(nodename)-1, NULL, 0, 0) != 0)
        {
            tmpaddr = ntohl(addr.sin_addr.s_addr);
            sprintf(nodename, "0x%lX", tmpaddr);
        }
        else
            insert_addr_name_info(NULL, nodename);

        err = create_partial_pbs_node(nodename, ipaddr, perm);

        if (err == PBSE_NONE)
        {
            node = AVL_find(ipaddr, 0, ipaddrs);

            node->lock_node(__func__, "no error", LOGLEVEL);
        }
    }

    if (node == NULL)
    {
        /* node not listed in trusted ipaddrs list */
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)",
                 msg_buf);

        if (LOGLEVEL >= 2)
        {
            log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
        }
        else
        {
            log_err(-1, __func__, log_buf);
        }

        close_conn(chan->sock, FALSE);
        DIS_tcp_cleanup(chan);
        return(NULL);
    }

    if (LOGLEVEL >= 3)
    {
        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "message %s (%d) received from mom on host %s (%s) (sock %d)",
                 PBSServerCmds2[command],
                 command,
                 node->get_name(),
                 msg_buf,
                 chan->sock);

        log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

    mutex_mgr node_mutex(&node->nd_mutex, true);

    switch (command)
    {
    case IS_NULL:  /* a ping from server */

        DBPRT(("%s: IS_NULL\n", __func__))

        break;

    case IS_UPDATE:

        DBPRT(("%s: IS_UPDATE\n", __func__))

        i = disrui(chan, &ret);

        if (ret != DIS_SUCCESS)
        {
            if (LOGLEVEL >= 1)
            {
                snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                         "IS_UPDATE error %d on node %s\n", ret, node->get_name());

                log_err(ret, __func__, log_buf);
            }

            goto err;
        }

        DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->get_name(), i))

        update_node_state(node, i);

        if ((node->nd_state & INUSE_DOWN) != 0)
        {
            node->nd_mom_reported_down = TRUE;
        }

        break;

    case IS_STATUS:

    {
        std::string node_name = node->get_name();

        if (LOGLEVEL >= 2)
        {
            snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                     "IS_STATUS received from %s", node->get_name());

            log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }

        node_mutex.unlock();

        ret = is_stat_get(node_name.c_str(), chan);

        node = find_nodebyname(node_name.c_str());

        if (node != NULL)
        {
            node->nd_stream = -1;
            node_mutex.mark_as_locked();

            if (ret == SEND_HELLO)
            {
                //struct hello_info *hi = new hello_info(node->nd_id);
                write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS);

                hierarchy_handler.sendHierarchyToANode(node);
                ret = DIS_SUCCESS;
            }
            else
                write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret);
        }

        if (ret != DIS_SUCCESS)
        {
            if (LOGLEVEL >= 1)
            {
                snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                         "IS_STATUS error %d on node %s", ret, node_name.c_str());

                log_err(ret, __func__, log_buf);
            }

            goto err;
        }

        break;
    }

    default:

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
                 "unknown command %d sent from %s",
                 command,
                 node->get_name());

        log_err(-1, __func__, log_buf);

        goto err;

        break;
    }  /* END switch (command) */

    /* must be closed because mom opens and closes this connection each time */
    close_conn(chan->sock, FALSE);
    DIS_tcp_cleanup(chan);

    return(NULL);

err:

    /* a DIS write error has occurred */

    if (node != NULL)
    {
        if (LOGLEVEL >= 1)
        {
            DBPRT(("%s: error processing node %s\n",
                   __func__,
                   node->get_name()))
        }

        netaddr_long(args[1], tmp);
        sprintf(msg_buf, "%s:%ld", tmp, args[2]);

        sprintf(log_buf, "%s from %s(%s)",
                dis_emsg[ret],
                node->get_name(),
                msg_buf);
    }
    else
    {
Example #17
0
/*
 * req_gpuctrl_svr - validate a GPU control request and relay it to the MOM
 *
 * The request is checked in this order: the caller holds a manager or
 * operator permission, the target node exists, the node is up and powered
 * on, the node has real (not virtual) gpus, the gpu id exists on the node,
 * at least one action was requested, and the requested mode is supported by
 * the gpu driver. A request that passes every check is forwarded to the MOM
 * with issue_Drequest().
 *
 * @param preq - (I) the batch request; it is rejected with req_reject() on
 *               every failure path
 * @return PBSE_NONE when the request was relayed successfully, otherwise the
 *         PBSE_* code the request was rejected with
 */
int req_gpuctrl_svr(
    
  struct batch_request *preq)

  {
  int    rc = PBSE_NONE;
  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
  char   log_buf[LOCAL_LOG_BUF_SIZE+1];
  int    local_errno = 0;
  struct pbsnode *pnode = NULL;
  unsigned long   mom_addr;
  int    gpuidx = -1;
  int    conn;

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

  if (LOGLEVEL >= 7)
    {
    /* nodename and gpuid come from the request, so the write is bounded */
    snprintf(
      log_buf,
      sizeof(log_buf),
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, __func__, log_buf, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return PBSE_UNKNODE;
    }

  /* from here on the node is held and must be unlocked on every exit path */

  /* validate that the node is up */

  if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN)) ||
      (pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    rc = PBSE_UNKREQ;
    snprintf(log_buf, sizeof(log_buf), "Node %s is not available", pnode->nd_name);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "No action specified");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  /* copy the address while the node is still held: pnode must not be
   * dereferenced once it has been unlocked */
  mom_addr = pnode->nd_addrs[0];

  unlock_node(pnode, __func__, NULL, LOGLEVEL);
  pnode = NULL;

  conn = svr_connect(
           mom_addr,
           pbs_mom_port,
           &local_errno,
           NULL,
           NULL);

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE)
      req_reject(rc, 0, preq, NULL, NULL);
    else
      process_gpu_request_reply(preq);
    }
  else
    {
    /* report the failure to the caller as well as to the client */
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Failed to get connection to mom");
    }

  return rc;
  }
Example #18
0
/*
 * req_stat_node - service a "status node" batch request.
 *
 * An empty id, or one starting with '@', asks for every node in pbsndlist;
 * any other id names a single node. The reply is built with status_node()
 * and sent with reply_send(). On failure the request is rejected, or
 * answered with reply_badattr() when the error is PBSE_UNKNODEATR.
 *
 * preq - the status request; its rq_reply member is filled in here.
 */
void
req_stat_node(struct batch_request *preq)
{
	struct batch_reply  *reply;
	struct pbsnode	    *node = NULL;
	svrattrl	    *attr;
	char		    *id;
	int		    all_nodes;
	int		    err = 0;
	int		    idx;

	/* the server must have a node list before anything can be reported */
	if (pbsndlist == 0  ||  svr_totnodes <= 0) {
		req_reject(PBSE_NONODES, 0, preq);
		return;
	}

	resc_access_perm = preq->rq_perm;

	id = preq->rq_ind.rq_status.rq_id;
	all_nodes = (*id == '\0') || (*id == '@');

	/* a specific node was named: it has to exist */
	if (!all_nodes) {
		node = find_nodebyname(id);
		if (node == NULL) {
			req_reject(PBSE_UNKNODE, 0, preq);
			return;
		}
	}

	reply = &preq->rq_reply;
	reply->brp_choice = BATCH_REPLY_CHOICE_Status;
	CLEAR_HEAD(reply->brp_un.brp_status);

	if (all_nodes) {
		/* stop at the first node whose status cannot be produced */
		for (idx = 0; idx < svr_totnodes && err == 0; idx++) {
			node = pbsndlist[idx];
			err = status_node(node, preq,
				&reply->brp_un.brp_status);
		}
	} else {
		err = status_node(node, preq, &reply->brp_un.brp_status);
	}

	if (err == 0) {
		(void)reply_send(preq);
		return;
	}

	if (err == PBSE_UNKNODEATR) {
		/* "bad" is not declared in this function; it comes from file scope */
		attr = (svrattrl *)GET_NEXT(preq->rq_ind.
			rq_status.rq_attr);
		reply_badattr(err, bad, attr, preq);
	} else {
		req_reject(err, 0, preq);
	}
}
Example #19
0
void mgr_node_modify(

  struct batch_request *preq)  /* I */

  {
  int               need_todo = 0;

  int               rc;
  int               bad = 0;

  const char       *nodename = NULL;

  svrattrl         *plist;
  node_check_info   nci;

  struct pbsnode   *pnode = NULL;

  nodename = preq->rq_ind.rq_manager.rq_objname;
  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);

    return;
    }

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr);

  save_characteristic(pnode,&nci);

  rc = mgr_modify_node(
         &pnode,
         node_attr_def,
         ND_ATR_LAST,
         plist,
         preq->rq_perm,
         &bad,
         ATR_ACTION_ALTER);

  if (rc != 0)
    {
    /* In the specific node case, reply w/ error and return*/

    switch (rc)
      {
      case PBSE_INTERNAL:

      case PBSE_SYSTEM:

        req_reject(rc, bad, preq, NULL, NULL);

        break;

      case PBSE_NOATTR:

      case PBSE_ATTRRO:

      case PBSE_MUTUALEX:

      case PBSE_BADNDATVAL:

        reply_badattr(rc, bad, plist, preq);

        break;

      default:

        req_reject(rc, 0, preq, NULL, NULL);

        break;
      }

    if(pnode != NULL)
      {
      unlock_node(pnode, "mgr_node_set", (char *)"error", LOGLEVEL);
      pnode = NULL;
      }

    return;
    } /* END if (rc != 0) */
  else
    {
      /* modifications succeeded for this node */
    if(pnode != NULL)
      {
      chk_characteristic(pnode, &nci, &need_todo);
      }
    }

  if(pnode != NULL)
    {
    unlock_node(pnode, "mgr_node_set", (char *)"single_node", LOGLEVEL);
    pnode = NULL;
    }

  if (need_todo & WRITENODE_STATE)
    {
    /*some nodes set to "offline"*/
    write_node_state();

    need_todo &= ~(WRITENODE_STATE);
    }

  if (need_todo & WRITENODE_POWER_STATE)
    {
    /*some nodes changed power state*/
    write_node_power_state();

    need_todo &= ~(WRITENODE_POWER_STATE);
    }

  if (need_todo & WRITENODE_NOTE)
    {
    /*some nodes have new "note"s*/
    write_node_note();

    need_todo &= ~(WRITENODE_NOTE);
    }

  if (need_todo & WRITE_NEW_NODESFILE)
    {
    /*create/delete/prop/ntype change*/
    if (!update_nodes_file(NULL))
      need_todo &= ~(WRITE_NEW_NODESFILE);  /*successful on update*/
    }

  recompute_ntype_cnts();

  reply_ack(preq);  /*request completely successful*/

  return;
  }  /* END void mgr_node_set() */
/*
 * process_alps_status()
 *
 * Walks the NUL-separated status strings reported by the node named nd_name
 * and applies them to the node each group of strings describes. A string
 * starting with "node=" begins a new group: the attribute accumulated so far
 * is saved to the previous node and determine_node_from_str() selects the
 * next one.
 *
 * @param nd_name     - (I) name of the reporting (parent) node
 * @param status_info - (I) the status strings, each terminated by '\0', the
 *                      list ending at an empty string
 * @return PBSE_NONE, or the error from decode_arst() when the attribute
 *         cannot be initialized or a status string cannot be stored
 */
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

  {
  char           *str;
  char           *ccu_p = NULL;
  char           *current_node_id = NULL;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  /* status strings accumulated for the current node */
  pbs_attribute   temp;
  /* reservation ids already handled during this call */
  hash_table_t   *rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if we can't find the parent node, ignore the update */
  /* NOTE(review): parent is unlocked on every later exit path, so
   * find_nodebyname() presumably returns it locked - confirm */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* keep track of reservations so that they're only processed once per update */
  /* NOTE(review): the create_hash() result is used without a NULL check */
  rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
    {
    if (!strncmp(str, "node=", strlen("node=")))
      {
      /* a new node record starts here: tag what was gathered for the
       * previous node with its index and save it (there is no previous
       * node before the very first string) */
      if (str != status_info->str)
        {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);
        
        if (current != NULL)
          save_node_status(current, &temp);
        }

      /* NOTE(review): the old current is never unlocked in this function
       * before being replaced, so determine_node_from_str() presumably
       * releases it and returns the new node locked - confirm */
      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
      }

    /* nothing to apply the string to until a node has been selected */
    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      /* str is passed by address; NOTE(review): process_gpu_status()
       * presumably advances it past the gpu strings - confirm */
      process_gpu_status(current, &str);
      
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      char *just_rsv_id = str + strlen(reservation_id);

      /* -1 means this reservation id has not been seen in this update */
      if (get_value_hash(rsv_ht, just_rsv_id) == -1)
        {
        /* the key is a private copy; NOTE(review): presumably released by
         * free_all_keys() - confirm */
        add_hash(rsv_ht, 1, strdup(just_rsv_id));

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        unlock_node(parent, __func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        /* remember the node's name so it can be looked up again once the
         * parent has been re-acquired */
        /* NOTE(review): the strdup() result is not checked for NULL */
        current_node_id = strdup(current->nd_name);
        unlock_node(current, __func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          unlock_node(parent, __func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      }
    /* save this as is to the status strings */
    /* NOTE(review): this exit returns without unlocking parent or current,
     * unlike the other exits after the loop has started - confirm intent */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      free_arst(&temp);
      free_all_keys(rsv_ht);
      free_hash(rsv_ht);
      return(rc);
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cprocs_eq (CPROCS=) */
      /*  for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       *  0 - Let ALPS choose whether or not to use Hyper-Threaded cores 
       *      (report all cores)
       *  1 - Do not use Hyper-Threaded cores
       *      (report only physical core (compute unit count)
       *  2 - Use Hyper-Threaded cores
       *      (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
        {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }

    } /* END processing the status update */

  /* tag and save the strings gathered for the last node, then release it */
  /* NOTE(review): when current is NULL here temp is not passed to
   * free_arst(); whether it still owns memory depends on save_node_status()
   * and decode_arst(), which are not visible - confirm */
  if (current != NULL)
    {
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  unlock_node(parent, __func__, NULL, LOGLEVEL);

  free_all_keys(rsv_ht);
  free_hash(rsv_ht);

  return(PBSE_NONE);
  } /* END process_alps_status() */
Example #21
0
/*
 * site_check_user_map - decide whether the owner of a job may execute it as
 * the local user "luser".
 *
 * The owner name is the part of the job_owner attribute before the '@'.
 * Access is granted immediately (return 0) when the submitting host is
 * acceptable and either no proxy is requested (owner == luser) or the
 * AllowProxyUser server attribute is set to 1.  A host is acceptable when
 *   - it is the server host (and owner == luser), or
 *   - AllowNodeSubmit is 1 and the short host name is a known node, or
 *   - the short host name is listed in the SubmitHosts server attribute.
 * Otherwise the decision is delegated to acl_check() (MUNGE_AUTH builds) or
 * ruserok().
 *
 * @param pjob   (I) job whose owner/origin host are examined
 * @param luser  (I) local user name the job wants to run as
 * @param EMsg   (O, optional, minsize=1024) receives a failure description
 *
 * @return 0 when access is allowed; -1 when the origin host is missing, the
 *         owner name does not fit in PBS_MAXUSER characters, or (MUNGE_AUTH)
 *         the ACL check fails; otherwise the non-zero result of ruserok().
 */
int site_check_user_map(

  job  *pjob,  /* I */
  char *luser, /* I */
  char *EMsg)  /* O (optional,minsize=1024) */

  {
  char   *orighost;
  char    owner[PBS_MAXUSER + 1];
  char   *p1;
  size_t  owner_len = 0;
  int     rc;

  int   ProxyAllowed = 0;
  int   ProxyRequested = 0;
  int   HostAllowed = 0;

  char  *dptr;

#ifdef MUNGE_AUTH
  char  uh[PBS_MAXUSER + PBS_MAXHOSTNAME + 2];
  int   uh_len;
#endif

  if (EMsg != NULL)
    EMsg[0] = '\0';

  /* get just the owner name, without the "@host" */

  p1 = pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str;

  while ((*p1 != '@') && (*p1 != '\0'))
    {
    if (owner_len >= sizeof(owner) - 1)
      {
      /* The name does not fit.  Deny access instead of writing past the
       * buffer; silently truncating could make the name match a different,
       * shorter user name. */

      if (EMsg != NULL)
        strcpy(EMsg, "job owner name too long");

      return(-1);
      }

    owner[owner_len++] = *p1++;
    }

  owner[owner_len] = '\0';

  orighost = get_variable(pjob, pbs_o_host);

  if (orighost == NULL)
    {
    /* access denied */

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      msg_orighost);

    if (EMsg != NULL)
      strcpy(EMsg, "source host not specified");

    return(-1);
    }

  if ((server.sv_attr[(int)SRV_ATR_AllowProxyUser].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[(int)SRV_ATR_AllowProxyUser].at_val.at_long == 1))
    {
    ProxyAllowed = 1;
    }

  /* a proxy is requested whenever the job owner differs from the local user */

  if (strcmp(owner, luser) != 0)
    {
    ProxyRequested = 1;
    }

  if (!strcmp(orighost, server_host) && !strcmp(owner, luser))
    {
    /* submitting from server host, access allowed */

    /* NOTE: owner == luser here, so ProxyRequested is 0 and this test
     * always succeeds; it is kept to mirror the other host checks. */

    if ((ProxyRequested == 0) || (ProxyAllowed == 1))
      {
      return(0);
      }

    /* host is fine, must validate proxy via ruserok() */

    HostAllowed = 1;
    }

  /* make short host name (the '.' is restored on every path below) */

  if ((dptr = strchr(orighost, '.')) != NULL)
    {
    *dptr = '\0';
    }

  if ((HostAllowed == 0) &&
      (server.sv_attr[SRV_ATR_AllowNodeSubmit].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_AllowNodeSubmit].at_val.at_long == 1) &&
      (find_nodebyname(orighost) != NULL))
    {
    /* job submitted from compute host, access allowed */

    if (dptr != NULL)
      *dptr = '.';

    if ((ProxyRequested == 0) || (ProxyAllowed == 1))
      {
      return(0);
      }

    /* host is fine, must validate proxy via ruserok() */

    HostAllowed = 1;
    }

  if ((HostAllowed == 0) &&
      (server.sv_attr[(int)SRV_ATR_SubmitHosts].at_flags & ATR_VFLAG_SET))
    {

    struct array_strings *submithosts = NULL;
    char                 *testhost;
    int                   hostnum = 0;

    submithosts = server.sv_attr[(int)SRV_ATR_SubmitHosts].at_val.at_arst;

    for (hostnum = 0;hostnum < submithosts->as_usedptr;hostnum++)
      {
      testhost = submithosts->as_string[hostnum];

      if (!strcasecmp(testhost, orighost))
        {
        /* job submitted from host found in trusted submit host list, access allowed */

        if (dptr != NULL)
          *dptr = '.';

        if ((ProxyRequested == 0) || (ProxyAllowed == 1))
          {
          return(0);
          }

        /* host is fine, must validate proxy via ruserok() */

        HostAllowed = 1;

        break;
        }
      }  /* END for (hostnum) */
    }    /* END if (SRV_ATR_SubmitHosts) */

  /* restore the full host name before it is used for authorization */

  if (dptr != NULL)
    *dptr = '.';

#ifdef MUNGE_AUTH
  uh_len = snprintf(uh, sizeof(uh), "%s@%s", owner, orighost);

  if ((uh_len < 0) || ((size_t)uh_len >= sizeof(uh)))
    {
    /* "user@host" did not fit; a truncated name must never be matched
     * against the ACL, so treat it as "no match" */
    rc = 0;
    }
  else
    {
    rc = acl_check(&server.sv_attr[SRV_ATR_authusers], uh, ACL_User_Host);
    }

  if (rc <= 0)
    {
    /* rc == 0 means we did not find a match.
       this is a failure */
    if (EMsg != NULL)
      {
      snprintf(EMsg, 1024, "could not authorize user %s from %s",
               owner, orighost);
      }
    rc = -1; /* -1 is what set_jobexid is expecting for a failure*/
    }
  else
    {
    /*SUCCESS*/
    rc = 0; /* the call to ruserok below was in the code first. ruserok returns 
               0 on success but acl_check returns a positive value on success. 
               We set rc to 0 to be consistent with the original ruserok functionality */
    }
#else
  rc = ruserok(orighost, 0, owner, luser);

  if (rc != 0 && EMsg != NULL)
    {
    /* Test rc so as to not fill this message in the case of success, since other
     * callers might not fill this message in the case of their errors and
     * very misleading error message will go into the logs.
     */
    snprintf(EMsg, 1024, "ruserok failed validating %s/%s from %s",
             owner,
             luser,
             orighost);
    }
#endif

#ifdef sun
  /* broken Sun ruserok() sets process so it appears to be owned */
  /* by the luser, change it back for cosmetic reasons           */

  setuid(0);

#endif /* sun */

  return(rc);
  }  /* END site_check_user_map() */
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure we wait for a stray update that came after we changed the state to pass
    //by.
    if((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();
    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if its reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5))) 
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);
        dont_change_state = TRUE;
        set_note_error(current, str);
        }
      }
    else if (!strncmp(str,"macaddr=",8))
      {
      update_node_mac_addr(current,str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */      
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        if (jobstr != NULL)
          {
          free(jobstr);
          }
        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
Example #23
0
END_TEST

START_TEST(find_nodebyname_test)
{
    /*
     * Exercises find_nodebyname() with two nodes ("bob" and "tom") that are
     * registered both in the global allnodes container and as subnodes of
     * the ALPS reporter: first with cray_enabled off, then - after allnodes
     * has been emptied - with cray_enabled on.
     */
    struct pbsnode  node1;
    struct pbsnode  node2;
    struct pbsnode  reporter;
    struct pbsnode *pnode;

    /* Writable names held in local arrays instead of leaked strdup() copies.
     * NOTE(review): the original passed heap copies, presumably because
     * find_nodebyname() may write to its argument - confirm. */
    char            tom_0[] = "tom-0";
    char            tom_1[] = "tom-1";
    char            tom_10[] = "tom-10";
    char            bob_0[] = "bob/0";

    /* zero every node, including the reporter, so no field is read
     * uninitialized */
    memset(&node1, 0, sizeof(node1));
    memset(&node2, 0, sizeof(node2));
    memset(&reporter, 0, sizeof(reporter));

    alps_reporter = &reporter;

    node1.nd_name = (char *)"bob";
    node2.nd_name = (char *)"tom";
    alps_reporter->alps_subnodes = new all_nodes();
    initialize_allnodes(&allnodes, &node1, &node2);
    initialize_allnodes(alps_reporter->alps_subnodes, &node1, &node2);

    /* --- lookups with cray support disabled --- */
    cray_enabled = FALSE;

    pnode = find_nodebyname(NULL);
    fail_unless(pnode == NULL, "NULL nodename input fail");

    pnode = find_nodebyname("george");
    fail_unless(pnode == NULL, "george found but doesn't exist");

    pnode = find_nodebyname("bob");
    fail_unless(pnode == &node1, "couldn't find bob?");

    pnode = find_nodebyname("tom");
    fail_unless(pnode == &node2, "couldn't find tom?");

    /* check for NULL inside the assertion so that a failed lookup is
     * reported as a test failure instead of crashing on pnode->nd_name */
    pnode = find_nodebyname(tom_0);
    fail_unless((pnode != NULL) && !strcmp(pnode->nd_name, "0"), "found an incorrect node name");

    pnode = find_nodebyname(tom_1);
    fail_unless((pnode != NULL) && !strcmp(pnode->nd_name, "1"), "found an incorrect node name");

    pnode = find_nodebyname(tom_10);
    fail_unless(pnode == NULL, "found an incorrect node name");

    pnode = find_nodebyname(bob_0);
    fail_unless(pnode == &node1, "couldn't find bob with the exec_host format");

    /* --- lookups with cray support enabled and allnodes emptied --- */
    allnodes.lock();
    allnodes.clear();
    allnodes.unlock();

    cray_enabled = TRUE;

    pnode = find_nodebyname("tom");
    fail_unless(pnode == &node2, "couldn't find tom?");

    pnode = find_nodebyname("bob");
    fail_unless(pnode == &node1, "couldn't find bob?");

    pnode = find_nodebyname("george");
    fail_unless(pnode == NULL, "george found but doesn't exist");

    /* do not leave the global pointing at a stack object */
    alps_reporter = NULL;

}
Example #24
0
END_TEST


START_TEST(find_nodebyname_test)
  {
  /*
   * Exercises find_nodebyname() with two nodes ("bob" and "tom") that are
   * registered both in the global allnodes container and as subnodes of
   * the ALPS reporter: first with cray_enabled off, then - after allnodes
   * has been emptied - with cray_enabled on.
   */
  struct pbsnode  node1;
  struct pbsnode  node2;
  struct pbsnode  reporter;
  struct pbsnode *pnode;

  /* Writable names held in local arrays instead of leaked strdup() copies.
   * NOTE(review): the original passed heap copies, presumably because
   * find_nodebyname() may write to its argument - confirm. */
  char            tom_0[] = "tom-0";
  char            tom_1[] = "tom-1";
  char            tom_10[] = "tom-10";
  char            bob_0[] = "bob/0";

  alps_reporter = &reporter;

  node1.change_name("bob");
  node2.change_name("tom");
  alps_reporter->alps_subnodes = new all_nodes();
  initialize_allnodes(&allnodes, &node1, &node2);
  initialize_allnodes(alps_reporter->alps_subnodes, &node1, &node2);

  /* --- lookups with cray support disabled --- */
  cray_enabled = false;

  pnode = find_nodebyname(NULL);
  fail_unless(pnode == NULL, "NULL nodename input fail");

  pnode = find_nodebyname("george");
  fail_unless(pnode == NULL, "george found but doesn't exist");

  pnode = find_nodebyname("bob");
  fail_unless(pnode == &node1, "couldn't find bob?");
  pnode->unlock_node("a", "b", 0);

  pnode = find_nodebyname("tom");
  fail_unless(pnode == &node2, "couldn't find tom?");
  pnode->unlock_node("a", "b", 0);

  /* check for NULL inside the assertion so that a failed lookup is
   * reported as a test failure instead of crashing on pnode->get_name() */
  pnode = find_nodebyname(tom_0);
  fail_unless((pnode != NULL) && !strcmp(pnode->get_name(), "0"), "found an incorrect node name");
  pnode->unlock_node("a", "b", 0);

  pnode = find_nodebyname(tom_1);
  fail_unless((pnode != NULL) && !strcmp(pnode->get_name(), "1"), "found an incorrect node name");
  pnode->unlock_node("a", "b", 0);

  pnode = find_nodebyname(tom_10);
  fail_unless(pnode == NULL, "found an incorrect node name");

  pnode = find_nodebyname(bob_0);
  fail_unless(pnode == &node1, "couldn't find bob with the exec_host format");
  pnode->unlock_node("a", "b", 0);

  /* --- lookups with cray support enabled and allnodes emptied --- */
  allnodes.lock();
  allnodes.clear();
  allnodes.unlock();

  cray_enabled = true;

  /* NOTE(review): unlike the lookups above, the nodes returned below are
   * not unlocked - confirm whether that is intended for this path. */
  pnode = find_nodebyname("tom");
  fail_unless(pnode == &node2, "couldn't find tom?");

  pnode = find_nodebyname("bob");
  fail_unless(pnode == &node1, "couldn't find bob?");

  pnode = find_nodebyname("george");
  fail_unless(pnode == NULL, "george found but doesn't exist");

  /* do not leave the global pointing at a stack object */
  alps_reporter = NULL;

  }