Exemplo n.º 1
0
/*
 * record_reservation()
 *
 * Walks the jobs listed in pnode->nd_job_usages and records rsv_id on the
 * first one that svr_find_job() can locate: the job's reservation id attribute
 * is set, BATCH_PARTITION_ID=<rsv_id> is added to the job's variable list, and
 * track_alps_reservation() is called for the job.
 *
 * The node is unlocked while each job is looked up and is locked again before
 * the loop continues or exits.
 * NOTE(review): this pairing suggests pnode arrives locked - confirm with callers.
 *
 * @pre-cond: pnode and rsv_id must be valid pointers
 * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism
 * and on the job which has the node reserved, or -1 is returned and the reservation
 * is not recorded.
 * @param - pnode the node which is reporting the reservation
 * @param - rsv_id the id of the reservation being reported
 * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise
 * (including when no listed job id fits the local buffer or can be found)
 */
int record_reservation(

  struct pbsnode *pnode,
  const char     *rsv_id)

  {
  job            *pjob;
  bool            found_job = false;
  char            jobid[PBS_MAXSVRJOBID + 1];

  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    /* cray only allows one job per node, so any valid job will be the job that is 
     * reserving this node. */
    job_usage_info *jui = pnode->nd_job_usages[i];
    size_t          id_len = strlen(jui->jobid);

    /* never write past the local buffer: an id that does not fit is skipped
     * while the node is still locked */
    if (id_len >= sizeof(jobid))
      continue;

    /* copy the id (terminator included) so it stays usable after the node is unlocked */
    memcpy(jobid, jui->jobid, id_len + 1);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

      /* add environment variable BATCH_PARTITION_ID */
      char buf[1024];
      snprintf(buf, sizeof(buf), "BATCH_PARTITION_ID=%s", rsv_id);
      pbs_attribute  tempattr;
      clear_attr(&tempattr, &job_attr_def[JOB_ATR_variables]);
      job_attr_def[JOB_ATR_variables].at_decode(&tempattr,
        NULL, NULL, buf, 0);

      /* merge the temporary attribute into the job's variable list */
      job_attr_def[JOB_ATR_variables].at_set(
        &pjob->ji_wattr[JOB_ATR_variables], &tempattr, INCR);

      job_attr_def[JOB_ATR_variables].at_free(&tempattr);

      track_alps_reservation(pjob);
      found_job = true;

      /* release the job before taking the node lock again */
      job_mutex.unlock(); 
      lock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    else
      lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  if (found_job == false)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
Exemplo n.º 2
0
/*
 * create_alps_subnode()
 *
 * Allocates a node named node_id, initializes it as a cluster node, points it
 * at the parent's address list, copies the parent's properties onto it and
 * inserts it into parent->alps_subnodes.
 *
 * @param parent - the node that the new subnode is attached to
 * @param node_id - the name of the new subnode (duplicated with strdup)
 * @return the new subnode, on which lock_node() has been called, or NULL if
 * an allocation, the initialization or the attribute setup failed
 */
struct pbsnode *create_alps_subnode(

  struct pbsnode *parent,
  char           *node_id)

  {
  struct pbsnode *subnode = calloc(1, sizeof(struct pbsnode));
  char           *name_copy;
  svrattrl       *plist = NULL;
  int             bad;
  int             rc;

  /* nothing below may touch the node if the allocation failed */
  if (subnode == NULL)
    {
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  /* duplicate the name up front so a failed strdup never reaches the initializer */
  if ((name_copy = strdup(node_id)) == NULL)
    {
    free(subnode);
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  if (initialize_pbsnode(subnode, name_copy, NULL, NTYPE_CLUSTER) != PBSE_NONE)
    {
    /* NOTE(review): who owns name_copy after a failed initialize is not
     * visible from here - confirm whether it has to be freed as well */
    free(subnode);
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  if (create_subnode(subnode) == NULL)
    {
    free(subnode);
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  /* do we need to do something else here? */
  subnode->nd_addrs = parent->nd_addrs;

  rc = mgr_set_node_attr(subnode, 
      node_attr_def,
      ND_ATR_LAST,
      plist,
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR,
      &bad,
      (void *)subnode,
      ATR_ACTION_ALTER);

  if (rc != PBSE_NONE)
    {
    free(subnode);
    log_err(rc, __func__, "Couldn't set node attributes");
    return(NULL);
    }

  subnode->nd_ntype = NTYPE_CLUSTER;
  subnode->parent = parent;

  /* add any properties to the subnodes */
  copy_properties(subnode, parent);

  /* the node is locked before it is inserted and is returned that way */
  lock_node(subnode, __func__, NULL, 0);
    
  insert_node(&(parent->alps_subnodes), subnode);
  
  return(subnode);
  } /* END create_alps_subnode() */
Exemplo n.º 3
0
/*
 * create_alps_subnode()
 *
 * Allocates a node named node_id, initializes it as a cluster node with one
 * execution slot, points it at the parent's address list, copies the parent's
 * properties onto it and inserts it into parent->alps_subnodes.
 *
 * svr_clnodes is incremented for the new node; the increment is undone if the
 * attribute setup fails and the node is discarded.
 *
 * @param parent - the node that the new subnode is attached to
 * @param node_id - the name of the new subnode (duplicated with strdup)
 * @return the new subnode, on which lock_node() has been called, or NULL if
 * an allocation, the initialization or the attribute setup failed
 */
struct pbsnode *create_alps_subnode(

  struct pbsnode *parent,
  const char     *node_id)

  {
  struct pbsnode *subnode = (struct pbsnode *)calloc(1, sizeof(struct pbsnode));
  char           *name_copy;
  svrattrl       *plist = NULL;
  int             bad;
  int             rc = PBSE_NONE;

  /* nothing below may touch the node if the allocation failed */
  if (subnode == NULL)
    {
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  /* duplicate the name up front so a failed strdup never reaches the initializer */
  if ((name_copy = strdup(node_id)) == NULL)
    {
    free(subnode);
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  if (initialize_pbsnode(subnode, name_copy, NULL, NTYPE_CLUSTER, FALSE) != PBSE_NONE)
    {
    /* NOTE(review): who owns name_copy after a failed initialize is not
     * visible from here - confirm whether it has to be freed as well */
    free(subnode);
    log_err(ENOMEM, __func__, "");
    return(NULL);
    }

  // all nodes have at least 1 core
  add_execution_slot(subnode);
  
  // we need to increment this count for accuracy  
  svr_clnodes++;

  /* do we need to do something else here? */
  subnode->nd_addrs = parent->nd_addrs;

  rc = mgr_set_node_attr(subnode, 
      node_attr_def,
      ND_ATR_LAST,
      plist,
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR,
      &bad,
      (void *)subnode,
      ATR_ACTION_ALTER);

  if (rc != PBSE_NONE)
    {
    // the node is discarded without ever being inserted, so take it back out of the count
    svr_clnodes--;
    free(subnode);
    log_err(rc, __func__, "Couldn't set node attributes");
    return(NULL);
    }

  subnode->nd_ntype = NTYPE_CLUSTER;
  subnode->parent = parent;

  /* add any properties to the subnodes */
  copy_properties(subnode, parent);

  /* the node is locked before it is inserted and is returned that way */
  lock_node(subnode, __func__, NULL, LOGLEVEL);
    
  insert_node(&(parent->alps_subnodes), subnode);
  
  return(subnode);
  } /* END create_alps_subnode() */
Exemplo n.º 4
0
/*
 * record_reservation()
 *
 * Walks the jobs listed in pnode->nd_job_usages and records rsv_id on the
 * first one that svr_find_job() can locate: the job's reservation id attribute
 * is set and track_alps_reservation() is called for the job.
 *
 * The node is unlocked while each job is looked up and is locked again before
 * the loop continues or exits.
 * NOTE(review): this pairing suggests pnode arrives locked - confirm with callers.
 *
 * @pre-cond: pnode and rsv_id must be valid pointers
 * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism
 * and on the job which has the node reserved, or -1 is returned and the reservation
 * is not recorded.
 * @param - pnode the node which is reporting the reservation
 * @param - rsv_id the id of the reservation being reported
 * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise
 * (including when no listed job id fits the local buffer or can be found)
 */
int record_reservation(

  struct pbsnode *pnode,
  const char     *rsv_id)

  {
  job            *pjob;
  bool            found_job = false;
  char            jobid[PBS_MAXSVRJOBID + 1];

  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    /* cray only allows one job per node, so any valid job will be the job that is 
     * reserving this node. */
    job_usage_info *jui = pnode->nd_job_usages[i];
    size_t          id_len = strlen(jui->jobid);

    /* never write past the local buffer: an id that does not fit is skipped
     * while the node is still locked */
    if (id_len >= sizeof(jobid))
      continue;

    /* copy the id (terminator included) so it stays usable after the node is unlocked */
    memcpy(jobid, jui->jobid, id_len + 1);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

      track_alps_reservation(pjob);
      found_job = true;

      /* release the job before taking the node lock again */
      job_mutex.unlock(); 
      lock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    else
      lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  if (found_job == false)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
Exemplo n.º 5
0
/*
 * get_numa_from_str()
 *
 * Reads a numa index from the text that follows the first
 * strlen(NUMA_KEYWORD) characters of str and looks the matching numa node up
 * in np->node_boards.
 *
 * np is unlocked on every path. On success the numa node is locked and its
 * nd_lastupdate is set to the current time before it is returned.
 *
 * @param str - the text holding the numa index
 * @param np - the node whose numa nodes are searched
 * @return the numa node, or NULL (after logging an error) if np has no
 * node_boards or no numa node exists for the index
 */
struct pbsnode *get_numa_from_str(
    
  const char     *str, /* I */
  struct pbsnode *np)  /* I */

  {
  struct pbsnode *board;
  unsigned long   board_index;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  if (np->node_boards != NULL)
    {
    /* the index starts right after the keyword */
    board_index = atoi(str + strlen(NUMA_KEYWORD));

    board = AVL_find(board_index, np->nd_mom_port, np->node_boards);

    if (board != NULL)
      {
      /* SUCCESS - trade the parent's lock for the numa node's lock */
      unlock_node(np, __func__, "np numa update", LOGLEVEL);
      lock_node(board, __func__, "numa numa update", LOGLEVEL);

      board->nd_lastupdate = time(NULL);

      return(board);
      }

    /* ERROR - index not present in the tree */
    snprintf(log_buf, sizeof(log_buf),
      "Could not find NUMA index %lu for node %s\n",
      board_index,
      np->nd_name);
    }
  else
    {
    /* ERROR - node has no numa boards at all */
    snprintf(log_buf, sizeof(log_buf),
      "Node %s isn't declared to be NUMA, but mom is reporting\n",
      np->nd_name);
    }

  /* shared failure exit: log, release the parent, report nothing found */
  log_err(-1, __func__, log_buf);

  unlock_node(np, __func__, "np numa update", LOGLEVEL);

  return(NULL);
  } /* END get_numa_from_str() */
Exemplo n.º 6
0
/*
 * get_next_login_node()
 *
 * Examines the login node stored at logins.next_node while holding
 * logins.ln_mutex. If that node has the needed properties (when any are
 * given), has at least one execution slot that is not already spoken for, and
 * is neither down nor offline, its use count is bumped and it is returned
 * still locked. Otherwise it is unlocked and the result of
 * find_fitting_node() is returned instead.
 *
 * @param needed - properties the node has to have, or NULL for no requirement
 * @return the selected node, or NULL if the slot at logins.next_node is empty
 * (find_fitting_node()'s result is passed through unchanged)
 */
struct pbsnode *get_next_login_node(

  struct prop *needed)

  {
  struct pbsnode *candidate = NULL;
  login_node     *entry;
  int             lacks_props;
  int             unavailable;

  pthread_mutex_lock(logins.ln_mutex);
  entry = (login_node *)logins.ra->slots[logins.next_node].item;

  /* empty slot: nothing to hand out */
  if (entry == NULL)
    {
    pthread_mutex_unlock(logins.ln_mutex);

    return(candidate);
    }

  candidate = entry->pnode;
  lock_node(candidate, __func__, NULL, LOGLEVEL);

  /* properties are only checked when the caller asked for some */
  lacks_props = ((needed != NULL) &&
                 (hasprop(candidate, needed) == FALSE));

  /* must have at least one execution slot available */
  unavailable = ((candidate->nd_nsn - candidate->nd_np_to_be_used < 1) ||
                 ((candidate->nd_state & INUSE_DOWN) != 0) ||
                 ((candidate->nd_state & INUSE_OFFLINE) != 0));

  if (lacks_props || unavailable)
    {
    /* give this one back and search for another */
    unlock_node(candidate, __func__, NULL, LOGLEVEL);
    candidate = find_fitting_node(needed);
    }
  else
    {
    entry->times_used++;
    update_next_node_index(entry->times_used);
    }

  pthread_mutex_unlock(logins.ln_mutex);

  return(candidate);
  } /* END get_next_login_node() */
Exemplo n.º 7
0
struct pbsnode *get_next_login_node(

  struct prop *needed)

  {
  struct pbsnode *pnode = NULL;
  int             node_fits = TRUE;

  pthread_mutex_lock(logins.ln_mutex);
  login_node &ln = logins.nodes[logins.next_node];

  pnode = ln.pnode;
  lock_node(pnode, __func__, NULL, LOGLEVEL);
  
  if (needed != NULL)
    {
    if (hasprop(pnode, needed) == FALSE)
      {
      node_fits = FALSE;
      }
    }
  
  /* must have at least one execution slot available */
  if ((pnode->nd_slots.get_total_execution_slots() - pnode->nd_np_to_be_used < 1) ||
      ((pnode->nd_state & INUSE_NOT_READY) != 0) ||
      ((pnode->nd_state & INUSE_OFFLINE) != 0) ||
      (pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    node_fits = FALSE;
    }
  
  if (node_fits == FALSE)
    {
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    pnode = find_fitting_node(needed);
    }
  else
    {
    ln.times_used++;
    update_next_node_index(ln.times_used);
    }

  pthread_mutex_unlock(logins.ln_mutex);

  return(pnode);
  } /* END get_next_login_node() */
Exemplo n.º 8
0
/*
 * check_node()
 *
 * Locks the node held by ln and decides whether it can be used: it has to
 * pass hasprop() for needed, have at least one execution slot that is not
 * already spoken for, and be neither down nor offline.
 *
 * @param ln - the login node entry whose node is checked
 * @param needed - the properties handed to hasprop()
 * @return the node, still locked, if it is usable; otherwise the node is
 * unlocked and NULL is returned
 */
struct pbsnode *check_node(

  login_node  *ln,
  struct prop *needed)

  {
  struct pbsnode *candidate = ln->pnode;

  lock_node(candidate, __func__, NULL, LOGLEVEL);

  /* reject on the first failed requirement */
  if ((hasprop(candidate, needed) != TRUE) ||
      (candidate->nd_nsn - candidate->nd_np_to_be_used < 1) ||
      ((candidate->nd_state & INUSE_DOWN) != 0) ||
      ((candidate->nd_state & INUSE_OFFLINE) != 0))
    {
    unlock_node(candidate, __func__, NULL, LOGLEVEL);
    return(NULL);
    }

  return(candidate);
  } /* END check_node() */
Exemplo n.º 9
0
/* report the status of every numa node that belongs to a node instead of
 * the status of the node itself; a node without numa nodes is reported
 * directly
 *
 * @param pnode - the node to report on
 * @param preq - the batch request
 * @param bad - passed through to status_node()
 * @param pstathd - the list to add this response to
 *
 * @return - 0 on SUCCESS, otherwise the first error code returned by
 * status_node()
 */
int get_numa_statuses(

  struct pbsnode       *pnode,    /* ptr to node receiving status query */
  struct batch_request *preq,
  int                  *bad,      /* O */
  tlist_head           *pstathd)  /* head of list to append status to  */

  {
  int             board;
  int             rc = 0;
  struct pbsnode *numa_node;

  /* no numa nodes, just return the status for this node */
  if (pnode->num_node_boards == 0)
    return(status_node(pnode, preq, bad, pstathd));

  for (board = 0; board < pnode->num_node_boards; board++)
    {
    numa_node = AVL_find(board, pnode->nd_mom_port, pnode->node_boards);

    /* indexes without a numa node are skipped */
    if (numa_node != NULL)
      {
      lock_node(numa_node, __func__, NULL, LOGLEVEL);
      rc = status_node(numa_node, preq, bad, pstathd);
      unlock_node(numa_node, __func__, NULL, LOGLEVEL);

      /* stop at the first failure and hand its code back */
      if (rc != PBSE_NONE)
        break;
      }
    }

  return(rc);
  } /* END get_numa_statuses() */
Exemplo n.º 10
0
/*
 * check_node() 
 *
 * Locks the node held by ln and decides whether it can be used: it has to
 * pass hasprop() for needed, have at least one free execution slot that is
 * not already spoken for, have neither INUSE_NOT_READY nor INUSE_OFFLINE set,
 * and be in POWER_STATE_RUNNING.
 *
 * @param ln - a pointer to the login node struct containing the 
 * node that should be checked
 * @pre-cond - ln must be a pointer to a valid login node struct
 * @param needed - an optional pointer to the required properties for
 * the login node to have.
 * @return a pointer to the node, still locked, if it is valid to be used;
 * otherwise the node is unlocked and NULL is returned
 */
struct pbsnode *check_node(

  login_node  *ln,
  struct prop *needed)

  {
  struct pbsnode *candidate = ln->pnode;

  lock_node(candidate, __func__, NULL, LOGLEVEL);

  /* reject on the first failed requirement */
  if ((hasprop(candidate, needed) != TRUE) ||
      (candidate->nd_slots.get_number_free() - candidate->nd_np_to_be_used < 1) ||
      ((candidate->nd_state & INUSE_NOT_READY) != 0) ||
      ((candidate->nd_state & INUSE_OFFLINE) != 0) ||
      (candidate->nd_power_state != POWER_STATE_RUNNING))
    {
    unlock_node(candidate, __func__, NULL, LOGLEVEL);
    return(NULL);
    }

  return(candidate);
  } /* END check_node() */
Exemplo n.º 11
0
/*
 * find_alpsnode_by_name()
 *
 * Looks node_id up in the parent's alps_subnodes hash while holding
 * alps_subnodes.allnodes_mutex and fetches the node stored at the resulting
 * slot index.
 *
 * @param parent - the node whose alps subnodes are searched
 * @param node_id - the key handed to get_value_hash()
 * @return the matching subnode, locked after the container mutex has been
 * released, or NULL if the hash lookup yields a negative index or the slot
 * is empty
 */
struct pbsnode *find_alpsnode_by_name(

  struct pbsnode *parent,
  char           *node_id)

  {
  struct pbsnode *match;
  int             slot;

  pthread_mutex_lock(parent->alps_subnodes.allnodes_mutex);

  slot = get_value_hash(parent->alps_subnodes.ht, node_id);

  /* a negative index means the name is not in the hash */
  match = (slot >= 0) ?
    (struct pbsnode *)parent->alps_subnodes.ra->slots[slot].item :
    NULL;

  pthread_mutex_unlock(parent->alps_subnodes.allnodes_mutex);

  if (match == NULL)
    return(NULL);

  lock_node(match, __func__, NULL, 0);

  return(match);
  } /* END find_alpsnode_by_name() */
Exemplo n.º 12
0
/*************************************************
 * svr_is_request
 *
 * Return: svr_is_request always returns a non-zero value
 *         and it must call close_conn to close the connection
 *         before returning. PBSE_SOCKET_CLOSE is the code
 *         for a successful return. But whichever return
 *         code is used, it must terminate the while loop
 *         in start_process_pbs_server_port.
 *************************************************/
int svr_is_request(
    
  struct tcp_chan *chan,
  int              version)

  {
  int                 command = 0;
  int                 ret = DIS_SUCCESS;
  int                 i;
  int                 err;
  char                nodename[PBS_MAXHOSTNAME];
  int                 perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  unsigned long       ipaddr;
  unsigned short      mom_port;
  unsigned short      rm_port;
  unsigned long       tmpaddr;

  struct sockaddr_in *addr = NULL;
  struct sockaddr     s_addr;
  unsigned int        len = sizeof(s_addr);

  struct pbsnode     *node = NULL;
  char               *node_name = NULL;

  char                log_buf[LOCAL_LOG_BUF_SIZE+1];

  command = disrsi(chan, &ret);

  if (ret != DIS_SUCCESS)
    goto err;

  if (LOGLEVEL >= 4)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "message received from sock %d (version %d)",
        chan->sock,
        version);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  if (getpeername(chan->sock, &s_addr, &len) != 0)
    {
    close_conn(chan->sock, FALSE);
    log_err(errno,__func__, (char *)"Cannot get socket name using getpeername\n");
    return(PBSE_SOCKET_CLOSE);
    }

  addr = (struct sockaddr_in *)&s_addr;

  if (version != IS_PROTOCOL_VER)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "protocol version %d unknown from %s",
      version,
      netaddr(addr));

    log_err(-1, __func__, log_buf);
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_DATA;
    }

  /* check that machine is known */
  mom_port = disrsi(chan, &ret);
  rm_port = disrsi(chan, &ret);

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message received from addr %s: mom_port %d  - rm_port %d",
      netaddr(addr),
      mom_port,
      rm_port);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  ipaddr = ntohl(addr->sin_addr.s_addr);
  
  if ((node = AVL_find(ipaddr, mom_port, ipaddrs)) != NULL)
    {
    lock_node(node, __func__, "AVL_find", LOGLEVEL);
    } /* END if AVL_find != NULL) */
  else if (allow_any_mom)
    {
    char *name = get_cached_nameinfo(addr);

    if (name != NULL)
      snprintf(nodename, sizeof(nodename), "%s", name);
    else if (getnameinfo(&s_addr, len, nodename, sizeof(nodename)-1, NULL, 0, 0) != 0)
      {
      tmpaddr = ntohl(addr->sin_addr.s_addr);
      sprintf(nodename, "0x%lX", tmpaddr);
      }
    else
      insert_addr_name_info(nodename, NULL, addr);

    err = create_partial_pbs_node(nodename, ipaddr, perm);

    if (err == PBSE_NONE)
      {
      node = AVL_find(ipaddr, 0, ipaddrs);
       
      lock_node(node, __func__, "no error", LOGLEVEL);
      }                                                         
    }
    
  if (node == NULL)
    {
    /* node not listed in trusted ipaddrs list */
    
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "bad attempt to connect from %s (address not trusted - check entry in server_priv/nodes)",
      netaddr(addr));
    
    if (LOGLEVEL >= 2)
      {
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }
    else
      {
      log_err(-1, __func__, log_buf);
      }
    
    close_conn(chan->sock, FALSE);
    return PBSE_SOCKET_CLOSE;
    }

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "message %s (%d) received from mom on host %s (%s) (sock %d)",
      PBSServerCmds2[command],
      command,
      node->nd_name,
      netaddr(addr),
      chan->sock);

    log_event(PBSEVENT_ADMIN,PBS_EVENTCLASS_SERVER,__func__,log_buf);
    }

  switch (command)
    {
    case IS_NULL:  /* a ping from server */

      DBPRT(("%s: IS_NULL\n", __func__))

      break;

    case IS_UPDATE:

      DBPRT(("%s: IS_UPDATE\n", __func__))

      i = disrui(chan, &ret);

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_UPDATE error %d on node %s\n", ret, node->nd_name);

          log_err(ret, __func__, log_buf);
          }

        goto err;
        }

      DBPRT(("%s: IS_UPDATE %s 0x%x\n", __func__, node->nd_name, i))

      update_node_state(node, i);

      if ((node->nd_state & INUSE_DOWN) != 0)
        {
        node->nd_mom_reported_down = TRUE;
        }

      break;

    case IS_STATUS:

      if (LOGLEVEL >= 2)
        {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
            "IS_STATUS received from %s", node->nd_name);

        log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
        }

      if ((node_name = strdup(node->nd_name)) == NULL)
        goto err;
      unlock_node(node, __func__, "before is_stat_get", LOGLEVEL);

      ret = is_stat_get(node_name, chan);

      node = find_nodebyname(node_name);

      if (ret == SEND_HELLO)
        {
        struct hello_info *hi = (struct hello_info *)calloc(1, sizeof(struct hello_info));
        write_tcp_reply(chan, IS_PROTOCOL, IS_PROTOCOL_VER, IS_STATUS, DIS_SUCCESS);

        hi->name = strdup(node_name);
        enqueue_threadpool_request(send_hierarchy_threadtask, hi);
        ret = DIS_SUCCESS;
        }
      else
        write_tcp_reply(chan,IS_PROTOCOL,IS_PROTOCOL_VER,IS_STATUS,ret);

      if(node != NULL)
        node->nd_stream = -1;

      if (ret != DIS_SUCCESS)
        {
        if (LOGLEVEL >= 1)
          {
          snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
              "IS_STATUS error %d on node %s", ret, node_name);

          log_err(ret, __func__, log_buf);
          }
        free(node_name);

        goto err;
        }
      free(node_name);

      break;

    default:

      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "unknown command %d sent from %s",
        command,
        node->nd_name);

      log_err(-1, __func__, log_buf);

      goto err;

      break;
    }  /* END switch (command) */

  /* must be closed because mom opens and closes this connection each time */
  close_conn(chan->sock, FALSE);

  if(node != NULL)
    unlock_node(node, __func__, "close", LOGLEVEL);
  
  return PBSE_SOCKET_CLOSE;

err:

  /* a DIS write error has occurred */

  if (node != NULL)
    {
    if (LOGLEVEL >= 1)
      {
      DBPRT(("%s: error processing node %s\n",
            __func__,
            node->nd_name))
      }

    sprintf(log_buf, "%s from %s(%s)",
      dis_emsg[ret],
      node->nd_name,
      netaddr(addr));
    
    unlock_node(node, __func__, "err", LOGLEVEL);
    }
  else
    {