Пример #1
0
/*
 * send_power_state_to_mom()
 *
 * Thread-style entry point that forwards a prepared batch request to the
 * MOM of the node named in the request's rq_host field.
 *
 * @param arg - a struct batch_request *. It is released here on every path
 *              where it is not handed to issue_Drequest().
 * @return always NULL
 */
void * send_power_state_to_mom(void *arg)
  {
  struct batch_request  *pRequest = (struct batch_request *)arg;
  struct pbsnode        *pNode = find_nodebyname(pRequest->rq_host);

  if (pNode == NULL)
    {
    /* nobody to send to, so drop the request */
    free_br(pRequest);
    return NULL;
    }

  int local_errno = 0;
  int handle = svr_connect(pNode->nd_addrs[0], pNode->nd_mom_port, &local_errno, pNode, NULL);

  if (handle < 0)
    {
    unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);

    /* The request was never sent. Release it exactly as the unknown-node
     * path above does, otherwise it is leaked. */
    free_br(pRequest);
    return NULL;
    }

  unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);

  /* NOTE(review): the request is not freed after issue_Drequest() here.
   * Confirm whether issue_Drequest() takes ownership in this code base. */
  issue_Drequest(handle, pRequest);

  return NULL;
  }
Пример #2
0
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int                   handle = -1;
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;
  char                 *job_momname = NULL;
  job                  *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return(PBSE_JOBNOTFOUND);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || 
      (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
    {
    job_mutex.unlock();
    snprintf(log_buf, sizeof(log_buf),
      "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return PBSE_BAD_PARAMETER;
    }

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  job_mutex.unlock();

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return(rc);
  }  /* END stat_to_mom() */
Пример #3
0
/*
 * req_gpuctrl - service a GPU control batch request.
 *
 * The requester must hold at least one manager/operator permission bit,
 * otherwise the request is rejected with PBSE_PERM.
 *
 * When built with NVIDIA_GPUS the request is validated against the target
 * node (node exists, is usable, has real gpus, gpu id exists, an action was
 * given, driver/mode combination is accepted) and then relayed to the MOM;
 * every validation failure rejects the request and returns.
 *
 * Without NVIDIA_GPUS the request is rejected with PBSE_NOSUP.
 *
 * @param preq - the request being serviced
 */
void req_gpuctrl(

  struct batch_request *preq)

  {
  char   *id = "req_gpuctrl";

  char   *mom_name;
  char   *gpu_name;
  int     mode;
  int     perm_reset;
  int     vol_reset;
#ifdef NVIDIA_GPUS
  struct pbsnode *np;
  int     gpu_index;
  int     rc;
  int     conn;
#endif  /* NVIDIA_GPUS */

  /* only managers and operators may issue gpu control requests */

  if (!(preq->rq_perm &
        (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)))
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);
    return;
    }

  /* pull the request parameters out once */

  mom_name   = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpu_name   = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  mode       = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  perm_reset = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  vol_reset  = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

#ifdef NVIDIA_GPUS

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      mom_name, gpu_name, mode, perm_reset, vol_reset);

    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* the mom node has to exist ... */

  np = find_nodebyname(mom_name);

  if (np == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return;
    }

  /* ... and be up ... */

  if ((np->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN)) != 0)
    {
    sprintf(log_buffer, "Node %s is not available", np->nd_name);
    req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer);
    return;
    }

  /* ... and carry real (not virtual) gpus */

  if (!np->nd_gpus_real)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus");
    return;
    }

  /* the gpu id has to be known on that node */

  gpu_index = gpu_entry_by_id(np, gpu_name, FALSE);

  if (gpu_index == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node");
    return;
    }

  /* at least one action must have been requested */

  if (mode == -1 && perm_reset == -1 && vol_reset == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified");
    return;
    }

  /* driver version 260 is only accepted with modes up to 2 */

  if (np->nd_gpusn[gpu_index].driver_ver == 260 && mode > 2)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3");
    return;
    }

  /* relay the request to the mom, which performs the actual change */

  preq->rq_orgconn = preq->rq_conn;  /* remember the client socket */

  conn = svr_connect(np->nd_addrs[0], pbs_mom_port, process_Dreply, ToServerDIS);

  if (conn < 0)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    return;
    }

  rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL);

  if (rc != 0)
    req_reject(rc, 0, preq, NULL, NULL);

#else

  /* the message is always formatted but only logged at LOGLEVEL 3 and up */

  sprintf(log_buffer,
    "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d",
    mom_name, gpu_name, mode, perm_reset, vol_reset);

  if (LOGLEVEL >= 3)
    log_ext(-1, id, log_buffer, LOG_INFO);

  req_reject(PBSE_NOSUP, 0, preq, NULL, NULL);

#endif  /* NVIDIA_GPUS */

  return;
  }
Пример #4
0
/*
 * req_gpuctrl_svr()
 *
 * Validates a GPU control request against the target node and relays it
 * to that node's MOM. Every validation failure rejects the request via
 * req_reject() and returns the same error code.
 *
 * @param preq - the GPU control request
 * @return PBSE_NONE when the request was relayed and the reply processed;
 *         PBSE_PERM, PBSE_UNKNODE or PBSE_UNKREQ for failures detected
 *         here; otherwise the code returned by issue_Drequest().
 */
int req_gpuctrl_svr(
    
  struct batch_request *preq)

  {
  int rc = PBSE_NONE;
  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
  char   log_buf[LOCAL_LOG_BUF_SIZE+1];
  int    local_errno = 0;
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    conn;
  unsigned long mom_addr;

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    rc = PBSE_PERM;
    snprintf(log_buf, sizeof(log_buf),
        "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

  if (LOGLEVEL >= 7)
    {
    /* node name and gpu id come from the request, so bound the write */
    snprintf(
      log_buf,
      sizeof(log_buf),
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, __func__, log_buf, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return PBSE_UNKNODE;
    }

  /* validate that the node is up */

  if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    rc = PBSE_UNKREQ;
    snprintf(log_buf, sizeof(log_buf), "Node %s is not available", pnode->nd_name);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "No action specified");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* save client socket */

  /* Read the address while the node is still held; pnode must not be
   * dereferenced after unlock_node(). */
  mom_addr = pnode->nd_addrs[0];
  unlock_node(pnode, __func__, NULL, LOGLEVEL);

  conn = svr_connect(
           mom_addr,
           pbs_mom_port,
           &local_errno,
           NULL,
           NULL);

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE)
      req_reject(rc, 0, preq, NULL, NULL);
    else
      process_gpu_request_reply(preq);
    }
  else
    {
    /* report the failure to the caller as well as to the client */
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Failed to get connection to mom");
    }

  return rc;
  }
Пример #5
0
/*
 * stat_to_mom()
 *
 * Builds a PBS_BATCH_StatusJob request, sends it to the MOM recorded for
 * the job and, on success, hands the reply to stat_update().
 *
 * @param job_id - id of the job to look up; also copied into the request
 *                 when cntl->sc_type is 1 (otherwise the request id is
 *                 left empty, meaning "stat all")
 * @param cntl   - (M) status control block passed on to stat_update()
 * @return PBSE_NONE on success; PBSE_JOBNOTFOUND, PBSE_MEM_MALLOC,
 *         PBSE_UNKNODE (unknown node, or the job has no exec_host),
 *         PBSE_NORELYMOM or PBSE_CONNECT for failures detected here;
 *         otherwise the code returned by issue_Drequest() (PBSE_SYSTEM is
 *         reported as PBSE_MEM_MALLOC).
 */
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int handle = -1;
  unsigned long job_momaddr = -1;
  unsigned short job_momport = -1;
  int have_exec_host = 0;
  char *job_momname = NULL;
  job *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return PBSE_JOBNOTFOUND;

  /* copy what is needed from the job, then release it */
  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;

  /* strdup(NULL) is undefined, so only duplicate an exec_host that exists */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    have_exec_host = 1;
    job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    }

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  if (!have_exec_host)
    return PBSE_UNKNODE;

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  node = tfind_addr(job_momaddr, job_momport, job_momname);
  free(job_momname);

  if (node == NULL)
    {
    /* the request was allocated above and must not be leaked */
    free_br(newrq);
    return PBSE_UNKNODE;
    }

  if (node->nd_state & INUSE_DOWN)
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, sizeof(log_buf),
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM; the node is released before connecting */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL, ToServerDIS);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return rc;
  }  /* END stat_to_mom() */
Пример #6
0
/*
 * relay_to_mom()
 *
 * Sends a batch request to the MOM recorded for a job.
 *
 * On the early error returns the job is left as the caller passed it in.
 * Once a connection exists the job mutex is released for the duration of
 * issue_Drequest() and *pjob_ptr is refreshed with svr_find_job(), so it
 * may be NULL on return.
 *
 * @param pjob_ptr - in/out pointer to the job
 * @param request  - the request to send
 * @param func     - not referenced by this implementation
 * @return the result of issue_Drequest(), PBSE_MEM_MALLOC, or
 *         PBSE_NORELYMOM when the MOM is unknown, down, unreachable or the
 *         job has no exec_host.
 */
int relay_to_mom(

  job                   **pjob_ptr,
  struct batch_request   *request, /* the request to send */
  void                  (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* without an exec_host there is no MOM to relay to, and strdup(NULL)
   * would be undefined */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    return(PBSE_NORELYMOM);

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  node = tfind_addr(addr, port, job_momname);
  free(job_momname);

  if (node == NULL)
    return(PBSE_NORELYMOM);

  if (node->nd_state & INUSE_DOWN)
    {
    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
    snprintf(log_buf, sizeof(log_buf), "momaddr=%s",tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  /* the node is released before connecting */
  unlock_node(node, __func__, "after svr_connect", LOGLEVEL);
  handle = svr_connect(
           pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
           pjob->ji_qs.ji_un.ji_exect.ji_momport,
           &local_errno,
           NULL,
           NULL,
           ToServerDIS);

  if (handle < 0)
    {
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  /* remember the id so the job can be looked up again after the request */
  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
Пример #7
0
/*
 * issue_to_svr()
 *
 * Sends a batch request to another server. If the address lookup or the
 * connect reports PBS_NET_RC_RETRY, a timed work task running
 * reissue_to_svr is queued instead.
 *
 * @param servern   - (I) server name; copied into preq->rq_host and parsed
 *                    with parse_servername()
 * @param preq      - (I) the request; marked as coming from a server and
 *                    given manager read/write plus server write permission
 * @param replyfunc - (I) stored as wt_parmfunc of the retry task; not used
 *                    on the direct path
 * @return on the direct path the issue_Drequest() result or, for a
 *         non-local connection that succeeded, the reply's brp_code;
 *         PBSE_NONE when a retry was queued; PBSE_INTERNAL otherwise.
 */
int issue_to_svr(

  char                 *servern,                  /* I */
  struct batch_request *preq,                     /* I */
  void (*replyfunc)    (struct work_task *))      /* I */

  {
  int               rc = PBSE_NONE;
  int               do_retry = 0;
  int               handle;
  int               my_err = 0;
  pbs_net_t         svraddr;
  char             *svrname;
  unsigned int      port = pbs_server_port_dis;

  struct work_task *pwt;
  time_t            time_now = time(NULL);

  snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern);

  preq->rq_fromsvr = 1;
  preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

  svrname = parse_servername(servern, &port); 
  svraddr = get_hostaddr(&my_err,svrname);

  free(svrname);

  if (svraddr == (pbs_net_t)0)
    {
    if (my_err == PBS_NET_RC_RETRY)
      {
      /* Non fatal error - retry */

      do_retry = 1;
      }
    }
  else
    {
    handle = svr_connect(svraddr, port, &my_err, NULL, NULL, ToServerDIS);

    if (handle >= 0)
      {
      if (((rc = issue_Drequest(handle, preq)) == PBSE_NONE) &&
          (handle != PBS_LOCAL_CONNECTION))
        {
        /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a reply 
         * has always been sent */
        rc = preq->rq_reply.brp_code;
        }

      return(rc);
      }
    else if (handle == PBS_NET_RC_RETRY)
      {
      do_retry = 1;
      }
    }

  /* if reached here, it didn`t go, do we retry? */

  if (do_retry)
    {
    if (preq->rq_id == NULL)
      get_batch_request_id(preq);

    pwt = set_task(WORK_Timed, (long)(time_now + PBS_NET_RETRY_TIME), reissue_to_svr, preq->rq_id, TRUE);

    /* NOTE(review): assumes set_task() returns NULL when it cannot create
     * the task - confirm. Without this guard a failed task creation would
     * be dereferenced below. */
    if (pwt != NULL)
      {
      pwt->wt_parmfunc = replyfunc;

      pthread_mutex_unlock(pwt->wt_mutex);

      return(PBSE_NONE);
      }
    }

  /* FAILURE */

  return(PBSE_INTERNAL);
  }  /* END issue_to_svr() */
Пример #8
0
/*
 * relay_to_mom()
 *
 * Forwards a batch request to the MOM recorded for a job and records the
 * outcome with update_failure_counts().
 *
 * The node and the job are both released before connecting; from that
 * point *pjob_ptr is NULL until it is refreshed with svr_find_job() after
 * the request was issued. On the earlier error returns *pjob_ptr is left
 * untouched.
 *
 * @param pjob_ptr - in/out pointer to the job
 * @param request  - the request to send
 * @param func     - not referenced by this implementation
 * @return the result of issue_Drequest(), or PBSE_BADSTATE,
 *         PBSE_MEM_MALLOC or PBSE_NORELYMOM for failures detected here.
 */
int relay_to_mom(

    job                   **pjob_ptr,
    struct batch_request   *request, /* the request to send */
    void                  (*func)(struct work_task *))

{
    job  *pjob = *pjob_ptr;
    char  log_buf[LOCAL_LOG_BUF_SIZE];

    if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "attempting to send a request to %s's mom but no exec_host list?",
                 pjob->ji_qs.ji_jobid);
        log_err(PBSE_BADSTATE, __func__, log_buf);

        return(PBSE_BADSTATE);
    }

    /* look the MOM's node up; a MOM that is not ready is not contacted */
    pbs_net_t       addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
    unsigned short  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
    char           *exec_host_copy = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if (exec_host_copy == NULL)
        return PBSE_MEM_MALLOC;

    struct pbsnode *node = tfind_addr(addr, port, exec_host_copy);

    /* the copy was only needed for the lookup */
    free(exec_host_copy);

    if (node == NULL)
        return(PBSE_NORELYMOM);

    if ((node->nd_state & INUSE_NOT_READY) ||
            (node->nd_power_state != POWER_STATE_RUNNING))
    {
        node->unlock_node(__func__, "no relay mom", LOGLEVEL);
        return(PBSE_NORELYMOM);
    }

    if (LOGLEVEL >= 7)
    {
        char *addr_text = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

        sprintf(log_buf, "momaddr=%s",addr_text);
        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

        free(addr_text);
    }

    /* keep the node's name, then let go of the node */
    std::string mom_name;

    mom_name = node->get_name();
    node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

    /* keep the job's id, then let go of the job */
    char jobid[PBS_MAXSVRJOBID + 1];

    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    int local_errno = 0;
    int handle = svr_connect(addr, port, &local_errno, NULL, NULL);

    if (handle < 0)
    {
        update_failure_counts(mom_name.c_str(), -1);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

        return(PBSE_NORELYMOM);
    }

    request->rq_orgconn = request->rq_conn; /* save client socket */

    int rc = issue_Drequest(handle, request, true);

    /* a timed out reply counts as a failure, anything else as a success */
    update_failure_counts(
        mom_name.c_str(),
        (request->rq_reply.brp_code == PBSE_TIMEOUT) ? PBSE_TIMEOUT : 0);

    *pjob_ptr = svr_find_job(jobid, TRUE);

    return(rc);
}  /* END relay_to_mom() */
Пример #9
0
/*
 * issue_to_svr()
 *
 * Sends a batch request to another server, or queues a retry task when
 * the address lookup or the connect reports PBS_NET_RC_RETRY.
 *
 * @param servern   - (I) server name; copied into the request's rq_host
 *                    and parsed with parse_servername()
 * @param preq_ptr  - (I) pointer to the request; set to NULL when the
 *                    request went over PBS_LOCAL_CONNECTION
 * @param replyfunc - (I) handed to queue_a_retry_task() on the retry path
 * @return on the direct path the issue_Drequest() result or, for a
 *         non-local connection that succeeded, the reply's brp_code;
 *         PBSE_NONE when a retry was queued; PBSE_INTERNAL otherwise.
 */
int issue_to_svr(

    const char            *servern,                  /* I */
    struct batch_request **preq_ptr,                 /* I */
    void (*replyfunc)      (struct work_task *))     /* I */

{
    batch_request  *preq = *preq_ptr;
    unsigned int    port = pbs_server_port_dis;
    int             my_err = 0;
    bool            retry_later;

    /* stamp the request as originating from this server */
    snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern);

    preq->rq_fromsvr = 1;
    preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

    /* resolve the destination */
    char      *svrname = parse_servername(servern, &port);
    pbs_net_t  svraddr = get_hostaddr(&my_err,svrname);

    free(svrname);

    if (svraddr != (pbs_net_t)0)
    {
        int handle = svr_connect(svraddr, port, &my_err, NULL, NULL);

        if (handle >= 0)
        {
            int rc = issue_Drequest(handle, preq, true);

            if (handle == PBS_LOCAL_CONNECTION)
            {
                /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a
                 * reply has always been sent */
                *preq_ptr = NULL;
            }
            else if (rc == PBSE_NONE)
            {
                rc = preq->rq_reply.brp_code;
            }

            return(rc);
        }

        retry_later = (handle == PBS_NET_RC_RETRY);
    }
    else
    {
        /* a failed lookup is only worth retrying when it is non fatal */
        retry_later = (my_err == PBS_NET_RC_RETRY);
    }

    /* the request did not go out */

    if (!retry_later)
    {
        /* FAILURE */
        return(PBSE_INTERNAL);
    }

    queue_a_retry_task(preq, replyfunc);

    return(PBSE_NONE);
}  /* END issue_to_svr() */
Пример #10
0
int set_node_power_state(
    
  struct pbsnode **ppNode,
  unsigned short   newState)

  {
  struct pbsnode *pNode = *ppNode;
  if (pNode->nd_addrs == NULL)
    {
    return PBSE_BAD_PARAMETER;
    }

  if (newState == POWER_STATE_RUNNING)
    {
    static std::string interface;
    static unsigned char mac_addr[6];
    if (interface.length() == 0)
      {
      if (!getMacAddr(interface,mac_addr))
        {
        return PBSE_SYSTEM;
        }
      }

    int sock;
    if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0)
      {
      return PBSE_SYSTEM;
      }

    unsigned char outpack[1000];

    memcpy(outpack+6,mac_addr,6);
    memcpy(outpack,pNode->nd_mac_addr,6);
    outpack[12] = 0x08;
    outpack[13] = 0x42;
    int offset = 14;
    memset(outpack + offset,0xff,6);
    offset += 6;

    for (int i = 0;i < 16;i++)
      {
      memcpy(outpack + offset,pNode->nd_mac_addr,6);
      offset += 6;
      }

    int one = 1;
    if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    struct sockaddr whereto;
    whereto.sa_family = 0;
    snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str());

    if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    close(sock);
    return PBSE_NONE;
    }

  if (pNode->nd_job_usages.size() != 0)
    {
    //Can't change the power state on a node with running jobs.
    return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING;
    }
  struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState);
  if (request == NULL)
    {
    return PBSE_SYSTEM;
    }

  request->rq_ind.rq_powerstate = newState;
  pNode->nd_power_state_change_time = time(NULL);

  snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name);
  std::string hostname(request->rq_host);
  int rc = PBSE_NONE;

  {
    int handle = 0;
    int local_errno = 0;
    handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);
    if(handle < 0)
      {
      unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);
      *ppNode = NULL;
      return local_errno;
      }
    unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
    *ppNode = NULL;
    rc = issue_Drequest(handle, request,true);
    if(rc == PBSE_NONE)
      {
      rc = request->rq_reply.brp_code;
      if(rc < 0) rc = -rc;
      }
  }
  pNode = find_nodebyname(hostname.c_str());
  *ppNode = pNode;
  if ((rc == PBSE_NONE)&&(pNode != NULL))
    {
    pNode->nd_power_state = newState;
    }

  return(rc);
  }
Пример #11
0
/*
 * stat_to_mom()
 *
 * Issues a PBS_BATCH_StatusJob request to the MOM of a job, with
 * stat_update registered as the reply handler.
 *
 * @param pjob - (I) the job whose MOM is queried
 * @param cntl - (I/O) status control block; its address is stored in the
 *               request's rq_extra and sc_conn receives the connection
 *               handle
 * @return 0 when the request was issued; PBSE_SYSTEM when no request could
 *         be allocated; PBSE_NORELYMOM when the MOM's node is deleted or
 *         down; otherwise the negative connect result or the non-zero
 *         issue_Drequest() result.
 */
int stat_to_mom(

  job              *pjob,  /* I */
  struct stat_cntl *cntl)  /* I/O */

  {

  struct batch_request *newrq;
  int          rc;

  struct work_task     *pwt = 0;

  struct pbsnode       *node;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* set up status request, save address of cntl in request for later */

  newrq->rq_extra = (void *)cntl;

  if (cntl->sc_type == 1)
    strcpy(newrq->rq_ind.rq_status.rq_id, pjob->ji_qs.ji_jobid);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */

  if (((node = tfind_addr(pjob->ji_qs.ji_un.ji_exect.ji_momaddr)) != NULL) &&
      (node->nd_state & (INUSE_DELETED | INUSE_DOWN)))
    {
    if (LOGLEVEL >= 6)
      {
      sprintf(log_buffer, "node '%s' is allocated to job but in state '%s'",
              node->nd_name,
              (node->nd_state & INUSE_DELETED) ? "deleted" : "down");

      log_event(
        PBSEVENT_SYSTEM,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    /* the request will never be sent; release it instead of leaking it */
    free_br(newrq);

    return(PBSE_NORELYMOM);
    }

  /* get connection to MOM */

  cntl->sc_conn = svr_connect(
                    pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                    pbs_mom_port,
                    process_Dreply,
                    ToServerDIS);

  /* a negative handle is itself used as the failure code */
  if ((rc = cntl->sc_conn) >= 0)
    rc = issue_Drequest(cntl->sc_conn, newrq, stat_update, &pwt);

  if (rc != 0)
    {
    /* request failed */

    if (pwt)
      delete_task(pwt);

    free_br(newrq);

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);
    }  /* END if (rc != 0) */

  return(rc);
  }  /* END stat_to_mom() */
Пример #12
0
/*
 * check_if_orphaned()
 *
 * Thread-style entry point. If the given ALPS reservation is reported as
 * orphaned, the node is marked busy when it is neither busy nor down, and
 * a login node's MOM is asked to delete the reservation.
 *
 * @param vp - heap-allocated string of the form "<node name>:<rsv id>".
 *             It is split in place and freed before returning. Input
 *             without a ':' is freed and otherwise ignored.
 * @return always NULL
 */
void *check_if_orphaned(

  void *vp)

  {
  char           *node_name = (char *)vp;
  char           *rsv_id = NULL;
  std::string     job_id;
  batch_request  *preq;
  int             handle = -1;
  int             retries = 0;
  struct pbsnode *pnode;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* rsv_id points into node_name's buffer, so it is only valid until
   * node_name is freed */
  if ((rsv_id = strchr(node_name, ':')) != NULL)
    {
    *rsv_id = '\0';
    rsv_id++;
    }
  else
    {
    free(node_name);
    return(NULL);
    }

  if (alps_reservations.is_orphaned(rsv_id, job_id) == true)
    {
    // Make sure the node with the orphan is not available for jobs
    if ((pnode = find_nodebyname(node_name)) != NULL)
      {
      if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node %s has an orphan but wasn't marked as busy. Marking as busy now.",
          node_name);
        log_err(-1, __func__, log_buf);

        update_node_state(pnode, INUSE_BUSY);
        }

      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      }

    preq = alloc_br(PBS_BATCH_DeleteReservation);

    if (preq != NULL)
      preq->rq_extend = strdup(rsv_id);

    if ((preq == NULL) ||
        (preq->rq_extend == NULL))
      {
      /* out of memory: nothing can be sent */
      if (preq != NULL)
        free_br(preq);

      /* rsv_id must be used before node_name is freed */
      alps_reservations.remove_from_orphaned_list(rsv_id);
      free(node_name);
      return(NULL);
      }

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno = 0;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id.c_str(),
        pnode->get_name());
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      /* try to connect up to three times */
      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL);
        retries++;
        }

      /* unlock before the network transaction */
      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
      }

    /* release the request whether or not a login node was available */
    free_br(preq);

    alps_reservations.remove_from_orphaned_list(rsv_id);
    }

  free(node_name);

  return(NULL);
  } /* END check_if_orphaned() */
Пример #13
0
/*
 * issue_to_svr()
 *
 * Sends a batch request to another server, or schedules a timed
 * reissue_to_svr task when the address lookup or the connect reports
 * PBS_NET_RC_RETRY.
 *
 * @param servern   - (I) server name; copied into preq->rq_host and parsed
 *                    with parse_servername()
 * @param preq      - (I) the request; marked as coming from a server and
 *                    given manager read/write plus server write permission
 * @param replyfunc - (I) reply handler passed to issue_Drequest(), or
 *                    stored as wt_parmfunc of the retry task
 * @return the issue_Drequest() result on the direct path, 0 when a retry
 *         was scheduled, -1 on failure.
 */
int issue_to_svr(

  char                 *servern,                  /* I */
  struct batch_request *preq,                     /* I */
  void (*replyfunc)    (struct work_task *))      /* I */

  {
  int   do_retry = 0;
  int   handle;
  pbs_net_t svraddr;
  char  *svrname;
  unsigned int  port = pbs_server_port_dis;

  struct work_task *pwt;

  /* bounded copy: servern is caller supplied and rq_host has a fixed size */
  snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern);

  preq->rq_fromsvr = 1;
  preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

  svrname = parse_servername(servern, &port); 
  svraddr = get_hostaddr(svrname);

  if (svraddr == (pbs_net_t)0)
    {
    if (pbs_errno == PBS_NET_RC_RETRY)
      {
      /* Non fatal error - retry */

      do_retry = 1;
      }
    }
  else
    {
    handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS);

    if (handle >= 0)
      {
      return(issue_Drequest(handle, preq, replyfunc, NULL));
      }
    else if (handle == PBS_NET_RC_RETRY)
      {
      do_retry = 1;
      }
    }

  /* if reached here, it didn`t go, do we retry? */

  if (do_retry)
    {
    pwt = set_task(
            WORK_Timed,
            (long)(time_now + PBS_NET_RETRY_TIME),
            reissue_to_svr,
            (void *)preq);

    /* NOTE(review): assumes set_task() returns NULL when it cannot create
     * the task - confirm. Without this guard a failed task creation would
     * be dereferenced below. */
    if (pwt != NULL)
      {
      pwt->wt_parmfunc = replyfunc;

      return(0);
      }
    }

  /* FAILURE */

  return(-1);
  }  /* END issue_to_svr() */
Пример #14
0
/*
 * relay_to_mom()
 *
 * Forwards a batch request to the MOM recorded for a job.
 *
 * @param pjob    - job whose MOM address and port are used
 * @param request - the request to send; its rq_orgconn is set to rq_conn
 *                  before it is issued
 * @param func    - reply handler passed on to issue_Drequest()
 * @return the result of issue_Drequest(), or PBSE_NORELYMOM when the MOM's
 *         node is deleted or down or no connection could be made.
 */
int relay_to_mom(

  job *pjob,
  struct batch_request  *request, /* the request to send */
  void (*func)(struct work_task *))

  {
  char *id = "relay_to_mom";

  int              mom_conn; /* a client style connection handle */
  pbs_net_t        addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  struct pbsnode  *mom_node;

  /* a MOM whose node is known to be deleted or down is not contacted */

  mom_node = tfind_addr(addr,pjob->ji_qs.ji_un.ji_exect.ji_momport,pjob);

  if (mom_node != NULL)
    {
    if (mom_node->nd_state & (INUSE_DELETED|INUSE_DOWN))
      return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer, "momaddr=%s",
            netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr));

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, id, log_buffer);
    }

  mom_conn = svr_connect(
               pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
               pjob->ji_qs.ji_un.ji_exect.ji_momport,
               process_Dreply,
               ToServerDIS);

  if (mom_conn < 0)
    {
    LOG_EVENT(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, "", msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  /* remember the client socket before handing the request on */

  request->rq_orgconn = request->rq_conn;

  return(issue_Drequest(mom_conn, request, func, NULL));
  }  /* END relay_to_mom() */