예제 #1
0
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int                   handle = -1;
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;
  char                 *job_momname = NULL;
  job                  *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return(PBSE_JOBNOTFOUND);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || 
      (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
    {
    job_mutex.unlock();
    snprintf(log_buf, sizeof(log_buf),
      "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return PBSE_BAD_PARAMETER;
    }

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  job_mutex.unlock();

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return(rc);
  }  /* END stat_to_mom() */
예제 #2
0
/*
 * stat_to_mom - send a status request for a job to the MOM recorded in it.
 *
 * Looks the job up by id, copies the MOM address, port and exec_host string
 * out of the job, unlocks the job, and then issues a PBS_BATCH_StatusJob
 * request to that MOM.  When the request succeeds the reply is handed to
 * stat_update() together with cntl.
 *
 * @param job_id - id used to find the job; also copied into the request
 *                 when cntl->sc_type == 1 (otherwise the request id is
 *                 left empty, i.e. "stat all")
 * @param cntl   - status control block, passed through to stat_update()
 *
 * @return PBSE_NONE on success, otherwise
 *   PBSE_JOBNOTFOUND - svr_find_job() did not find the job
 *   PBSE_UNKNODE     - the job has no exec_host string, or tfind_addr()
 *                      found no node for the MOM
 *   PBSE_MEM_MALLOC  - strdup()/alloc_br() failed, or issue_Drequest()
 *                      returned PBSE_SYSTEM
 *   PBSE_NORELYMOM   - the node is marked INUSE_DOWN
 *   PBSE_CONNECT     - svr_connect() failed
 *   or whatever other code issue_Drequest() returned.
 *
 * The batch request allocated here is released on every path that returns
 * after the allocation.
 */
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int handle = -1;
  unsigned long job_momaddr = -1;
  unsigned short job_momport = -1;
  char *job_momname = NULL;
  job *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return PBSE_JOBNOTFOUND;

  /* without an exec_host string there is no MOM to identify, and passing
   * NULL to strdup() is undefined behavior */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
    unlock_ji_mutex(pjob, __func__, "no exec_host", LOGLEVEL);
    return PBSE_UNKNODE;
    }

  /* take private copies of everything needed from the job, then let it go */
  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    {
    /* bounded copy: job_id comes from the caller and may be too long */
    snprintf(newrq->rq_ind.rq_status.rq_id,
      sizeof(newrq->rq_ind.rq_status.rq_id),
      "%s",
      job_id);
    }
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  node = tfind_addr(job_momaddr, job_momport, job_momname);
  free(job_momname);

  if (node == NULL)
    {
    /* the request was already allocated; do not leak it */
    free_br(newrq);

    return PBSE_UNKNODE;
    }

  if (node->nd_state & INUSE_DOWN)
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, sizeof(log_buf),
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM; the node is released before connecting */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL, ToServerDIS);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  /* callers see PBSE_SYSTEM from the request as an allocation failure */
  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return rc;
  }  /* END stat_to_mom() */
예제 #3
0
/*
 * relay_to_mom - forward a batch request to the MOM recorded in a job.
 *
 * Finds the node for the job's MOM address/port/exec_host, refuses to relay
 * when that node is unknown or marked INUSE_DOWN, connects to the MOM and
 * issues the request on that connection.
 *
 * @param pjob_ptr - in: the job whose MOM should receive the request.
 *                   out: once the request has been issued the job is
 *                   unlocked here and *pjob_ptr is replaced with the result
 *                   of svr_find_job() on the saved job id (which may be
 *                   NULL).  On every earlier return *pjob_ptr is untouched
 *                   and this function has not unlocked the job.
 * @param request  - the request to send; rq_orgconn is set to rq_conn
 *                   before it is issued
 * @param func     - not used by this implementation
 *
 * @return the result of issue_Drequest(), or
 *   PBSE_NORELYMOM  - no exec_host string, unknown node, node down, or
 *                     svr_connect() failed
 *   PBSE_MEM_MALLOC - strdup() of the exec_host string failed
 */
int relay_to_mom(

  job                   **pjob_ptr,
  struct batch_request   *request, /* the request to send */
  void                  (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* a job without an exec_host string has no MOM to relay to, and passing
   * NULL to strdup() is undefined behavior */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    return(PBSE_NORELYMOM);

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  node = tfind_addr(addr, port, job_momname);
  free(job_momname);

  if (node == NULL)
    return(PBSE_NORELYMOM);

  if (node->nd_state & INUSE_DOWN)
    {
    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    /* the string returned by netaddr_pbs_net_t() is freed here after use */
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
    snprintf(log_buf, sizeof(log_buf), "momaddr=%s",tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  /* the node is released before the connection attempt */
  unlock_node(node, __func__, "after svr_connect", LOGLEVEL);
  handle = svr_connect(
           pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
           pjob->ji_qs.ji_un.ji_exect.ji_momport,
           &local_errno,
           NULL,
           NULL,
           ToServerDIS);

  if (handle < 0)
    {
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  /* remember the id so the job can be looked up again after the request */
  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
예제 #4
0
/*
 * relay_to_mom - forward a batch request to the MOM recorded in a job.
 *
 * Finds the node for the job's MOM address/port/exec_host, refuses to relay
 * when that node is unknown, not ready, or not in the running power state,
 * then unlocks the job, connects to the MOM and issues the request.  The
 * outcome of the connect/request is reported through
 * update_failure_counts() under the node's name.
 *
 * @param pjob_ptr - in: the job whose MOM should receive the request.
 *                   out: untouched on the early error returns (the job is
 *                   not unlocked by this function on those paths); set to
 *                   NULL once the job has been unlocked, and left NULL if
 *                   svr_connect() fails; otherwise replaced with the result
 *                   of svr_find_job() on the saved job id after the request
 *                   has been issued.
 * @param request  - the request to send; rq_orgconn is set to rq_conn
 *                   before it is issued
 * @param func     - not used by this implementation
 *
 * @return the result of issue_Drequest(), or
 *   PBSE_BADSTATE   - the job has no exec_host string
 *   PBSE_MEM_MALLOC - strdup() of the exec_host string failed
 *   PBSE_NORELYMOM  - unknown node, node unavailable, or svr_connect() failed
 */
int relay_to_mom(

    job                   **pjob_ptr,
    struct batch_request   *request, /* the request to send */
    void                  (*func)(struct work_task *))

{
    int             handle; /* a client style connection handle */
    int             rc;
    int             local_errno = 0;
    pbs_net_t       addr;
    unsigned short  port;
    job            *pjob = *pjob_ptr;
    char            jobid[PBS_MAXSVRJOBID + 1];
    char           *job_momname = NULL;

    struct pbsnode *node;
    char            log_buf[LOCAL_LOG_BUF_SIZE];
    std::string     node_name;

    if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "attempting to send a request to %s's mom but no exec_host list?",
                 pjob->ji_qs.ji_jobid);
        log_err(PBSE_BADSTATE, __func__, log_buf);

        return(PBSE_BADSTATE);
    }

    /* if MOM is down don't try to connect */
    addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
    port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
    job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if (job_momname == NULL)
        return PBSE_MEM_MALLOC;

    node = tfind_addr(addr, port, job_momname);
    free(job_momname);

    if (node == NULL)
        return(PBSE_NORELYMOM);

    if ((node->nd_state & INUSE_NOT_READY) ||
            (node->nd_power_state != POWER_STATE_RUNNING))
    {
        node->unlock_node(__func__, "no relay mom", LOGLEVEL);
        return(PBSE_NORELYMOM);
    }

    if (LOGLEVEL >= 7)
    {
        /* the string returned by netaddr_pbs_net_t() is freed here after use */
        char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
        snprintf(log_buf, sizeof(log_buf), "momaddr=%s",tmp);

        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

        free(tmp);
    }

    /* keep a private copy of the name: the node is released before the
     * connection attempt but the name is still needed for failure counts */
    node_name = node->get_name();

    node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

    /* remember the id so the job can be looked up again after the request,
     * then release the job before talking to the MOM */
    snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    handle = svr_connect(addr, port, &local_errno, NULL, NULL);

    if (handle < 0)
    {
        update_failure_counts(node_name.c_str(), -1);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

        return(PBSE_NORELYMOM);
    }

    request->rq_orgconn = request->rq_conn; /* save client socket */

    rc = issue_Drequest(handle, request, true);

    /* report a timeout reply as a failure for this node, anything else as 0 */
    if (request->rq_reply.brp_code == PBSE_TIMEOUT)
        update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
    else
        update_failure_counts(node_name.c_str(), 0);

    *pjob_ptr = svr_find_job(jobid, TRUE);

    return(rc);
}  /* END relay_to_mom() */
예제 #5
0
/*
 * stat_to_mom - send a status request for a job to the MOM recorded in it.
 *
 * Allocates a PBS_BATCH_StatusJob request, stores cntl in its rq_extra
 * field, and issues it on a new connection to the job's MOM with
 * stat_update as the reply handler.  If the MOM's node is marked deleted
 * or down no connection is attempted.
 *
 * @param pjob - (I) the job whose MOM is queried; its id is copied into the
 *               request when cntl->sc_type == 1 (otherwise the request id
 *               is left empty, i.e. "stat all")
 * @param cntl - (I/O) status control block; sc_conn receives the value
 *               returned by svr_connect()
 *
 * @return 0 when the request was issued, otherwise
 *   PBSE_SYSTEM    - alloc_br() failed
 *   PBSE_NORELYMOM - the MOM's node is marked INUSE_DELETED or INUSE_DOWN
 *   the negative value returned by svr_connect(), or the non-zero result
 *   of issue_Drequest().
 *
 * On every failure after the allocation the request is freed here.
 */
int stat_to_mom(

  job              *pjob,  /* I */
  struct stat_cntl *cntl)  /* I/O */

  {

  struct batch_request *newrq;
  int          rc;

  struct work_task     *pwt = NULL;

  struct pbsnode       *node;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* set up status request, save address of cntl in request for later */

  newrq->rq_extra = (void *)cntl;

  if (cntl->sc_type == 1)
    {
    /* bounded copy into the fixed-size request id field */
    snprintf(newrq->rq_ind.rq_status.rq_id,
      sizeof(newrq->rq_ind.rq_status.rq_id),
      "%s",
      pjob->ji_qs.ji_jobid);
    }
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */

  if (((node = tfind_addr(pjob->ji_qs.ji_un.ji_exect.ji_momaddr)) != NULL) &&
      (node->nd_state & (INUSE_DELETED | INUSE_DOWN)))
    {
    if (LOGLEVEL >= 6)
      {
      sprintf(log_buffer, "node '%s' is allocated to job but in state '%s'",
              node->nd_name,
              (node->nd_state & INUSE_DELETED) ? "deleted" : "down");

      log_event(
        PBSEVENT_SYSTEM,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    /* the request was never issued; release it instead of leaking it */

    free_br(newrq);

    return(PBSE_NORELYMOM);
    }

  /* get connection to MOM */

  cntl->sc_conn = svr_connect(
                    pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                    pbs_mom_port,
                    process_Dreply,
                    ToServerDIS);

  /* a negative connection handle doubles as the error return value */

  if ((rc = cntl->sc_conn) >= 0)
    rc = issue_Drequest(cntl->sc_conn, newrq, stat_update, &pwt);

  if (rc != 0)
    {
    /* request failed */

    if (pwt)
      delete_task(pwt);

    free_br(newrq);

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);
    }  /* END if (rc != 0) */

  return(rc);
  }  /* END stat_to_mom() */
예제 #6
0
/*
 * relay_to_mom - forward a batch request to the MOM recorded in a job.
 *
 * Looks up the node for the job's MOM address/port; if a node is found and
 * it is marked deleted or down, nothing is sent.  Otherwise a connection to
 * the MOM is opened and the request is issued on it with func as the
 * reply handler.
 *
 * @param pjob    - the job whose MOM should receive the request
 * @param request - the request to send; rq_orgconn is set to rq_conn
 *                  before it is issued
 * @param func    - handler passed on to issue_Drequest()
 *
 * @return the result of issue_Drequest(), or PBSE_NORELYMOM when the node
 *         is deleted/down or svr_connect() fails.
 */
int relay_to_mom(

  job *pjob,
  struct batch_request  *request, /* the request to send */
  void (*func)(struct work_task *))

  {
  char *myname = "relay_to_mom";

  int             mom_handle; /* a client style connection handle */
  pbs_net_t       mom_addr;
  struct pbsnode *mom_node;

  /* if MOM is down don't try to connect */

  mom_addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  mom_node = tfind_addr(mom_addr, pjob->ji_qs.ji_un.ji_exect.ji_momport, pjob);

  if (mom_node != NULL)
    {
    if ((mom_node->nd_state & (INUSE_DELETED | INUSE_DOWN)) != 0)
      return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer, "momaddr=%s",
            netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr));

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, myname, log_buffer);
    }

  /* open the connection using the address and port stored in the job */

  mom_handle = svr_connect(
                 pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                 pjob->ji_qs.ji_un.ji_exect.ji_momport,
                 process_Dreply,
                 ToServerDIS);

  if (mom_handle < 0)
    {
    LOG_EVENT(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, "", msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  /* save client socket */

  request->rq_orgconn = request->rq_conn;

  return(issue_Drequest(mom_handle, request, func, NULL));
  }  /* END relay_to_mom() */