示例#1
0
文件: test_uut.c 项目: msbritt/torque
END_TEST
#endif


START_TEST(test_update_failure_counts)
  {
  const char *name = "lihue";
  struct pbsnode *pnode = find_nodebyname(name);
  update_failure_counts(name, -1);
  update_failure_counts(name, -1);

  // Make sure the two failures are correctly counted
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One success shouldn't reset the failure counts
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 1);
  fail_unless(pnode->nd_state == INUSE_FREE);
  
  // Two should
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // One failure should reset the success count
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 1);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);

  // State shouldn't change until there are 3 proximal failures
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_proximal_failures == 2);
  fail_unless(pnode->nd_consecutive_successes == 0);
  fail_unless(pnode->nd_state == INUSE_FREE);
  
  update_failure_counts(name, 1);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);

  // State shouldn't reset until there are 2 consecutive successes
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state != INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 3);
  fail_unless(pnode->nd_consecutive_successes == 1);
  
  update_failure_counts(name, 0);
  fail_unless(pnode->nd_state == INUSE_FREE);
  fail_unless(pnode->nd_proximal_failures == 0);
  fail_unless(pnode->nd_consecutive_successes == 2);
  }
示例#2
0
int send_job_work(

  char           *job_id,
  const char     *node_name, /* I */
  int             type,      /* I */
  int            *my_err,    /* O */
  batch_request  *preq)      /* M */

  {
  int                   rc = LOCUTION_FAIL;
  int                   ret = PBSE_NONE;
  int                   local_errno = 0;
  tlist_head            attrl;

  int                   encode_type;
  int                   mom_err = PBSE_NONE;
  int                   resc_access_perm;
  std::string           script_name;
  char                 *pc;
  char                  stdout_path[MAXPATHLEN + 1];
  char                  stderr_path[MAXPATHLEN + 1];
  char                  chkpt_path[MAXPATHLEN + 1];
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  start_time = time(NULL);
  bool                  attempt_to_queue_job = false;
  bool                  change_substate_on_attempt_to_queue = false;
  bool                  need_to_send_job_script = false;
  bool                  job_has_run = false;
  job                  *pjob = NULL;
  char                  job_destin[PBS_MAXROUTEDEST+1];

  bool                  Timeout = false;
  
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;

  if ((pjob = svr_find_job(job_id, TRUE)) == NULL)
    {
    *my_err = PBSE_JOBNOTFOUND;
    req_reject(-1, 0, preq, NULL, NULL);
    return(PBSE_JOBNOTFOUND);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (strlen(pjob->ji_qs.ji_destin) != 0)
    strcpy(job_destin, pjob->ji_qs.ji_destin);
  else
    job_destin[0] = '\0';

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
    need_to_send_job_script = TRUE;

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN)
    job_has_run = TRUE;

  if ((job_destin[0] != '\0') && 
      (type != MOVE_TYPE_Exec))
    {
    if ((pc = strchr(job_destin, '@')) != NULL)
      {
      job_momaddr = get_hostaddr(&local_errno, pc + 1);
      job_momport = pbs_server_port_dis;
      }
    }

  /* encode job attributes to be moved */
  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */
  if (type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    }
  else
    {
    /* moving job to alternate server? */
    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */
    ret = svr_dequejob(pjob, FALSE);
    if (ret)
      {
      job_mutex.set_unlock_on_exit(false);
      return(ret);
      }
    }

  encode_attributes(attrl, pjob, resc_access_perm, encode_type);

  rc = get_job_script_path(pjob, script_name);

  if (rc != PBSE_NONE)
    {
    if (rc == PBSE_JOB_RECYCLED)
      job_mutex.set_unlock_on_exit(false);
  
    free_server_attrs(&attrl);

    return(rc);
    }
  
  if (job_has_run)
    {
    if ((get_job_file_path(pjob, StdOut, stdout_path, sizeof(stdout_path)) != 0) ||
        (get_job_file_path(pjob, StdErr, stderr_path, sizeof(stderr_path)) != 0) ||
        (get_job_file_path(pjob, Checkpoint, chkpt_path, sizeof(chkpt_path)) != 0))
      {
      job_mutex.unlock();
      goto send_job_work_end;
      }
    }

  /* if the job is substate JOB_SUBSTATE_TRNOUTCM it means we are 
   * recovering after being down or a late failure so we just want 
   * to send the "ready-to-commit/commit" */
  if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
    {
    attempt_to_queue_job = true;

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
      change_substate_on_attempt_to_queue = true;
    }
  
  job_mutex.unlock();
  
  rc = send_job_over_network_with_retries(job_id,
                                          job_destin,
                                          attrl,
                                          attempt_to_queue_job,
                                          change_substate_on_attempt_to_queue,
                                          Timeout,
                                          script_name.c_str(),
                                          need_to_send_job_script,
                                          job_has_run,
                                          job_momaddr,
                                          job_momport,
                                          stdout_path,
                                          stderr_path,
                                          chkpt_path,
                                          type,
                                          my_err,
                                          &mom_err);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down *
          and abort the job - see post_sendmom() */
    sprintf(log_buf, "child timed-out attempting to start job %s", job_id);
    log_ext(*my_err, __func__, log_buf, LOG_WARNING);
    rc = LOCUTION_REQUEUE;
    }
  else if (rc != LOCUTION_SUCCESS)
    {
    if (should_retry_route(*my_err) == -1)
      {
      sprintf(log_buf, "child failed and will not retry job %s", job_id);
      log_err(*my_err, __func__, log_buf);
      rc = LOCUTION_FAIL;
      }
    else
      rc = LOCUTION_REQUEUE;
    }
  
  if (type == MOVE_TYPE_Exec)
    {
    if (node_name != NULL)
      update_failure_counts(node_name, rc);
    else
      update_failure_counts(job_destin, rc);
    }

send_job_work_end:
  finish_move_process(job_id, preq, start_time, node_name, rc, type, mom_err);
  free_server_attrs(&attrl);

  return(rc);
  } /* END send_job_work() */
示例#3
0
/*
 * relay_to_mom()
 *
 * Sends a batch request to the MOM recorded in the job's execution data
 * (ji_momaddr / ji_momport) and records the outcome against that node with
 * update_failure_counts().
 *
 * @param pjob_ptr - in/out pointer to the job. The job mutex is released with
 *                   unlock_ji_mutex() before connecting and *pjob_ptr is set
 *                   to NULL. After the request is issued it is replaced by the
 *                   result of svr_find_job(), which may be NULL. If the
 *                   connection attempt fails, *pjob_ptr is left NULL. On the
 *                   earlier error returns it is left untouched.
 * @param request  - the request to send; rq_orgconn is overwritten with
 *                   rq_conn before the request is issued
 * @param func     - not referenced anywhere in this function body
 *
 * @return PBSE_BADSTATE if the job has no exec_host string, PBSE_MEM_MALLOC
 *         if the exec_host copy cannot be allocated, PBSE_NORELYMOM if the
 *         node is not found, not ready, not powered on, or cannot be
 *         connected to; otherwise the value returned by issue_Drequest().
 */
int relay_to_mom(

    job                   **pjob_ptr,
    struct batch_request   *request, /* the request to send */
    void                  (*func)(struct work_task *))

{
    int             handle; /* a client style connection handle */
    int             rc;
    int             local_errno = 0;
    pbs_net_t       addr;
    unsigned short  port;
    job            *pjob = *pjob_ptr;
    char            jobid[PBS_MAXSVRJOBID + 1];
    char           *job_momname = NULL;

    struct pbsnode *node;
    char            log_buf[LOCAL_LOG_BUF_SIZE];
    std::string     node_name;

    if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "attempting to send a request to %s's mom but no exec_host list?",
                 pjob->ji_qs.ji_jobid);
        log_err(PBSE_BADSTATE, __func__, log_buf);

        return(PBSE_BADSTATE);
    }

    /* if MOM is down don't try to connect */
    addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
    port = pjob->ji_qs.ji_un.ji_exect.ji_momport;

    /* tfind_addr() is handed a private copy of the exec_host string, which
     * is released as soon as the lookup is done */
    job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if (job_momname == NULL)
        return PBSE_MEM_MALLOC;

    node = tfind_addr(addr, port, job_momname);
    free(job_momname);
    job_momname = NULL;

    if (node == NULL)
        return(PBSE_NORELYMOM);

    /* node is known to be non-NULL from here on */
    if ((node->nd_state & INUSE_NOT_READY) ||
            (node->nd_power_state != POWER_STATE_RUNNING))
    {
        node->unlock_node(__func__, "no relay mom", LOGLEVEL);
        return(PBSE_NORELYMOM);
    }

    if (LOGLEVEL >= 7)
    {
        char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

        /* bounded write; guard against a NULL result so %s never sees one */
        snprintf(log_buf, sizeof(log_buf), "momaddr=%s",
                 (tmp != NULL) ? tmp : "(null)");

        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

        free(tmp);
    }

    /* keep the name: the node is unlocked before the counters are updated */
    node_name = node->get_name();

    node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

    /* remember the id so the job can be looked up again after the request,
     * then release the job before doing network I/O */
    snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    handle = svr_connect(addr, port, &local_errno, NULL, NULL);

    if (handle < 0)
    {
        /* NOTE(review): *pjob_ptr stays NULL on this path; the job is not
         * looked up again - callers must cope with that */
        update_failure_counts(node_name.c_str(), -1);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

        return(PBSE_NORELYMOM);
    }

    request->rq_orgconn = request->rq_conn; /* save client socket */

    rc = issue_Drequest(handle, request, true);

    /* only a timeout reply is charged to the node as a failure; every other
     * reply code is recorded as a success */
    if (request->rq_reply.brp_code == PBSE_TIMEOUT)
        update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
    else
        update_failure_counts(node_name.c_str(), 0);

    *pjob_ptr = svr_find_job(jobid, TRUE);

    return(rc);
}  /* END relay_to_mom() */