Ejemplo n.º 1
0
END_TEST

/*
 * Unit test for send_job_over_network().
 *
 * Calls the function three times with the literal 5 as its second argument
 * and checks the return code of each call:
 *   1. job id "2.napali"                      -> LOCUTION_FAIL
 *   2. job id "1.napali"                      -> PBSE_NONE, and attempt_to_queue
 *                                                must be false afterwards
 *   3. job id "1.napali", rdycommit_fail=true -> LOCUTION_RETRY
 *
 * rdycommit_fail is not declared in this block; presumably it is a flag
 * consumed by a test stub -- TODO confirm.
 *
 * NOTE(review): none of the strdup()'d strings are freed here. Whether
 * send_job_over_network() takes ownership of them is not visible from this
 * test -- confirm before adding free() calls.
 */
START_TEST(send_job_over_network_test)
  {
  bool timeout = false;
  char *jobid = strdup("1.napali");
  char *destin = strdup("bob");
  bool attempt_to_queue = true;
  bool c = true;
  tlist_head h;
  /* passed by address to every call below; never read directly by this test */
  int my_err;
  int mom_err;

  CLEAR_HEAD(h);

  /* case 1: a different job id ("2.napali") is expected to fail outright */
  fail_unless(send_job_over_network(strdup("2.napali"), 5, destin, h, attempt_to_queue, c, timeout, "script", true, false, 10, strdup("/out"), strdup("/err"), strdup("/chkpt"), MOVE_TYPE_Exec, &my_err,&mom_err) == LOCUTION_FAIL);

  /* the attribute list head is re-cleared before each subsequent call */
  CLEAR_HEAD(h);

  /* case 2: job id "1.napali" is expected to succeed ... */
  fail_unless(send_job_over_network(jobid, 5, destin, h, attempt_to_queue, c, timeout, "script", true, false, 10, strdup("/out"), strdup("/err"), strdup("/chkpt"), MOVE_TYPE_Exec, &my_err,&mom_err) == PBSE_NONE);
  /* ... and attempt_to_queue (initialised to true above) must now be false,
   * so the callee presumably receives it by reference */
  fail_unless(attempt_to_queue == false);

  /* debug output: prints the address and contents of destin to stderr */
  fprintf(stderr,"%p %s\n",(void *)destin,destin);
  CLEAR_HEAD(h);

  /* case 3: with rdycommit_fail set, the same call is expected to ask for a retry */
  rdycommit_fail = true;
  fail_unless(send_job_over_network(jobid, 5, destin, h, attempt_to_queue, c, timeout, "script", true, false, 10, strdup("/out"), strdup("/err"), strdup("/chkpt"), MOVE_TYPE_Exec, &my_err,&mom_err) == LOCUTION_RETRY);
  /* restore the flag so the failure is not left switched on after this test */
  rdycommit_fail = false;

  }
Ejemplo n.º 2
0
/*
 * send_job_over_network_with_retries()
 *
 * Connects to the destination with svr_connect(job_momaddr, job_momport, ...)
 * and hands the job to send_job_over_network(), making at most RETRY attempts.
 *
 * Before every attempt after the first, the function:
 *   - disconnects the connection left over from the previous attempt,
 *   - stops retrying if should_retry_route() returns -1 for *my_err or
 *     *mom_err,
 *   - sleeps (1 << NumRetries) seconds (2, 4, 8, ...).
 *
 * @param job_id        id of the job being sent (used in log messages and
 *                      forwarded to send_job_over_network())
 * @param job_destin    destination name; only used here for logging and
 *                      forwarded as-is. May be NULL or empty, in which case
 *                      "unknown host" is logged.
 * @param job_momaddr   address passed to svr_connect()
 * @param job_momport   port passed to svr_connect()
 * @param my_err        out: reset to 0 before each connect attempt, then
 *                      written by svr_connect() / send_job_over_network()
 * @param mom_err       out: forwarded to send_job_over_network(); read (but
 *                      never reset) by the retry check
 * All remaining parameters are forwarded unchanged to send_job_over_network().
 *
 * @return PBSE_SYSTEM    if svr_connect() returned PBS_LOCAL_CONNECTION
 *         LOCUTION_FAIL  if svr_connect() returned PBS_NET_RC_FATAL
 *         otherwise the result of the last send_job_over_network() call, or
 *         LOCUTION_RETRY if that function was never reached.
 */
int send_job_over_network_with_retries(

  char           *job_id,
  char           *job_destin,
  tlist_head     &attrl,
  bool           &attempt_to_queue_job,
  bool           &change_substate_on_attempt_to_queue,
  bool           &timeout,
  const char     *script_name,
  bool            need_to_send_job_script,
  bool            job_has_run,
  unsigned long   job_momaddr,
  unsigned short  job_momport,
  char           *stdout_path,
  char           *stderr_path,
  char           *chkpt_path,
  int             type,
  int            *my_err,
  int            *mom_err)

  {
  int  con = PBS_NET_RC_UNSET;
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  rc = LOCUTION_RETRY;

  for (int NumRetries = 0; NumRetries < RETRY; NumRetries++)
    {
    /* connect to receiving server with retries */
    if (NumRetries > 0)
      {
      /* recycle after an error */
      if (con >= 0)
        {
        svr_disconnect(con);
        con = PBS_NET_RC_UNSET;
        }

      /* check the errors from the previous attempt
       * NOTE(review): *mom_err is never reset by this function; if the
       * previous attempt did not reach send_job_over_network(), this reads
       * whatever the caller passed in - confirm callers initialize it */
      if ((should_retry_route(*my_err) == -1) ||
          (should_retry_route(*mom_err) == -1))
        {
        /* bounded write: job_id is caller-supplied and log_buf is fixed-size */
        snprintf(log_buf, sizeof(log_buf),
          "child failed in previous commit request for job %s", job_id);

        log_err(*my_err, __func__, log_buf);
        break;
        }

      /* exponential back-off between attempts */
      sleep(1 << NumRetries);
      }

    /* make sure this is zero at the point that we're retrying */
    *my_err = 0;

    if ((con = svr_connect(job_momaddr, job_momport, my_err, NULL, NULL)) == PBS_NET_RC_FATAL)
      {
      /* bounded write, and tolerate a NULL destination name */
      snprintf(log_buf, sizeof(log_buf), "send_job failed to host %s, %lx port %d",
        ((job_destin != NULL) && (job_destin[0] != '\0')) ? job_destin : "unknown host",
        job_momaddr,
        job_momport);

      log_err(*my_err, __func__, log_buf);
      rc = LOCUTION_FAIL;

      break;
      }

    if (con == PBS_NET_RC_RETRY)
      {
      *my_err = 0; /* should retry */

      continue;
      }

    if (con == PBS_LOCAL_CONNECTION)
      {
      /* returns directly, bypassing the svr_disconnect() after the loop */
      log_err(-1, __func__, "attempting to run the job on pbs_server???");
      return(PBSE_SYSTEM);
      }

    rc = send_job_over_network(job_id,
                               con,
                               job_destin,
                               attrl,
                               attempt_to_queue_job,
                               change_substate_on_attempt_to_queue,
                               timeout,
                               script_name,
                               need_to_send_job_script,
                               job_has_run,
                               job_momaddr,
                               stdout_path,
                               stderr_path,
                               chkpt_path,
                               type,
                               my_err,
                               mom_err);

    if (rc == LOCUTION_SUCCESS)
      break;
    }  /* END for (NumRetries) */

  /* release the connection from the final attempt, if one is still open */
  if (con >= 0)
    svr_disconnect(con);

  return(rc);
  } /* END send_job_over_network_with_retries() */