END_TEST

/*
 * Exercises send_job_over_network() three times against the test scaffolding:
 *   1. job id "2.napali"              -> expects LOCUTION_FAIL
 *   2. job id "1.napali"              -> expects PBSE_NONE and the
 *                                        attempt-to-queue flag cleared
 *   3. same job, rdycommit_fail set   -> expects LOCUTION_RETRY
 * The attribute list head is re-cleared before every call.
 */
START_TEST(send_job_over_network_test)
  {
  /* fixed arguments shared by all three calls */
  const int            sock             = 5;
  const char          *script           = "script";
  const bool           send_script      = true;
  const bool           has_run          = false;
  const unsigned long  mom_addr         = 10;

  /* arguments passed by reference; the callee may modify them */
  bool        timed_out        = false;
  bool        attempt_to_queue = true;
  bool        change_substate  = true;
  tlist_head  attrs;
  int         my_err;
  int         mom_err;

  char *jobid  = strdup("1.napali");
  char *destin = strdup("bob");

  /* case 1: the "2.napali" job id is expected to fail outright */
  CLEAR_HEAD(attrs);
  fail_unless(send_job_over_network(strdup("2.napali"), sock, destin, attrs,
                                    attempt_to_queue, change_substate, timed_out,
                                    script, send_script, has_run, mom_addr,
                                    strdup("/out"), strdup("/err"), strdup("/chkpt"),
                                    MOVE_TYPE_Exec, &my_err, &mom_err) == LOCUTION_FAIL);

  /* case 2: normal path succeeds and clears attempt_to_queue */
  CLEAR_HEAD(attrs);
  fail_unless(send_job_over_network(jobid, sock, destin, attrs,
                                    attempt_to_queue, change_substate, timed_out,
                                    script, send_script, has_run, mom_addr,
                                    strdup("/out"), strdup("/err"), strdup("/chkpt"),
                                    MOVE_TYPE_Exec, &my_err, &mom_err) == PBSE_NONE);
  fail_unless(attempt_to_queue == false);

  /* diagnostic dump of the destination pointer and its contents */
  fprintf(stderr,"%p %s\n",(void *)destin,destin);

  /* case 3: with rdycommit_fail set, a retry is expected */
  CLEAR_HEAD(attrs);
  rdycommit_fail = true;
  fail_unless(send_job_over_network(jobid, sock, destin, attrs,
                                    attempt_to_queue, change_substate, timed_out,
                                    script, send_script, has_run, mom_addr,
                                    strdup("/out"), strdup("/err"), strdup("/chkpt"),
                                    MOVE_TYPE_Exec, &my_err, &mom_err) == LOCUTION_RETRY);

  /* restore the global flag so later tests are unaffected */
  rdycommit_fail = false;
  }
int send_job_over_network_with_retries( char *job_id, char *job_destin, tlist_head &attrl, bool &attempt_to_queue_job, bool &change_substate_on_attempt_to_queue, bool &timeout, const char *script_name, bool need_to_send_job_script, bool job_has_run, unsigned long job_momaddr, unsigned short job_momport, char *stdout_path, char *stderr_path, char *chkpt_path, int type, int *my_err, int *mom_err) { int con = PBS_NET_RC_UNSET; char log_buf[LOCAL_LOG_BUF_SIZE]; int rc = LOCUTION_RETRY; for (int NumRetries = 0; NumRetries < RETRY; NumRetries++) { /* connect to receiving server with retries */ if (NumRetries > 0) { /* recycle after an error */ if (con >= 0) { svr_disconnect(con); con = PBS_NET_RC_UNSET; } /* check my_err from previous attempt */ if ((should_retry_route(*my_err) == -1) || (should_retry_route(*mom_err) == -1)) { sprintf(log_buf, "child failed in previous commit request for job %s", job_id); log_err(*my_err, __func__, log_buf); break; } sleep(1 << NumRetries); } /* make sure this is zero at the point that we're retrying */ *my_err = 0; if ((con = svr_connect(job_momaddr, job_momport, my_err, NULL, NULL)) == PBS_NET_RC_FATAL) { sprintf(log_buf, "send_job failed to host %s, %lx port %d", (job_destin[0] != '\0') ? job_destin : "unknown host", job_momaddr, job_momport); log_err(*my_err, __func__, log_buf); rc = LOCUTION_FAIL; break; } if (con == PBS_NET_RC_RETRY) { *my_err = 0; /* should retry */ continue; } if (con == PBS_LOCAL_CONNECTION) { log_err(-1, __func__, "attempting to run the job on pbs_server???"); return(PBSE_SYSTEM); } rc = send_job_over_network(job_id, con, job_destin, attrl, attempt_to_queue_job, change_substate_on_attempt_to_queue, timeout, script_name, need_to_send_job_script, job_has_run, job_momaddr, stdout_path, stderr_path, chkpt_path, type, my_err, mom_err); if (rc == LOCUTION_SUCCESS) break; } /* END for (NumRetries) */ if (con >= 0) svr_disconnect(con); return(rc); } /* END send_job_over_network_with_retries() */