END_TEST #endif START_TEST(test_update_failure_counts) { const char *name = "lihue"; struct pbsnode *pnode = find_nodebyname(name); update_failure_counts(name, -1); update_failure_counts(name, -1); // Make sure the two failures are correctly counted fail_unless(pnode->nd_proximal_failures == 2); fail_unless(pnode->nd_consecutive_successes == 0); fail_unless(pnode->nd_state == INUSE_FREE); // One success shouldn't reset the failure counts update_failure_counts(name, 0); fail_unless(pnode->nd_proximal_failures == 2); fail_unless(pnode->nd_consecutive_successes == 1); fail_unless(pnode->nd_state == INUSE_FREE); // Two should update_failure_counts(name, 0); fail_unless(pnode->nd_proximal_failures == 0); fail_unless(pnode->nd_consecutive_successes == 2); fail_unless(pnode->nd_state == INUSE_FREE); // One failure should reset the success count update_failure_counts(name, 1); fail_unless(pnode->nd_proximal_failures == 1); fail_unless(pnode->nd_consecutive_successes == 0); fail_unless(pnode->nd_state == INUSE_FREE); // State shouldn't change until there are 3 proximal failures update_failure_counts(name, 1); fail_unless(pnode->nd_proximal_failures == 2); fail_unless(pnode->nd_consecutive_successes == 0); fail_unless(pnode->nd_state == INUSE_FREE); update_failure_counts(name, 1); fail_unless(pnode->nd_state != INUSE_FREE); fail_unless(pnode->nd_proximal_failures == 3); // State shouldn't reset until there are 2 consecutive successes update_failure_counts(name, 0); fail_unless(pnode->nd_state != INUSE_FREE); fail_unless(pnode->nd_proximal_failures == 3); fail_unless(pnode->nd_consecutive_successes == 1); update_failure_counts(name, 0); fail_unless(pnode->nd_state == INUSE_FREE); fail_unless(pnode->nd_proximal_failures == 0); fail_unless(pnode->nd_consecutive_successes == 2); }
int send_job_work( char *job_id, const char *node_name, /* I */ int type, /* I */ int *my_err, /* O */ batch_request *preq) /* M */ { int rc = LOCUTION_FAIL; int ret = PBSE_NONE; int local_errno = 0; tlist_head attrl; int encode_type; int mom_err = PBSE_NONE; int resc_access_perm; std::string script_name; char *pc; char stdout_path[MAXPATHLEN + 1]; char stderr_path[MAXPATHLEN + 1]; char chkpt_path[MAXPATHLEN + 1]; char log_buf[LOCAL_LOG_BUF_SIZE]; long start_time = time(NULL); bool attempt_to_queue_job = false; bool change_substate_on_attempt_to_queue = false; bool need_to_send_job_script = false; bool job_has_run = false; job *pjob = NULL; char job_destin[PBS_MAXROUTEDEST+1]; bool Timeout = false; unsigned long job_momaddr = -1; unsigned short job_momport = -1; if ((pjob = svr_find_job(job_id, TRUE)) == NULL) { *my_err = PBSE_JOBNOTFOUND; req_reject(-1, 0, preq, NULL, NULL); return(PBSE_JOBNOTFOUND); } mutex_mgr job_mutex(pjob->ji_mutex, true); if (strlen(pjob->ji_qs.ji_destin) != 0) strcpy(job_destin, pjob->ji_qs.ji_destin); else job_destin[0] = '\0'; job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport; if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) need_to_send_job_script = TRUE; if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) job_has_run = TRUE; if ((job_destin[0] != '\0') && (type != MOVE_TYPE_Exec)) { if ((pc = strchr(job_destin, '@')) != NULL) { job_momaddr = get_hostaddr(&local_errno, pc + 1); job_momport = pbs_server_port_dis; } } /* encode job attributes to be moved */ CLEAR_HEAD(attrl); /* select attributes/resources to send based on move type */ if (type == MOVE_TYPE_Exec) { /* moving job to MOM - ie job start */ resc_access_perm = ATR_DFLAG_MOM; encode_type = ATR_ENCODE_MOM; } else { /* moving job to alternate server? */ resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR | ATR_DFLAG_SvRD; encode_type = ATR_ENCODE_SVR; /* clear default resource settings */ ret = svr_dequejob(pjob, FALSE); if (ret) { job_mutex.set_unlock_on_exit(false); return(ret); } } encode_attributes(attrl, pjob, resc_access_perm, encode_type); rc = get_job_script_path(pjob, script_name); if (rc != PBSE_NONE) { if (rc == PBSE_JOB_RECYCLED) job_mutex.set_unlock_on_exit(false); free_server_attrs(&attrl); return(rc); } if (job_has_run) { if ((get_job_file_path(pjob, StdOut, stdout_path, sizeof(stdout_path)) != 0) || (get_job_file_path(pjob, StdErr, stderr_path, sizeof(stderr_path)) != 0) || (get_job_file_path(pjob, Checkpoint, chkpt_path, sizeof(chkpt_path)) != 0)) { job_mutex.unlock(); goto send_job_work_end; } } /* if the job is substate JOB_SUBSTATE_TRNOUTCM it means we are * recovering after being down or a late failure so we just want * to send the "ready-to-commit/commit" */ if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) { attempt_to_queue_job = true; if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) change_substate_on_attempt_to_queue = true; } job_mutex.unlock(); rc = send_job_over_network_with_retries(job_id, job_destin, attrl, attempt_to_queue_job, change_substate_on_attempt_to_queue, Timeout, script_name.c_str(), need_to_send_job_script, job_has_run, job_momaddr, job_momport, stdout_path, stderr_path, chkpt_path, type, my_err, &mom_err); if (Timeout == TRUE) { /* 10 indicates that job migrate timed out, server will mark node down * and abort the job - see post_sendmom() */ sprintf(log_buf, "child timed-out attempting to start job %s", job_id); log_ext(*my_err, __func__, log_buf, LOG_WARNING); rc = LOCUTION_REQUEUE; } else if (rc != LOCUTION_SUCCESS) { if (should_retry_route(*my_err) == -1) { sprintf(log_buf, "child failed and will not retry job %s", job_id); log_err(*my_err, __func__, log_buf); rc = LOCUTION_FAIL; } else rc = LOCUTION_REQUEUE; } if (type == MOVE_TYPE_Exec) { if (node_name != NULL) update_failure_counts(node_name, rc); else update_failure_counts(job_destin, rc); } send_job_work_end: finish_move_process(job_id, preq, start_time, node_name, rc, type, mom_err); free_server_attrs(&attrl); return(rc); } /* END send_job_work() */
int relay_to_mom( job **pjob_ptr, struct batch_request *request, /* the request to send */ void (*func)(struct work_task *)) { int handle; /* a client style connection handle */ int rc; int local_errno = 0; pbs_net_t addr; unsigned short port; job *pjob = *pjob_ptr; char jobid[PBS_MAXSVRJOBID + 1]; char *job_momname = NULL; struct pbsnode *node; char log_buf[LOCAL_LOG_BUF_SIZE]; std::string node_name; if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL) { snprintf(log_buf, sizeof(log_buf), "attempting to send a request to %s's mom but no exec_host list?", pjob->ji_qs.ji_jobid); log_err(PBSE_BADSTATE, __func__, log_buf); return(PBSE_BADSTATE); } /* if MOM is down don't try to connect */ addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr; port = pjob->ji_qs.ji_un.ji_exect.ji_momport; job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str); if (job_momname == NULL) return PBSE_MEM_MALLOC; if ((node = tfind_addr(addr, port, job_momname)) == NULL) { free(job_momname); return(PBSE_NORELYMOM); } free(job_momname); if ((node != NULL) && ((node->nd_state & INUSE_NOT_READY)|| (node->nd_power_state != POWER_STATE_RUNNING))) { node->unlock_node(__func__, "no relay mom", LOGLEVEL); return(PBSE_NORELYMOM); } if (LOGLEVEL >= 7) { char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr); sprintf(log_buf, "momaddr=%s",tmp); log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); free(tmp); } node_name = node->get_name(); node->unlock_node(__func__, "after svr_connect", LOGLEVEL); strcpy(jobid, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); *pjob_ptr = NULL; handle = svr_connect(addr, port, &local_errno, NULL, NULL); if (handle < 0) { update_failure_counts(node_name.c_str(), -1); log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom); return(PBSE_NORELYMOM); } request->rq_orgconn = request->rq_conn; /* save client socket */ rc = issue_Drequest(handle, request, true); if (request->rq_reply.brp_code == PBSE_TIMEOUT) update_failure_counts(node_name.c_str(), PBSE_TIMEOUT); else update_failure_counts(node_name.c_str(), 0); *pjob_ptr = svr_find_job(jobid, TRUE); return(rc); } /* END relay_to_mom() */