int issue_to_svr( char *servern, /* I */ struct batch_request *preq, /* I */ void (*replyfunc) (struct work_task *)) /* I */ { int do_retry = 0; int handle; pbs_net_t svraddr; char *svrname; unsigned int port = pbs_server_port_dis; struct work_task *pwt; strcpy(preq->rq_host, servern); preq->rq_fromsvr = 1; preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; svrname = parse_servername(servern, &port); svraddr = get_hostaddr(svrname); if (svraddr == (pbs_net_t)0) { if (pbs_errno == PBS_NET_RC_RETRY) { /* Non fatal error - retry */ do_retry = 1; } } else { handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS); if (handle >= 0) { return(issue_Drequest(handle, preq, replyfunc, NULL)); } else if (handle == PBS_NET_RC_RETRY) { do_retry = 1; } } /* if reached here, it didn`t go, do we retry? */ if (do_retry) { pwt = set_task( WORK_Timed, (long)(time_now + PBS_NET_RETRY_TIME), reissue_to_svr, (void *)preq); pwt->wt_parmfunc = replyfunc; return(0); } /* FAILURE */ return(-1); } /* END issue_to_svr() */
/**
 * finalize_rerunjob - finish processing a rerun request once the outcome
 * (rc) of asking the MOM to requeue the job is known.
 *
 * Behaviour by rc:
 *   -1           completed job was requeued; nothing extra is done here
 *    0           requeue succeeded; job substate becomes JOB_SUBSTATE_RERUN
 *   PBSE_SYSTEM  request is rejected with PBSE_MEM_MALLOC
 *   other        rejected with PBSE_MOMREJECT unless the request's extend
 *                string starts with RERUNFORCE, in which case the job is
 *                forcibly requeued: the failure is logged and mailed to the
 *                owner, resources are released and the job state is
 *                re-evaluated
 *
 * On every path that is not rejected, the checkpoint flags are cleared,
 * JOB_SVFLG_HASRUN is set, the rerun is logged, the request is acknowledged
 * and an accounting record is written.
 *
 * @param preq  the rerun batch request (replied to via req_reject/reply_ack)
 * @param pjob  the job being rerun; its ji_mutex is handed to a mutex_mgr for
 *              the remainder of the function
 *              NOTE(review): the 'true' argument presumably tells mutex_mgr
 *              the mutex is already held by the caller - confirm
 * @param rc    result of the rerun request
 *
 * @return PBSE_BAD_PARAMETER if pjob is NULL, PBSE_MEM_MALLOC or
 *         PBSE_MOMREJECT on the rejected paths, otherwise rc as passed in
 *         (which is non-zero on the forced-requeue path even though the
 *         request is acknowledged)
 */
int finalize_rerunjob(

  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int  Force;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {
    case -1:

      /* completed job was requeued */
      /* clear out job completion time if there is one */

      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM:

      /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, sizeof(log_buf), "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);

      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, sizeof(log_buf), "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);

        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        char         *host_spec;
        long          cray_enabled = FALSE;

        get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

        /* report the login node when cray is enabled and one is recorded,
         * otherwise the job's exec host */
        if ((cray_enabled == TRUE) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          host_spec = pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str;
        else
          host_spec = pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str;

        /* do not hand a NULL host string to the parser */
        tmp = (host_spec != NULL) ? parse_servername(host_spec, &dummy) : NULL;

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        snprintf(log_buf, sizeof(log_buf),
          "rerun req to %s failed (rc=%d), forcibly requeueing job",
          (tmp != NULL) ? tmp : "(unknown)",
          rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        /* bounded append: never write past the end of log_buf */
        strncat(log_buf,
          ", previous output files may be lost",
          sizeof(log_buf) - strlen(log_buf) - 1);

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob); /* free resc assigned to job */

        pjob->ji_modified = 1;    /* force full job save */

        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        /* newstate/newsubst are filled in by svr_evaljobstate() */
        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  /* drop all checkpoint flags and remember that the job has run */
  pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
      ~(JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE | JOB_SVFLG_CHECKPOINT_COPIED)) |
      JOB_SVFLG_HASRUN;

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  reply_ack(preq);

  /* note in accounting file */
  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END finalize_rerunjob() */