/*
 * post_rerun - completion handler for the rerun signal sent to MOM.
 *
 * If MOM rejected the signal, log the rejection and re-evaluate the job's
 * state so that it does not remain stuck in its previous state.
 *
 * @param preq - the signal batch request MOM replied to (may be NULL)
 */

void post_rerun(

  batch_request *preq)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  job  *pjob;
  int   state;
  int   substate;

  if (preq == NULL)
    return;

  /* a zero reply code means MOM accepted the signal - nothing to do */
  if (preq->rq_reply.brp_code == 0)
    return;

  sprintf(log_buf, "rerun signal reject by mom: %s - %d",
    preq->rq_ind.rq_signal.rq_jid,
    preq->rq_reply.brp_code);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

  pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob != NULL)
    {
    /* job mutex is released automatically when job_mutex leaves scope */
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    svr_evaljobstate(*pjob, state, substate, 1);
    svr_setjobstate(pjob, state, substate, FALSE);
    }
  } /* END post_rerun() */
/* * release_job - releases the hold on job j * @param j - the job to modify * @return 0 if successful, a PBS error on failure */ int release_job( struct batch_request *preq, /* I */ void *j) /* I/O */ { long old_hold; int rc = 0; int newstate; int newsub; char *pset; job *pjob = (job *)j; char log_buf[LOCAL_LOG_BUF_SIZE]; pbs_attribute temphold; /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { return(rc); } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { return(rc); } /* unset the hold */ old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR))) { return(rc); } /* everything went well, if holds changed, update the job state */ if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long) { pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ } sprintf(log_buf, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); return(rc); } /* END release_job() */
/*
 * hold_job - OR the hold bits from temphold into job j's hold attribute
 * and, unless the running job is checkpointable, re-evaluate its state.
 *
 * @param temphold - attribute carrying the hold bits to apply
 * @param j - the job to modify (treated as job *)
 */

void hold_job(

  attribute *temphold, /* I */
  void      *j)        /* I */

  {
  job       *pjob = (job *)j;
  long      *phold;
  long       prev_hold;
  attribute *pckpt;
  int        state;
  int        substate;

  if (pjob == NULL)
    return;

  /* merge the requested hold bits into the job's hold attribute */
  phold = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
  prev_hold = *phold;
  *phold |= temphold->at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;

  pckpt = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      (pckpt->at_flags & ATR_VFLAG_SET) &&
      ((csv_find_string(pckpt->at_val.at_str, "s") != NULL) ||
       (csv_find_string(pckpt->at_val.at_str, "c") != NULL) ||
       (csv_find_string(pckpt->at_val.at_str, "enabled") != NULL)))
    {
    /* TODO */
    /* preq_tmp = alloc_br(preq->rq_type); */
    }
  else if (prev_hold != *phold)
    {
    /* indicate attributes changed */
    pjob->ji_modified = 1;

    svr_evaljobstate(pjob, &state, &substate, 0);
    svr_setjobstate(pjob, state, substate);
    }
  }
/**
 * @brief
 *	force_reque - requeue (rerun) a job
 *
 * @param[in,out] pjob - job which needs to be rerun
 */
void force_reque(job *pjob)
  {
  /* attributes that lose their meaning once a job is requeued */
  static const int requeue_clear[] = {
    (int)JOB_ATR_exec_host,
    (int)JOB_ATR_exec_host2,
    (int)JOB_ATR_exec_vnode,
    (int)JOB_ATR_pset,
    (int)JOB_ATR_jobdir  /* job dir has no meaning for re-queued jobs */
    };
  int idx;
  int nstate;
  int nsubstate;

  pjob->ji_modified = 1;
  pjob->ji_momhandle = -1;
  pjob->ji_mom_prot = PROT_INVALID;

  /* simulate rerun: free nodes, clear checkpoint flag, and
   * clear exec_vnode string */
  rel_resc(pjob);

  /* note in accounting file */
  account_jobend(pjob, pjob->ji_acctrec, PBS_ACCT_RERUN);

  /* a subjob is requeued by setting substate RERUN3, which causes the
   * track-table entry to be reset to Queued, then the job struct itself
   * is purged */
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob)
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
    job_purge(pjob);
    return;
    }

  /*
   * Clear any JOB_SVFLG_Actsuspd flag too, as the job is no longer
   * suspended (User busy).  A suspended job is rerun in case of a
   * MOM failure after the workstation becomes active (busy).
   */
  pjob->ji_qs.ji_svrflags &=
    ~(JOB_SVFLG_Actsuspd | JOB_SVFLG_StagedIn | JOB_SVFLG_CHKPT);

  for (idx = 0; idx < (int)(sizeof(requeue_clear) / sizeof(requeue_clear[0])); idx++)
    job_attr_def[requeue_clear[idx]].at_free(&pjob->ji_wattr[requeue_clear[idx]]);

  svr_evaljobstate(pjob, &nstate, &nsubstate, 1);
  (void)svr_setjobstate(pjob, nstate, nsubstate);
  }
END_TEST

/*
 * svr_evaljobstate_test - unit test (Check framework) for svr_evaljobstate().
 *
 * Verifies that the evaluated state/substate match the job's stored
 * state/substate for several job configurations, that NULL arguments are
 * tolerated without crashing, and that a forced re-evaluation (last arg = 1)
 * yields QUEUED/QUEUED.
 */
START_TEST(svr_evaljobstate_test)
  {
  struct job test_job;
  int state = 0;
  int substate = 0;

  memset(&test_job, 0, sizeof(test_job));

  /* NULL-argument calls: only checking that these do not crash */
  svr_evaljobstate(NULL, &state, &substate, 0);
  svr_evaljobstate(&test_job, NULL, &substate, 0);
  svr_evaljobstate(&test_job, &state, NULL, 0);

  /* case 1: running job - evaluated state mirrors the stored state */
  test_job.ji_qs.ji_state = JOB_STATE_RUNNING;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 1");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 1");

  /* case 2: job with a hold set */
  memset(&test_job, 0, sizeof(test_job));
  test_job.ji_wattr[JOB_ATR_hold].at_val.at_long = 1;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 2");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 2");

  /* case 3: job with the stagein attribute flagged */
  memset(&test_job, 0, sizeof(test_job));
  test_job.ji_wattr[JOB_ATR_stagein].at_flags = 1;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 3");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 3");

  /* case 4: zeroed (default) job */
  memset(&test_job, 0, sizeof(test_job));
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 4");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 4");

  /* case 5: forced re-evaluation requeues the job */
  memset(&test_job, 0, sizeof(test_job));
  svr_evaljobstate(&test_job, &state, &substate, 1);
  fail_unless(JOB_STATE_QUEUED == state, "svr_setjobstate state fail case 5");
  fail_unless(JOB_SUBSTATE_QUEUED == substate, "svr_setjobstate substate fail case 5");
  }
/*
 * req_rerunjob - service the Rerun Job batch request.
 *
 * Validates the requestor and the job's state/permissions, then either asks
 * MOM to kill a running job (completion handled by post_rerun) or directly
 * requeues a completed job.  With the RERUNFORCE extension set, a job whose
 * MOM cannot be reached is forcibly requeued on the server side.
 *
 * @param preq - the rerun batch request
 * @return PBSE_NONE on success, a PBS error code on failure
 */
int req_rerunjob(

  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;
  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */
    /* chk_job_request calls req_reject() */
    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */
      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */
    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE - non-checkpointed rerun needs operator/manager privilege */
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /* no, the user is saying that the job will break, and IEEE Std 1003.1
       specifically says rerun is to be rejected if rerunable==FALSE -garrick */
    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
      preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    {
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    /* rc == -1 marks "completed job was directly requeued" for the switch below */
    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {
    case - 1:

      /* completed job was requeued */
      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/

      /* NOTE(review): this path returns without unlock_ji_mutex() on pjob,
       * unlike the other failure paths above - looks like a mutex leak;
       * confirm against upstream before relying on it */
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);

        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;

        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */

          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);

          log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

          log_err(-1, __func__, log_buf);

          strcat(log_buf, ", previous output files may be lost");

          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

          rel_resc(pjob); /* free resc assigned to job */

          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up */
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);
            }

          pjob->ji_modified = 1; /* force full job save */

          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    } /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */

  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    /* clear checkpoint-related flags and mark the job as having run */
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);

    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);

    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  } /* END req_rerunjob() */
/*
 * req_deletejob - service the Delete Job batch request.
 *
 * Handles scheduler purge-complete requests, forced purges, delete of jobs
 * in transit or PRERUN/RERUN states (deferred/retried), and finally either
 * signals MOM (running job) or moves the job to a completed/aborted state.
 *
 * @param preq - the delete batch request (replied to on all paths)
 */
void req_deletejob(

  struct batch_request *preq) /* I */

  {
  job              *pjob;

  struct work_task *pwtold;

  struct work_task *pwtnew;

  struct work_task *pwtcheck;

  int               rc;
  char             *sigt = "SIGTERM";

  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */

  if ((preq->rq_extend != NULL) &&
      !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
    {
    /*
     * purge_completed_jobs will respond with either an ack or reject
     */

    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command "qdel -p <jobid>",
   * they can then delete jobs other than their own since the authorization
   * checks are made below in chk_job_request. This should probably be fixed.
   */

  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE:  should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE:  chk_job_request() will issue req_reject() */

    return;
    }

  if (preq->rq_extend != NULL)
    {
    /* an extension that is not one of the known delete keywords is a
     * free-form message to attach to the delete */
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */

      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
       */

      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
          "must have operator or manager privilege to use -m parameter");

        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(pwtold->wt_type, pwtold->wt_event, post_delete_route, preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task;  it makes
           * sure that next_task() will find the
           * new entry.
           */

          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return; /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ... */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* static state tracks the one job currently being cycle-checked */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it?  delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq);

    /* NOTE: the return below is intentionally unconditional - the request
     * is either requeued via the work task or rejected here */
    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting a asynchronous delete */

    if ((preq->rq_extend != NULL) &&
        !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */

      snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);
      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;
      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
        preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
      }

    /* make a cleanup task if set */

    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(WORK_Timed,
          time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
          ensure_deleted, preq);

      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */

    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    sprintf(log_buffer, msg_delrunjobsig, sigt);

    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */

  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted, preq);

    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */

  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      /* release the slot-limit hold on the first held sibling found */
      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }

          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */

    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1; /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;

    struct pbs_queue *pque;

    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    /* NOTE(review): the second test is redundant (pque was just assigned),
     * and attr_ifelse_long() below dereferences pque OUTSIDE this guard -
     * if ji_qhdr can be NULL here this is a NULL dereference; confirm */
    if ((pque = pjob->ji_qhdr) && (pque != NULL))
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
        &pque->qu_attr[QE_ATR_KeepCompleted],
        &server.sv_attr[SRV_ATR_KeepCompleted],
        0);

    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  reply_ack(preq);

  return;
  } /* END req_deletejob() */
/**
 *
 * @brief
 *		Send a job over the network to some other server or MOM.
 * @par
 *		Under Linux/Unix, this starts a child process to do the work.
 *		Connect to the destination host and port,
 *		and go through the protocol to transfer the job.
 *		Signals are blocked.
 *
 * @param[in]	jobp - pointer to the job being sent.
 * @param[in]	hostaddr - the address of host to send job to, host byte order.
 * @param[in]	port - the destination port, host byte order
 * @param[in]	move_type - the type of move (e.g. MOVE_TYPE_exec)
 * @param[in]	post_func - the function to execute once the child process
 *			sending job completes (Linux/Unix only)
 * @param[in]	preq - input data to 'post_func'
 *
 * @return	int
 * @retval	2	parent : success (child forked)
 * @retval	-1	parent : on failure (pbs_errno set to error number)
 * @retval	SEND_JOB_OK	child : 0 success, job sent
 * @retval	SEND_JOB_FATAL	child : 1 permenent failure or rejection,
 * @retval	SEND_JOB_RETRY	child : 2 failed but try again
 * @retval	SEND_JOB_NODEDW	child : 3 execution node down, retry different node
 */
int send_job(job *jobp, pbs_net_t hostaddr, int port, int move_type, void (*post_func)(struct work_task *), struct batch_request *preq)
{
#ifdef WIN32
	char cmdline[80];
	pio_handles pio;
	char buf[4096];
	struct work_task *ptask;
	int newstate;
	int newsub;
	long tempval;
	char script_name[MAXPATHLEN+1];
	int gridproxy_cred = 0;

#ifdef PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif
	/* TCP-based direct send: no helper process required */
	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	sprintf(cmdline, "%s/sbin/pbs_send_job", pbs_conf.pbs_exec_path);
	if (win_popen(cmdline, "w", &pio, NULL) == 0) {
		errno = GetLastError();
		pbs_errno = errno;
		(void)sprintf(log_buffer, "executing %s for job %s failed errno=%d", cmdline, jobp->ji_qs.ji_jobid, errno);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_ERR, jobp->ji_qs.ji_jobid, log_buffer);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);
		win_pclose(&pio);
		return (-1);
	}

	ptask = set_task(WORK_Deferred_Child, (long)pio.pi.hProcess, post_func, preq);
	if (!ptask) {
		log_err(errno, __func__, msg_err_malloc);
		errno = ENOMEM;
		pbs_errno = errno;
		win_pclose(&pio);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);
		return (-1);
	} else {
		ptask->wt_parm2 = jobp;
		append_link(&((job *)jobp)->ji_svrtask, &ptask->wt_linkobj, ptask);
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		/* NOTE(review): this path passes &script_name (pointer to array)
		 * while the POSIX path below passes script_name - verify the
		 * expected parameter type of svr_create_tmp_jobscript() */
		if (svr_create_tmp_jobscript(jobp, &script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer), "Failed to create temporary job script for job %s", jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			win_pclose2(&pio);
			return (-1);
		}
	}

	addpid(pio.pi.hProcess);

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */
	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/* in windows code, a child process "w32_send_job" handles the send
	 * This needs the job information, so we save using the filesystem
	 * This avoids the child process from having to "connect" to the database again
	 * The file is deleted by the send_job child process when it has done recovering the job
	 */
	job_save_fs(jobp, SAVEJOB_FULLFORCE); /* so the spawned process can get a fresh copy of job */

	/* hand all parameters to the helper process over its stdin pipe */
	if (*jobp->ji_qs.ji_fileprefix != '\0')
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_fileprefix, JOB_FILE_SUFFIX);
	else
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_jobid, JOB_FILE_SUFFIX);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destaddr=%ld\n", hostaddr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destport=%d\n", port);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "move_type=%d\n", move_type);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "in_server=%d\n", is_linked(&svr_alljobs, &jobp->ji_alljobs));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_name=%s\n", (server_name?server_name:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_host=%s\n", (server_host?server_host:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_addr=%ld\n", pbs_server_addr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_port=%d\n", pbs_server_port_dis);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "log_file=%s\n", (log_file?log_file:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_log=%s\n", (path_log?path_log:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_jobs=%s\n", (path_jobs?path_jobs:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_spool=%s\n", (path_spool?path_spool:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_rescdef=%s\n", (path_rescdef?path_rescdef:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_users=%s\n", (path_users?path_users:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_hooks_workdir=%s\n", (path_hooks_workdir?path_hooks_workdir:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_enable=%ld\n", svr_history_enable);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_duration=%ld\n", svr_history_duration);
	win_pwrite(&pio, buf, strlen(buf));

	if ( (server.sv_attr[SRV_ATR_ssignon_enable].at_flags & \
			ATR_VFLAG_SET) && \
		(server.sv_attr[SRV_ATR_ssignon_enable].at_val.at_long == 1) )
		strcpy(buf, "single_signon_password_enable=1\n");
	else
		strcpy(buf, "single_signon_password_enable=0\n");
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "script_name=%s\n", script_name);
	win_pwrite(&pio, buf, strlen(buf));

	strcpy(buf, "quit\n");
	win_pwrite(&pio, buf, strlen(buf));

	win_pclose2(&pio); /* closes all handles except the process handle */

	return (2);

#else
	pbs_list_head attrl;
	enum conn_type cntype = ToServerDIS;
	int con;
	char *credbuf = NULL;
	size_t credlen = 0;
	char *destin = jobp->ji_qs.ji_destin;
	int encode_type;
	int i;
	char job_id[PBS_MAXSVRJOBID+1];
	attribute *pattr;
	pid_t pid;
	struct attropl *pqjatr; /* list (single) of attropl for quejob */
	char script_name[MAXPATHLEN+1];
	struct work_task *ptask;
	struct hostent *hp;
	struct in_addr addr;
	long tempval;
	int gridproxy_cred = 0;
	int rpp = 0;

#ifdef PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif
	/* TCP-based direct send: no child process required */
	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer), "Failed to create temporary job script for job %s", jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			return -1;
		}
	}

	pid = fork();

	if (pid == -1) {
		/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		pbs_errno = PBSE_SYSTEM;
		return -1;
	}

	if (pid != 0) {
		/* The parent (main server): register a deferred task that fires
		 * when the child exits, then return immediately */
		ptask = set_task(WORK_Deferred_Child, pid, post_func, preq);
		if (!ptask) {
			log_err(errno, __func__, msg_err_malloc);
			return (-1);
		} else {
			ptask->wt_parm2 = jobp;
			append_link(&((job *)jobp)->ji_svrtask, &ptask->wt_linkobj, ptask);
		}
		return 2;
	}

	/*
	 * the child process
	 *
	 * set up signal cather for error return
	 */
	DBPRT(("%s: child started, sending to port %d\n", __func__, port))

	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#ifdef WIN32
	/* get host name */
	/*
	 * If host address is loopback address then do not resolve with dns
	 * Use "localhost" as the host name.
	 */
	if ((htonl(hostaddr) == loopback_addr->sin_addr.s_addr)) {
		(void)get_credential(LOCALHOST_SHORTNAME, jobp, PBS_GC_BATREQ, &credbuf, &credlen);
	} else {
#endif
		addr.s_addr = htonl(hostaddr);
		hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
		if (hp == NULL) {
			sprintf(log_buffer, "%s: h_errno=%d", inet_ntoa(addr), h_errno);
			log_err(-1, __func__, log_buffer);
		} else {
			/* read any credential file */
			(void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ, &credbuf, &credlen);
		}
#ifdef WIN32
	}
#endif

	/* encode job attributes to be moved */
	CLEAR_HEAD(attrl);

	/* select attributes/resources to send based on move type */
	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp); /* clears default resource settings */
	}

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */
	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	pattr = jobp->ji_wattr;
	for (i=0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl, (job_attr_def+i)->at_name, (char *)0, encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);

	/* save the job id for when after we purge the job */
	(void)strcpy(job_id, jobp->ji_qs.ji_jobid);

	pbs_errno = 0;
	con = -1;

	for (i=0; i<RETRY; i++) {
		/* connect to receiving server with retries */
		if (i > 0) {
			/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				/* delete the temp script file */
				unlink(script_name);
				exit(SEND_JOB_FATAL); /* fatal error, don't retry */
			}
			/* exponential backoff between retries */
			sleep(1<<i);
		}
		if ((con = svr_connect(hostaddr, port, 0, cntype, rpp)) == PBS_NET_RC_FATAL) {
			(void)sprintf(log_buffer, "send_job failed to %lx port %d", hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);
			/* delete the temp script file */
			unlink(script_name);
			if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == PBSE_BADCRED))
				exit(SEND_JOB_NODEDW);
			exit(SEND_JOB_FATAL);
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = ECONNREFUSED; /* should retry */
			continue;
		}
		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the commit"
		 */
		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {
			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}
			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin, pqjatr, (char *)0, rpp, NULL) == 0) {
				if (pbs_errno == PBSE_JOBEXIST && move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, "Mom reports job already running");
					exit(SEND_JOB_OK);
				} else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT) ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB) ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					/* a hook refused the job: record the reject
					 * message in a file and exit with a hook-
					 * specific status */
					char name_buf[MAXPATHLEN+1];
					int rfd;
					int len;
					char *reject_msg;
					int err;

					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);

					(void)sprintf(log_buffer, "send of job to %s failed error = %d reject_msg=%s", destin, err, reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);

					(void)strcpy(name_buf, path_hooks_workdir);
					(void)strcat(name_buf, jobp->ji_qs.ji_jobid);
					(void)strcat(name_buf, HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) && (reject_msg[0] != '\0')) {
						if ((rfd = open(name_buf, O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							sprintf(log_buffer, "open of reject file %s failed: errno %d", name_buf, errno);
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
						} else {
#ifdef WIN32
							secure_file(name_buf, "Administrators", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
#endif
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								sprintf(log_buffer, "write to file %s incomplete: errno %d", name_buf, errno);
								log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
							}
							close(rfd);
						}
					}
					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				} else {
					(void)sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
					continue;
				}
			}
			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, rpp, NULL) != 0)
					continue;
			}
			if (credlen > 0) {
				int ret;
				ret = PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, rpp, NULL);
				if ((ret == 0) || (i == (RETRY - 1)))
					free(credbuf);	/* free credbuf if cred info is sent successfully OR */
							/* at the end of all retry attempts */
				if (ret != 0)
					continue;
			}
			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr != pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, StdErr, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, Chkpt, rpp, NULL) != 0))
					continue;
			}
			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, rpp, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, rpp, NULL) != 0) {
			/* delete the temp script file */
			unlink(script_name);
			exit(SEND_JOB_FATAL);
		}
		svr_disconnect(con);

		/* delete the temp script file */
		unlink(script_name);

		exit(SEND_JOB_OK);	/* This child process is all done */
	}

	if (con >= 0)
		svr_disconnect(con);

	/*
	 * If connection is actively refused by the execution node(or mother superior) OR
	 * the execution node(or mother superior) is rejecting request with error
	 * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == ECONNREFUSED || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}
	(void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer);

	/* delete the temp script file */
	unlink(script_name);

	exit(i);
	return -1;	/* NOT REACHED */
#endif	/* !WIN32 */
}
static void post_movejob( struct work_task *pwt) { char *id = "post_movejob"; struct batch_request *req; int newstate; int newsub; int stat; int r; job *jobp; req = (struct batch_request *)pwt->wt_parm2; stat = pwt->wt_aux; pbs_errno = PBSE_NONE; if (req->rq_type != PBS_BATCH_MoveJob) { sprintf(log_buffer, "bad request type %d\n", req->rq_type); log_err(-1, id, log_buffer); return; } jobp = find_job(req->rq_ind.rq_move.rq_jid); if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm1)) { sprintf(log_buffer, "job %s not found\n", req->rq_ind.rq_move.rq_jid); log_err(-1, id, log_buffer); } if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); if (r == 0) { /* purge server's job structure */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) remove_stagein(jobp); if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED) remove_checkpoint(jobp); strcpy(log_buffer, msg_movejob); sprintf(log_buffer + strlen(log_buffer), msg_manager, req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host); job_purge(jobp); } else { r = PBSE_ROUTEREJ; } } else { r = PBSE_SYSTEM; sprintf(log_buffer, msg_badexit, stat); strcat(log_buffer, id); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } if (r) { if (jobp != NULL) { /* force re-eval of job state out of Transit */ svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } req_reject(r, 0, req, NULL, NULL); } else { reply_ack(req); } return; } /* END post_movejob() */
void req_modifyjob(struct batch_request *preq) { int add_to_am_list = 0; /* if altered during sched cycle */ int bad = 0; int jt; /* job type */ int newstate; int newsubstate; resource_def *outsideselect = NULL; job *pjob; svrattrl *plist; resource *presc; resource_def *prsd; int rc; int running = 0; int sendmom = 0; char hook_msg[HOOK_MSG_SIZE]; int mod_project = 0; pbs_sched *psched; switch (process_hooks(preq, hook_msg, sizeof(hook_msg), pbs_python_set_interrupt)) { case 0: /* explicit reject */ reply_text(preq, PBSE_HOOKERROR, hook_msg); return; case 1: /* explicit accept */ if (recreate_request(preq) == -1) { /* error */ /* we have to reject the request, as 'preq' */ /* may have been partly modified */ strcpy(hook_msg, "modifyjob event: rejected request"); log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK, LOG_ERR, "", hook_msg); reply_text(preq, PBSE_HOOKERROR, hook_msg); return; } break; case 2: /* no hook script executed - go ahead and accept event*/ break; default: log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK, LOG_INFO, "", "modifyjob event: accept req by default"); } if (pseldef == NULL) /* do one time to keep handy */ pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size); pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt); if (pjob == NULL) return; if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) { req_reject(PBSE_IVALREQ, 0, preq); return; } psched = find_sched_from_sock(preq->rq_conn); /* allow scheduler to modify job */ if (psched == NULL) { /* provisioning job is not allowed to be modified */ if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) { req_reject(PBSE_BADSTATE, 0, preq); return; } } /* cannot be in exiting or transit, exiting has already be checked */ if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { req_reject(PBSE_BADSTATE, 0, preq); return; } plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); if (plist == NULL) { /* nothing to do */ 
reply_ack(preq); return; } /* * Special checks must be made: * if during a scheduling cycle and certain attributes are altered, * make a note of the job to prevent it from being run now; * if job is running, only certain attributes/resources can be * altered. */ if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { running = 1; } while (plist) { int i; i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST); /* * Is the attribute being altered one which could change * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling * cycle is in progress, then set flag to add the job to list * of jobs which cannot be run in this cycle. * If the scheduler itself sends a modify job request, * no need to delay the job until next cycle. */ if ((psched == NULL) && (scheduler_jobs_stat) && (job_attr_def[i].at_flags & ATR_DFLAG_SCGALT)) add_to_am_list = 1; /* Is the attribute modifiable in RUN state ? */ if (i < 0) { reply_badattr(PBSE_NOATTR, 1, plist, preq); return; } if ((running == 1) && ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) { reply_badattr(PBSE_MODATRRUN, 1, plist, preq); return; } if (i == (int)JOB_ATR_resource) { prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size); if (prsd == 0) { reply_badattr(PBSE_UNKRESC, 1, plist, preq); return; } /* is the specified resource modifiable while */ /* the job is running */ if (running) { if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) { reply_badattr(PBSE_MODATRRUN, 1, plist, preq); return; } sendmom = 1; } /* should the resource be only in a select spec */ if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect && plist->al_atopl.value && plist->al_atopl.value[0]) { /* if "-lresource" is set and has non-NULL value, ** remember as potential bad resource ** if this appears along "select". 
*/ outsideselect = prsd; } } if (strcmp(plist->al_name, ATTR_project) == 0) { mod_project = 1; } else if ((strcmp(plist->al_name, ATTR_runcount) == 0) && ((plist->al_flags & ATR_VFLAG_HOOK) == 0) && (plist->al_value != NULL) && (plist->al_value[0] != '\0') && ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) && (atol(plist->al_value) < \ pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) { sprintf(log_buffer, "regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld", preq->rq_user, preq->rq_host, ATTR_runcount, pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long, atol(plist->al_value)); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_PERM, 0, preq); return; } plist = (svrattrl *)GET_NEXT(plist->al_link); } if (outsideselect) { presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource], pseldef); if (presc && ((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) { /* select is not a default, so reject qalter */ resc_in_err = strdup(outsideselect->rs_name); req_reject(PBSE_INVALJOBRESC, 0, preq); return; } } /* modify the jobs attributes */ bad = 0; plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad); if (rc) { if (pjob->ji_clterrmsg) reply_text(preq, rc, pjob->ji_clterrmsg); else reply_badattr(rc, bad, plist, preq); return; } /* If certain attributes modified and if in scheduling cycle */ /* then add to list of jobs which cannot be run in this cycle */ if (add_to_am_list) am_jobs_add(pjob); /* see req_runjob() */ /* check if project attribute was requested to be modified to */ /* be the default project value */ if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \ ATR_VFLAG_SET)) { if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str, PBS_DEFAULT_PROJECT) == 0) { sprintf(log_buffer, msg_defproject, ATTR_project, PBS_DEFAULT_PROJECT); #ifdef NAS /* localmod 107 */ log_event(PBSEVENT_DEBUG4, 
PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); #else log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); #endif /* localmod 107 */ } } if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) { presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource], pseldef); if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) { /* changing Resource_List and select is a default */ /* clear "select" so it is rebuilt inset_resc_deflt */ pseldef->rs_free(&presc->rs_value); } } /* Reset any defaults resource limit which might have been unset */ if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) { req_reject(rc, 0, preq); return; } /* if job is not running, may need to change its state */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { svr_evaljobstate(pjob, &newstate, &newsubstate, 0); (void)svr_setjobstate(pjob, newstate, newsubstate); } else { (void)job_save(pjob, SAVEJOB_FULL); } (void)sprintf(log_buffer, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); /* if a resource limit changed for a running job, send to MOM */ if (sendmom) { rc = relay_to_mom(pjob, preq, post_modify_req); if (rc) req_reject(rc, 0, preq); /* unable to get to MOM */ return; } reply_ack(preq); }
/*
 * process_hold_reply - handle MOM's reply to a hold request that was
 *	relayed for a running, checkpointable job (see req_holdjob).
 *
 *	If MOM rejected the hold, the hold bits set earlier are backed out
 *	and the job is returned to the running substate; if MOM accepted,
 *	the job is marked as having a checkpoint file and its state is
 *	re-evaluated.  A reply (ack or reject) is always sent on preq.
 *
 * @param preq - the relayed batch request carrying MOM's reply
 */

void process_hold_reply(

  batch_request *preq)

  {
  job           *pjob;
  pbs_attribute  temphold;

  int            newstate;
  int            newsub;
  int            rc;
  char          *pset;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  /* preq was handled previously */

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL)
    {
    /* job disappeared while the request was at MOM */
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    if (preq->rq_reply.brp_code != 0)
      {
      /* MOM rejected the hold - decode the hold bits again and undo them */
      rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold);

      if (rc == 0)
        {
        rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
              &temphold, DECR);
        }

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

      pjob->ji_modified = 1;    /* indicate attributes changed */
      svr_evaljobstate(*pjob, newstate, newsub, 0);
      svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

      /* PBSE_NOSUP (hold not supported by MOM) is treated as success */
      if (preq->rq_reply.brp_code != PBSE_NOSUP)
        {
        sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
        log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
        }
      else
        {
        reply_ack(preq);
        }
      }
    else
      {
      /* record that MOM has a checkpoint file */

      /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
       * And if these flags are not set, start_exec will not try to run the job from
       * the checkpoint image file.
       */

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

      if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */
        {
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
        }

      pjob->ji_modified = 1;    /* indicate attributes changed */

      svr_evaljobstate(*pjob, newstate, newsub, 0);
      svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

      account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */

      reply_ack(preq);
      }
    }
  }  /* END process_hold_reply() */
/*
 * post_stagein - work task run when MOM replies to a stage-in request.
 *
 *	On failure the job's nodes are freed, the job is put into a
 *	waiting/STAGEFAIL state (with a delayed retry time if no execution
 *	time was set), and the owner is mailed the error text.  On success
 *	the job is marked StagedIn and either started (possibly via a
 *	checkpoint restart) or its state is re-evaluated.
 *
 *	pwt->wt_parm1 is the batch request; preq->rq_extra holds the job id
 *	and is freed here.
 */

static void post_stagein(

  struct work_task *pwt)

  {
  int                   code;
  int                   newstate;
  int                   newsub;
  job                  *pjob;

  struct batch_request *preq;
  attribute            *pwait;

  preq = pwt->wt_parm1;
  code = preq->rq_reply.brp_code;
  pjob = find_job(preq->rq_extra);

  free(preq->rq_extra);

  if (pjob != NULL)
    {
    if (code != 0)
      {
      /* stage in failed - hold job */

      free_nodes(pjob);

      pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime];

      if ((pwait->at_flags & ATR_VFLAG_SET) == 0)
        {
        /* no execution time was set - delay any retry of the job */
        pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT;
        pwait->at_flags |= ATR_VFLAG_SET;
        job_set_wait(pwait, pjob, 0);
        }

      svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL);

      if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text)
        {
        /* set job comment */

        /* NYI */

        /* mail the owner MOM's error text */
        svr_mailowner(
          pjob,
          MAIL_STAGEIN,
          MAIL_FORCE,
          preq->rq_reply.brp_un.brp_txt.brp_str);
        }
      }
    else
      {
      /* stage in was successful */

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_StagedIn;

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO)
        {
        if (is_checkpoint_restart(pjob))
          {
          /* need to copy checkpoint file to mom before running */
          svr_send_checkpoint(
            pjob,
            preq,
            JOB_STATE_RUNNING,
            JOB_SUBSTATE_CHKPTGO);
          }
        else
          {
          /* continue to start job running */

          svr_strtjob2(pjob, NULL);
          }
        }
      else
        {
        /* job was not waiting to run - just re-evaluate its state */
        svr_evaljobstate(pjob, &newstate, &newsub, 0);
        svr_setjobstate(pjob, newstate, newsub);
        }
      }
    }  /* END if (pjob != NULL) */

  release_req(pwt);  /* close connection and release request */

  return;
  }  /* END post_stagein() */
/*
 * post_sendmom - work task run when the child that was sending a job to
 *	MOM has exited.
 *
 *	Decodes the child's exit code:
 *	  0     - job started ok: ack the request, record the start time,
 *	          update accounting/resource usage, trigger dependencies and
 *	          request a status from MOM for the session id.
 *	  10    - connection to MOM timed out: mark the node down, free the
 *	          nodes and requeue the job.
 *	  other - MOM rejected the job (commit failed etc.): requeue, except
 *	          when MOM reports the job already exists there, which is
 *	          treated as success.
 */

static void post_sendmom(

  struct work_task *pwt)  /* I */

  {
  char *id = "post_sendmom";

  int   newstate;
  int   newsub;
  int   r;
  int   stat;
  job  *jobp = (job *)pwt->wt_parm1;

  struct batch_request *preq = (struct batch_request *)pwt->wt_parm2;

  char *MOMName = NULL;

  int   jindex;
  long  DTime = time_now - 10000;  /* fallback dispatch time if not tracked */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "entering post_sendmom");
    }

  stat = pwt->wt_aux;

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    r = 2;  /* cannot get child exit status */

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  /* maintain local struct to associate job id with dispatch time */

  for (jindex = 0;jindex < 20;jindex++)
    {
    if (DispatchJob[jindex] == jobp)
      {
      DTime = DispatchTime[jindex];

      DispatchJob[jindex] = NULL;

      MOMName = DispatchNode[jindex];

      break;
      }
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d",
            (r == 0) ? "success" : "failure",
            time_now - DTime,
            (MOMName != NULL) ? MOMName : "???",
            r);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {
    case 0:  /* send to MOM went ok */

      jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

      if (preq != NULL)
        reply_ack(preq);

      /* record start time for accounting */

      jobp->ji_qs.ji_stime = time_now;

      /* update resource usage attributes */

      set_resc_assigned(jobp, INCR);

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        /* may be EXITING if job finished first */

        svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

        /* above saves job structure */
        }

      /* accounting log for start or restart */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint");
      else
        account_jobstr(jobp);

      /* if any dependencies, see if action required */

      if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
        depend_on_exec(jobp);

      /*
       * it is unfortunate, but while the job has gone into execution,
       * there is no way of obtaining the session id except by making
       * a status request of MOM.  (Even if the session id was passed
       * back to the sending child, it couldn't get up to the parent.)
       */

      jobp->ji_momstat = 0;

      stat_mom_job(jobp);

      break;

    case 10:

      /* NOTE:  if r == 10, connection to mom timed out.  Mark node down */

      stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0);

      /* send failed, requeue the job */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        "unable to run job, MOM rejected/timeout");

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          req_reject(PBSE_MOMREJECT, 0, preq, MOMName, "connection to mom timed out");

        svr_evaljobstate(jobp, &newstate, &newsub, 1);

        svr_setjobstate(jobp, newstate, newsub);
        }
      else
        {
        /* job is being deleted - don't requeue */
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "job was aborted by mom");
        }

      break;

    case 1:   /* commit failed */

    default:

      {
      int JobOK = 0;

      /* send failed, requeue the job */

      sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d",
              r);

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        log_buffer);

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          {
          char tmpLine[1024];

          if (preq->rq_reply.brp_code == PBSE_JOBEXIST)
            {
            /* job already running, start request failed but return success since
             * desired behavior (job is running) is accomplished */

            JobOK = 1;
            }
          else
            {
            sprintf(tmpLine, "cannot send job to %s, state=%s",
                    (MOMName != NULL) ? MOMName : "mom",
                    PJobSubState[jobp->ji_qs.ji_substate]);

            req_reject(PBSE_MOMREJECT, 0, preq, MOMName, tmpLine);
            }
          }

        if (JobOK == 1)
          {
          /* do not re-establish accounting - completed first time job was started */

          /* update mom-based job status */

          jobp->ji_momstat = 0;

          stat_mom_job(jobp);
          }
        else
          {
          /* requeue the job */
          svr_evaljobstate(jobp, &newstate, &newsub, 1);

          svr_setjobstate(jobp, newstate, newsub);
          }
        }
      else
        {
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "send failed - abort");
        }

      break;
      }
    }  /* END switch (r) */

  return;
  }  /* END post_sendmom() */
/**
 * update_array_values()
 *
 * updates internal bookeeping values for job arrays
 * @param pa - array to update
 * @param old_state - the job's state before the event occurred
 * @param event - code for what event just happened
 * @param job_id - id of the job the event happened on
 * @param job_atr_hold - the job's hold attribute value
 * @param job_exit_status - the job's exit status (used for aeTerminate)
 */

void update_array_values(

  job_array            *pa,        /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event,     /* I */
  char                 *job_id,
  long                  job_atr_hold,
  int                   job_exit_status)

  {
  long moab_compatible;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      /* tally the finished job as a success or failure */

      if (job_exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */

      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      if (moab_compatible != FALSE)
        {
        /* only need to update if the job wasn't previously held */

        if ((job_atr_hold & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */

          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->job_ids[i] == NULL)
              continue;

            if (!strcmp(pa->job_ids[i], job_id))
              continue;

            if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
              {
              /* job no longer exists - drop the stale id */
              free(pa->job_ids[i]);
              pa->job_ids[i] = NULL;
              }
            else
              {
              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
                {
                pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

                if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                  {
                  /* no hold bits remain - clear the SET flag too */
                  pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                  }

                svr_evaljobstate(pj, &newstate, &newsub, 1);
                svr_setjobstate(pj, newstate, newsub, FALSE);
                job_save(pj, SAVEJOB_FULL, 0);

                unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);

                break;
                }

              unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
              }
            }
          }
        }

      break;

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);
  }  /* END update_array_values() */
/**
 * @brief
 *	post_routejob - clean up action for child started in net_move/send_job
 *	to "route" a job to another server
 * @par
 *	If route was successfull, delete job.
 * @par
 *	If route didn't work, mark destination not to be tried again for this
 *	job and call route again.
 *
 * @param[in]	pwt	- work task structure
 *
 * @return	none.
 */

static void post_routejob(struct work_task *pwt)
  {
  int   newstate;
  int   newsub;
  int   r;
  int   stat = pwt->wt_aux;
  job  *jobp = (job *)pwt->wt_parm2;

  if (jobp == NULL)
    {
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, "",
      "post_routejob failed, jobp NULL");
    return;
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    /* child died abnormally (signal) - treat as a fatal send */
    r = SEND_JOB_FATAL;
    (void)sprintf(log_buffer, msg_badexit, stat);
    (void)strcat(log_buffer, __func__);
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE,
      jobp->ji_qs.ji_jobid, log_buffer);
    }

  switch (r)
    {
    case SEND_JOB_OK:		/* normal return, job was routed */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      /*
       * If the server is configured to keep job history and the job
       * is created here, do not purge the job structure but save
       * it for history purpose. No need to check for sub-jobs as
       * sub jobs can not be routed.
       */
      if (svr_chk_history_conf())
        svr_setjob_histinfo(jobp, T_MOV_JOB);
      else
        job_purge(jobp);	/* need to remove server job struct */

      return;

    case SEND_JOB_FATAL:	/* permanent rejection (or signal) */

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* Job Delete in progress, just set to queued status */
        (void)svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);
        return;
        }

      add_dest(jobp);		/* else mark destination as bad */

      /* fall through */

    default:			/* try routing again */

      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      (void)svr_setjobstate(jobp, newstate, newsub);
      jobp->ji_retryok = 1;

      if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
        (void)job_abt(jobp, msg_routebad);
      else if (r != 0)
        (void)job_abt(jobp, msg_routexceed);

      break;
    }

  return;
  }
END_TEST

/*
 * svr_evaljobstate_test - exercises svr_evaljobstate() for the main job
 * situations: running, held, waiting on stage-in, a zeroed job, forced
 * re-queue, and exiting/rerun/completed jobs under forced re-evaluation.
 */
START_TEST(svr_evaljobstate_test)
  {
  struct job test_job;
  int state = 0;
  int substate = 0;

  /* case 1: evaluated state of a running job matches its stored state */
  memset(&test_job, 0, sizeof(test_job));
  test_job.ji_qs.ji_state = JOB_STATE_RUNNING;

  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 1");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 1");

  /* case 2: job with a hold bit set */
  memset(&test_job, 0, sizeof(test_job));
  test_job.ji_wattr[JOB_ATR_hold].at_val.at_long = 1;

  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 2");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 2");

  /* case 3: job with the stagein attribute flagged */
  memset(&test_job, 0, sizeof(test_job));
  test_job.ji_wattr[JOB_ATR_stagein].at_flags = 1;

  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 3");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 3");

  /* case 4: zeroed job, no forced re-evaluation */
  memset(&test_job, 0, sizeof(test_job));

  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_setjobstate state fail case 4");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_setjobstate substate fail case 4");

  /* case 5: forced re-evaluation of a zeroed job queues it */
  memset(&test_job, 0, sizeof(test_job));

  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(JOB_STATE_QUEUED == state, "svr_setjobstate state fail case 5");
  fail_unless(JOB_SUBSTATE_QUEUED == substate, "svr_setjobstate substate fail case 5");

  int old_state;
  int old_substate;

  /* an exiting job keeps its state even when forced */
  test_job.ji_qs.ji_state = JOB_STATE_EXITING;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
  old_state = test_job.ji_qs.ji_state;
  old_substate = test_job.ji_qs.ji_substate;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(old_state == state);
  fail_unless(old_substate == substate);

  /* an exiting job in RERUN3 is re-queued when forced */
  test_job.ji_qs.ji_state = JOB_STATE_EXITING;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
  old_state = test_job.ji_qs.ji_state;
  old_substate = test_job.ji_qs.ji_substate;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(state == JOB_STATE_QUEUED);
  fail_unless(substate == JOB_SUBSTATE_QUEUED);

  /* a completed job keeps its state */
  test_job.ji_qs.ji_state = JOB_STATE_COMPLETE;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_COMPLETE;
  old_state = test_job.ji_qs.ji_state;
  old_substate = test_job.ji_qs.ji_substate;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(old_state == state);
  fail_unless(old_substate == substate);
  }
/*
 * req_holdjob - service the Hold Job batch request.
 *
 *	Decodes the requested hold types, checks privilege, and sets the
 *	hold bits on the job.  If the job is running and has checkpointing
 *	enabled, a duplicate of the request is relayed to MOM so she can
 *	checkpoint the job; otherwise the job state is re-evaluated and the
 *	request acknowledged.
 *
 * @param vp - the Hold Job batch request (as void *)
 * @return PBSE_NONE always; errors are reported via req_reject()
 */

int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* apply the requested hold bits, remembering the old value so they
   * can be backed out if MOM rejects the hold */

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always have the server suffix attached
    ** which is dropped when the server attribute
    ** 'display_job_server_suffix' is FALSE and so will in the MOM's.
    ** Therefore, it must be passed as the server to the MOM so she can
    ** find it to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid,
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
      snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname,
        sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s",
        pjob->ji_qs.ji_jobid);

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      /* NOTE(review): rc is 0 (PBSE_NONE) at this point, so the request
       * is rejected with no error code; PBSE_SYSTEM looks intended -
       * confirm before changing. */
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

        job_save(pjob, SAVEJOB_QUICK, 0);

        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }

#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }

#endif
  else
    {
    /* everything went well, may need to update the job state */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
/*
 * req_holdjob - service the Hold Job batch request.
 *
 *	Decodes the requested hold types, checks privilege, sets the hold
 *	bits and records the hold time in the job comment.  For a running
 *	checkpointable job the request is relayed to MOM so she can
 *	checkpoint it; otherwise the job state is re-evaluated and the
 *	request acknowledged.
 *
 *	NOTE(review): 'temphold' is not declared in this function; it is
 *	presumably the file-scope attribute filled in by get_hold() - verify
 *	against the rest of the file.
 *
 * @param preq - the Hold Job batch request; a reply is always sent
 */

void req_holdjob(struct batch_request *preq)
  {
  long      *hold_val;
  int        jt;            /* job type */
  int        newstate;
  int        newsub;
  long       old_hold;
  job       *pjob;
  char      *pset;
  int        rc;
  char       date[32];
  time_t     now;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq, &jt);

  if (pjob == (job *)0)
    return;

  /* a subjob or range of an array job cannot be held individually */

  if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob))
    {
    req_reject(PBSE_IVALREQ, 0, preq);
    return;
    }

  /* a provisioning job may not be held */

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION))
    {
    req_reject(PBSE_BADSTATE, 0, preq);
    return;
    }

  /* cannot do anything until we decode the holds to be set */

  if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0)
    {
    req_reject(rc, 0, preq);
    return;
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq);
    return;
    }

  /* HOLD_bad_password can only be done by root or admin */
#ifdef WIN32
  if ( (temphold.at_val.at_long & HOLD_bad_password) && \
       !isAdminPrivilege(preq->rq_user) )
#else
  if ( (temphold.at_val.at_long & HOLD_bad_password) && \
       strcasecmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0 )
#endif
    {
    req_reject(PBSE_PERM, 0, preq);
    return;
    }

  /* apply the requested hold bits, remembering the old value */

  hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE;

  /* Note the hold time in the job comment. */

  now = time(NULL);
  (void)strncpy(date, (const char *)ctime(&now), 24);
  date[24] = '\0';
  (void)sprintf(log_buffer, "Job held by %s on %s", preq->rq_user, date);
  job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
    (char *)0, (char *)0, log_buffer);

  (void)sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
    preq->rq_host);

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      (pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN) &&
      (pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str) &&
      (*pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str != 'n'))
    {
    /* have MOM attempt checkpointing */

    if ((rc = relay_to_mom(pjob, preq, post_hold)) != 0)
      {
      *hold_val = old_hold;	/* reset to the old value */
      req_reject(rc, 0, preq);
      }
    else
      {
      /* relay worked - mark the job for rerun after checkpoint */
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
      (void)job_save(pjob, SAVEJOB_QUICK);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
        pjob->ji_qs.ji_jobid, log_buffer);
      }
    }
  else
    {
    /* every thing went well, may need to update the job state */

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
      pjob->ji_qs.ji_jobid, log_buffer);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed */
      pjob->ji_modified = 1;

      svr_evaljobstate(pjob, &newstate, &newsub, 0);
      (void)svr_setjobstate(pjob, newstate, newsub);
      }

    reply_ack(preq);
    }
  }
/*
 * release_job - releases the hold on job j
 *
 * Decodes the hold types from the request, checks privilege, clears the
 * hold bits, corrects array slot-limit counts if the job belongs to an
 * array, and re-evaluates the job state if the hold value changed.
 *
 * @param preq - the batch request carrying the hold attribute to release
 * @param j - the job to modify
 * @param pa - a pointer to an array whose mutex we hold - always this job's array
 * @return 0 if successful, a PBS error on failure
 */

int release_job(

  struct batch_request *preq, /* I */
  void                 *j,    /* I/O */
  job_array            *pa)   /* I */

  {
  long           old_hold;
  int            rc = PBSE_NONE;
  int            newstate;
  int            newsub;
  char          *pset;
  job           *pjob = (job *)j;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  pbs_attribute  temphold;

  // this function is meaningless for jobs in exiting or completed
  if (pjob->ji_qs.ji_state > JOB_STATE_RUNNING)
    return(PBSE_NONE);

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0)
    {
    return(rc);
    }

  /* if other than HOLD_u is being released, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    return(rc);
    }

  /* unset the hold */

  old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR)))
    {
    return(rc);
    }

  if (pjob->ji_arraystructid[0] != '\0')
    {
    // Make sure our slot limit counts are correct
    check_array_slot_limits(pjob, pa);
    }

  /* everything went well, if holds changed, update the job state */

  if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long)
    {
    pjob->ji_modified = 1; /* indicates attributes changed */

    svr_evaljobstate(*pjob, newstate, newsub, 0);

    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */
    }

  sprintf(log_buf, msg_jobholdrel, pset, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  return(rc);
  }  /* END release_job() */
/*
 * req_releasejob - service the Release Job batch request
 *
 * Clears the requested hold types on a normal job or an array job parent,
 * re-evaluates the job state when the hold value changed, frees the
 * "held by" comment once no holds remain, and acks or rejects the request.
 *
 * NOTE(review): the decoded hold value lives in a file-scope "temphold"
 * attribute filled in by get_hold() below - declared outside this chunk,
 * confirm.
 */

void req_releasejob(struct batch_request *preq)
  {
  int   jt;       /* job type */
  int   newstate;
  int   newsub;
  long  old_hold;
  job  *pjob;
  char *pset;
  int   rc;

  pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq, &jt);

  if (pjob == (job *)0)
    return;

  /* only a whole job or a whole array job may be released here */

  if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob))
    {
    req_reject(PBSE_IVALREQ, 0, preq);
    return;
    }

  /* cannot do anything until we decode the holds to be set */

  if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0)
    {
    req_reject(rc, 0, preq);
    return;
    }

  /* if other than HOLD_u is being released, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq);
    return;
    }

  /* all ok so far, unset the hold */

  old_hold = pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;

  rc = job_attr_def[(int)JOB_ATR_hold].
       at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR);

  if (rc)
    {
    req_reject(rc, 0, preq);
    return;
    }

  /* every thing went well, if holds changed, update the job state */
  /* NOTE: the #ifndef/#ifdef pair keeps the brace count balanced - the
   * NAS localmod 105 build always enters the block and resets etime */

#ifndef NAS /* localmod 105 Always reset etime on release */
  if (old_hold != pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long)
    {
#endif /* localmod 105 */
#ifdef NAS /* localmod 105 */
    {
    attribute *etime = &pjob->ji_wattr[(int)JOB_ATR_etime];

    etime->at_val.at_long = time_now;
    etime->at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;
#endif /* localmod 105 */
    pjob->ji_modified = 1; /* indicates attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);

    (void)svr_setjobstate(pjob, newstate, newsub); /* saves job */
    }

  /* hold fully cleared: drop the "Job held by ..." comment set at hold time */

  if (pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long == 0)
    job_attr_def[(int)JOB_ATR_Comment].at_free(&pjob->ji_wattr[(int)JOB_ATR_Comment]);

  (void)sprintf(log_buffer, msg_jobholdrel, pset, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer);

  reply_ack(preq);
  }

/**
 * @brief
 * get_hold - search a list of attributes (svrattrl) for the hold-types
 * attribute.  This is used by the Hold Job and Release Job request,
 * therefore it is an error if the hold-types attribute is not present,
 * or there is more than one.
 *
 * Decode the hold attribute into temphold (file scope).
 *
 * @param[in]  phead - pbs list head of the request's attribute list.
 * @param[out] pset  - RETURN - ptr to the hold-letter string value.
 *
 * @return error code (PBSE_IVALREQ when the hold attribute is missing,
 *         duplicated, or mixed with any other attribute; otherwise the
 *         result of at_decode)
 */

static int get_hold(pbs_list_head *phead, char **pset)
  {
  int have_one = 0;

  struct svrattrl *holdattr = (struct svrattrl*)0;
  struct svrattrl *pal;

  pal = (struct svrattrl *)GET_NEXT((*phead));

  while (pal)
    {
    if (!strcasecmp(pal->al_name, job_attr_def[(int)JOB_ATR_hold].at_name))
      {
      holdattr = pal;

      *pset = pal->al_value;

      have_one++;
      }
    else
      {
      /* any attribute other than "hold" invalidates the whole request */
      return (PBSE_IVALREQ);
      }

    pal = (struct svrattrl *)GET_NEXT(pal->al_link);
    }

  if (have_one != 1)
    return (PBSE_IVALREQ);

  /* decode into temporary attribute structure */

  clear_attr(&temphold, &job_attr_def[(int)JOB_ATR_hold]);

  return (job_attr_def[(int)JOB_ATR_hold].at_decode(
            &temphold,
            holdattr->al_name,
            (char *)0,
            holdattr->al_value));
  }
/**
 * update_array_values()
 *
 * updates internal bookkeeping values for job arrays
 *
 * @param pa - array to update
 * @param j - the job that the event happened on (a job *)
 * @param old_state - the job's state before the event
 * @param event - code for what event just happened
 */

void update_array_values(

  job_array            *pa,        /* I */
  void                 *j,         /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event)     /* I */

  {
  job *pjob = (job *)j;
  int  exit_status;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

      break;

    case aeRun:

      /* count the job as started unless it was already running */

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      /* tally success vs. failure by the job's exit status */

      if (exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */

      if (server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE)
        {
        /* only need to update if the job wasn't previously held */

        if ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */

          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->jobs[i] == NULL)
              continue;

            pj = (job *)pa->jobs[i];

            if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              /* no hold bits remain: the attribute is no longer set */

              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(pj, &newstate, &newsub, 1);
              svr_setjobstate(pj, newstate, newsub);
              job_save(pj, SAVEJOB_FULL, 0);

              break;
              }
            }
          }
        }

      break;

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);
  }  /* END update_array_values() */
/*
 * finish_routing_processing - handle the disposition of a routed job
 *
 * On success the server-side job structure is purged (the job now lives
 * elsewhere); on a permanent rejection the destination is marked bad and
 * routing is retried; a failure while a delete is in progress simply
 * requeues the job.
 *
 * @param pjob - the routed job (mutex held on entry; released, purged or
 *               aborted on every path)
 * @param status - LOCUTION_* result of the routing attempt
 */

void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int newstate;
  int newsub;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  switch (status)
    {
    case LOCUTION_SUCCESS:

      /* normal return, job was routed */

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob); /* may free/NULL pjob - checked below */

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);

        if (pjob != NULL)
          svr_job_purge(pjob); /* need to remove server job struct */
        }

      break;

    case LOCUTION_FAIL:

      /* permanent rejection (or signal) */

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

        svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return;
        }

      add_dest(pjob);  /* else mark destination as bad */

      /* fall through */

    default:

      /* try routing again */

      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      /* force re-eval of job state out of Transit */

      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);

      if ((status = job_route(pjob)) == PBSE_ROUTEREJ)
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
      else if (status != 0)
        job_abt(&pjob, msg_routexceed);
      else
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

      break;
    }  /* END switch (status) */

  return;
  }  /* END finish_routing_processing() */
/*
 * modify_job - modify the attributes of a job atomically
 *
 * Verifies the job may be modified in its current state (running jobs
 * accept only ATR_DFLAG_ALTRUN attributes/resources), applies the
 * attribute changes, resets default resource limits, updates the job
 * state or saves the job, and relays the change to the mom when a running
 * job's resources changed or checkpoint files must be copied.
 *
 * @param j - pointer to the job pointer (NULLed on PBSE_JOBNOTFOUND)
 * @param plist - list of attributes to change
 * @param preq - the client request (perms, and relayed to the mom)
 * @param checkpoint_req - CHK_HOLD/CHK_CONT checkpoint action, if any
 * @param flag - NO_MOM_RELAY suppresses the relay (caller will do it)
 * @return PBSE_NONE, PBSE_RELAYED_TO_MOM when waiting on the mom, or a
 *         PBS error code
 */

int modify_job(

  void                 **j,              /* O */
  svrattrl              *plist,          /* I */
  struct batch_request  *preq,           /* I */
  int                    checkpoint_req, /* I */
  int                    flag)           /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;

  if (pjob == NULL)
    {
    sprintf(log_buf, "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {
      sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */

      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }
      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
   * if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
   *     ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
   *     ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
   */

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }
      /*
       * else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
       *   {
       *   sendmom = 1;
       *   }
       */

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  bad = 0;

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */

    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */

      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        /* NOTE(review): copy failure is silently ignored here and the
         * function still returns PBSE_RELAYED_TO_MOM below even though
         * nothing was relayed - confirm intended */
        }
      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      else if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;

    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */

      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */

      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        return(PBSE_NONE);  /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  }  /* END modify_job() */
/*
 * finalize_rerunjob - finish servicing a rerun request after the requeue
 * attempt has completed
 *
 * Updates the job's substate according to the requeue result rc, handles
 * the RERUNFORCE extension by forcibly requeueing a job whose mom cannot
 * be reached, clears the checkpoint-related server flags, acks the
 * request and writes an accounting record.
 *
 * @param preq - the rerun batch request (acked or rejected here)
 * @param pjob - the job being rerun (its mutex is taken and managed here)
 * @param rc - result of the requeue attempt
 * @return the final PBS error code (also sent to the client on reject)
 */

int finalize_rerunjob(

  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int  Force;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* RERUNFORCE in the request extension means requeue even if the mom
   * cannot be contacted */

  if (preq->rq_extend &&
      !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {
    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */

      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/

      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;

      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;

        /* NOTE(review): the at_str passed to parse_servername() is not
         * checked for NULL on the exec_host path - confirm a job reaching
         * here always has exec_host set */

        if ((cray_enabled == true) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
          tmp, rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        strcat(log_buf, ", previous output files may be lost");

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob); /* free resc assigned to job */

        pjob->ji_modified = 1;    /* force full job save */

        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  /* the job has run at least once; clear the checkpoint flags */

  pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
      ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
        JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

  sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  reply_ack(preq);

  /* note in accounting file */

  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END finalize_rerunjob() */
/*
 * post_routejob - work-task handler run when the child that attempted to
 * route a job has exited
 *
 * Examines the child's wait() status: on success the local job structure
 * is purged; on a permanent rejection the destination is marked bad;
 * otherwise the job state is re-evaluated out of Transit and routing is
 * attempted again.
 *
 * @param pwt - work task; wt_aux holds the child's wait() status and
 *              wt_parm1 the job pointer
 */

static void post_routejob(

  struct work_task *pwt)

  {
  int   newstate;
  int   newsub;
  int   r;
  int   stat = pwt->wt_aux;
  char *id = "post_routejob";
  job  *jobp = (job *)pwt->wt_parm1;

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    /* child did not exit normally; treated as "try routing again" below */

    r = 2;

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {
    case 0:  /* normal return, job was routed */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
        remove_checkpoint(jobp);

      job_purge(jobp); /* need to remove server job struct */

      return;

      /*NOTREACHED*/

      break;

    case 1:  /* permanent rejection (or signal) */

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */

        svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

        return;
        }

      add_dest(jobp);  /* else mark destination as bad */

      /* fall through */

    default:  /* try routing again */

      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);

      if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
        job_abt(&jobp, pbse_to_txt(PBSE_ROUTEREJ));
      else if (r != 0)
        job_abt(&jobp, msg_routexceed);

      break;
    }  /* END switch (r) */

  return;
  }  /* END post_routejob() */
/*
 * process_hold_reply - work-task handler run when the mom replies to a
 * hold/checkpoint request relayed for a running job
 *
 * If the mom rejected the request, the hold that was already applied on
 * the server side is removed again and the job state re-evaluated; on
 * success the checkpoint flags are recorded on the job, an accounting
 * record is written, and the client request is acknowledged.
 *
 * @param pwt - work task; wt_parm1 holds the original batch request and
 *              wt_event the connection to the mom
 */

static void process_hold_reply(

  struct work_task *pwt)

  {
  job       *pjob;

  struct batch_request *preq;

  int        newstate;
  int        newsub;
  attribute  temphold;
  char      *pset;
  int        rc;

  svr_disconnect(pwt->wt_event);  /* close connection to MOM */

  preq = pwt->wt_parm1;
  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname)) == (job *)0)
    {
    /* the job disappeared while the request was at the mom */

    LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else if (preq->rq_reply.brp_code != 0)
    {
    /* mom rejected the hold: undo the hold already set on the server */

    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold],
           &temphold,
           DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

    pjob->ji_modified = 1;    /* indicate attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buffer, msg_mombadhold, preq->rq_reply.brp_code);

      LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer);
      }
    else
      {
      /* mom doesn't support checkpoint: the hold was undone, but the
       * request is still acknowledged */

      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
     * And if these flags are not set, start_exec will not try to run the job from
     * the checkpoint image file.
     */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1;    /* indicate attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */

    reply_ack(preq);
    }
  }
/*
 * finish_moving_processing - conclude servicing a job move request
 *
 * On LOCUTION_SUCCESS the local copy of the job is purged (possibly after
 * removing staged-in files and a copied checkpoint) and the request is
 * acked; on any other status the move is treated as a route rejection and
 * the job state is re-evaluated out of Transit.
 *
 * @param pjob - the moved job (may be freed/NULLed by the helpers called)
 * @param req - the PBS_BATCH_MoveJob request being serviced
 * @param status - LOCUTION_* result of the move
 */

void finish_moving_processing(

  job                  *pjob,
  struct batch_request *req,
  int                   status)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  int newstate;
  int newsub;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buf, "bad request type %d\n", req->rq_type);
    log_err(-1, __func__, log_buf);

    return;
    }

  if (pjob == NULL)
    return;

  switch (status)
    {
    case LOCUTION_SUCCESS:

      /* purge server's job structure */

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob); /* may free/NULL pjob - checked below */

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);
        }

      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf), msg_manager,
        req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      if (pjob != NULL)
        svr_job_purge(pjob);

      reply_ack(req);

      break;

    default:

      status = PBSE_ROUTEREJ;

      if (pjob != NULL)
        {
        /* force re-eval of job state out of Transit */

        svr_evaljobstate(*pjob, newstate, newsub, 1);
        svr_setjobstate(pjob, newstate, newsub, FALSE);

        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        }

      req_reject(status, 0, req, NULL, NULL);

      break;
    }  /* END switch (status) */
  }  /* END finish_moving_processing() */
/**
 * @brief
 * post_movejob - clean up action for child started in net_move/send_job
 * to "move" a job to another server
 * @par
 * If the move was successful, delete the server's copy of the job
 * structure (or keep it for history when so configured), and reply to
 * the request.
 * @par
 * If the route didn't work, re-evaluate the job state out of Transit and
 * reject the request.
 *
 * @param[in] pwt - work task structure; wt_parm1 is the move request,
 *                  wt_parm2 the expected job pointer, wt_aux the child's
 *                  wait() status
 *
 * @return none.
 */

static void post_movejob(struct work_task *pwt)
  {
  char *id = "post_movejob";

  struct batch_request *req;

  int    newstate;
  int    newsub;
  int    stat;
  int    r;
  job   *jobp;

  req  = (struct batch_request *)pwt->wt_parm1;
  stat = pwt->wt_aux;

  pbs_errno = PBSE_NONE;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buffer, "bad request type %d", req->rq_type);
    log_err(-1, __func__, log_buffer);

    return;
    }

  jobp = find_job(req->rq_ind.rq_move.rq_jid);

  /* NOTE(review): the mismatch is only logged; jobp may be NULL yet is
   * dereferenced in the branches below - confirm find_job can only fail
   * when the child also failed */

  if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm2))
    {
    sprintf(log_buffer, "job %s not found", req->rq_ind.rq_move.rq_jid);
    log_err(-1, __func__, log_buffer);
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);

    if (r == SEND_JOB_OK)
      {
      /* purge server's job structure */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      (void)strcpy(log_buffer, msg_movejob);
      (void)sprintf(log_buffer + strlen(log_buffer), msg_manager,
        req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      /*
       * If server is configured to keep job history info and
       * the job is created here, then keep the job structure
       * for history purpose without purging. No need to check
       * for sub-jobs as sub jobs can't be moved.
       */

      if (svr_chk_history_conf())
        svr_setjob_histinfo(jobp, T_MOV_JOB);
      else
        job_purge(jobp);
      }
    else
      r = PBSE_ROUTEREJ;
    }
  else
    {
    /* child terminated abnormally (e.g. by a signal) */

    r = PBSE_SYSTEM;

    (void)sprintf(log_buffer, msg_badexit, stat);
    (void)strcat(log_buffer, __func__);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
      LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer);
    }

  if (r)
    {
    if (jobp)
      {
      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);
      }

    req_reject(r, 0, req);
    }
  else
    reply_ack(req);

  return;
  }
/*
 * execute_job_delete - carry out a job delete request
 *
 * Handles every deletion path: requeues the delete while the job is in a
 * PRERUN/RERUN* transition, signals the mom for a running job (with a
 * delete "nanny" task as a safety net), releases a sibling array job's
 * slot-limit hold when Moab-array compatibility is configured, and
 * finally moves the job to EXITING, aborts it, or completes it depending
 * on its checkpoint/stage-in flags.
 *
 * @param pjob - the job to delete (mutex held on entry; released or
 *               consumed on all paths)
 * @param Msg - optional text from the request extension (may be cleared
 *              here to avoid a redundant email in job_abt())
 * @param preq - the delete request; acked/rejected here or by the
 *               mom-reply handler
 * @return PBSE_NONE on success, -1 on handled failure/async completion,
 *         or ROUTE_DELETE when the delete was requeued as a timed task
 */

int execute_job_delete(

  job                  *pjob, /* M */
  char                 *Msg,  /* I */
  struct batch_request *preq) /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */

  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */

  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        /* skip the job being deleted itself */

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          /* stale id: the sibling no longer exists, drop it from the array */

          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            /* release the first slot-limit-held sibling found */

            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_unlock(pa->ai_mutex);
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */

    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */

    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;

    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
void req_releasejob( struct batch_request *preq) /* ptr to the decoded request */ { int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; attribute temphold; pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq); if (pjob == NULL) { return; } /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* if other than HOLD_u is being released, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* all ok so far, unset the hold */ old_hold = pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; if ((rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR))) { req_reject(rc, 0, preq, NULL, NULL); return; } /* everything went well, if holds changed, update the job state */ if (old_hold != pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long) { pjob->ji_modified = 1; /* indicates attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ } sprintf(log_buffer, msg_jobholdrel, pset, preq->rq_user, preq->rq_host); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); reply_ack(preq); return; } /* END req_releasejob() */