/** * @brief * post_run_depend - this function is called via a work task when a * register dependency "after" is received for a job that is already * running. We accept the dependency and then turn around and send a * "release" back to the newly registered job to remove its hold. * * param[in] pwt - work task */ static void post_run_depend(struct work_task *pwt) { job *pjob; pjob = (job *)pwt->wt_parm1; if (pjob->ji_wattr[(int)JOB_ATR_depend].at_flags&ATR_VFLAG_SET) (void)depend_on_exec(pjob); return; }
static void post_sendmom( struct work_task *pwt) /* I */ { char *id = "post_sendmom"; int newstate; int newsub; int r; int stat; job *jobp = (job *)pwt->wt_parm1; struct batch_request *preq = (struct batch_request *)pwt->wt_parm2; char *MOMName = NULL; int jindex; long DTime = time_now - 10000; if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, "entering post_sendmom"); } stat = pwt->wt_aux; if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); } else { r = 2; /* cannot get child exit status */ sprintf(log_buffer, msg_badexit, stat); strcat(log_buffer, id); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } /* maintain local struct to associate job id with dispatch time */ for (jindex = 0;jindex < 20;jindex++) { if (DispatchJob[jindex] == jobp) { DTime = DispatchTime[jindex]; DispatchJob[jindex] = NULL; MOMName = DispatchNode[jindex]; break; } } if (LOGLEVEL >= 1) { sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d", (r == 0) ? "success" : "failure", time_now - DTime, (MOMName != NULL) ? MOMName : "???", r); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } switch (r) { case 0: /* send to MOM went ok */ jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART; if (preq != NULL) reply_ack(preq); /* record start time for accounting */ jobp->ji_qs.ji_stime = time_now; /* update resource usage attributes */ set_resc_assigned(jobp, INCR); if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) { /* may be EXITING if job finished first */ svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING); /* above saves job structure */ } /* accounting log for start or restart */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint"); else account_jobstr(jobp); /* if any dependencies, see if action required */ if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET) depend_on_exec(jobp); /* * it is unfortunate, but while the job has gone into execution, * there is no way of obtaining the session id except by making * a status request of MOM. (Even if the session id was passed * back to the sending child, it couldn't get up to the parent.) */ jobp->ji_momstat = 0; stat_mom_job(jobp); break; case 10: /* NOTE: if r == 10, connection to mom timed out. Mark node down */ stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0); /* send failed, requeue the job */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, "unable to run job, MOM rejected/timeout"); free_nodes(jobp); if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT) { if (preq != NULL) req_reject(PBSE_MOMREJECT, 0, preq, MOMName, "connection to mom timed out"); svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } else { if (preq != NULL) req_reject(PBSE_BADSTATE, 0, preq, MOMName, "job was aborted by mom"); } break; case 1: /* commit failed */ default: { int JobOK = 0; /* send failed, requeue the job */ sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d", r); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); free_nodes(jobp); if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT) { if (preq != NULL) { char tmpLine[1024]; if (preq->rq_reply.brp_code == PBSE_JOBEXIST) { /* job already running, start request failed but return success since * desired behavior (job is running) is accomplished */ JobOK = 1; } else { sprintf(tmpLine, "cannot send job to %s, state=%s", (MOMName != NULL) ? MOMName : "mom", PJobSubState[jobp->ji_qs.ji_substate]); req_reject(PBSE_MOMREJECT, 0, preq, MOMName, tmpLine); } } if (JobOK == 1) { /* do not re-establish accounting - completed first time job was started */ /* update mom-based job status */ jobp->ji_momstat = 0; stat_mom_job(jobp); } else { svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } } else { if (preq != NULL) req_reject(PBSE_BADSTATE, 0, preq, MOMName, "send failed - abort"); } break; } } /* END switch (r) */ return; } /* END post_sendmom() */