/** * poll_job_task * * The invocation of this routine is triggered from * the pbs_server main_loop code. The check of * SRV_ATR_PollJobs appears to be redundant. */ void poll_job_task( struct work_task *ptask) { job *pjob; pjob = (job *)ptask->wt_parm1; if (pjob == NULL) { /* FAILURE */ return; } if (server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long && (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)) { stat_mom_job(pjob); } return; } /* END poll_job_task() */
/** * poll _job_task * * The invocation of this routine is triggered from * the pbs_server main_loop code. The check of * SRV_ATR_PollJobs appears to be redundant. */ void poll_job_task( struct work_task *ptask) { char *job_id = (char *)ptask->wt_parm1; job *pjob; time_t time_now = time(NULL); long poll_jobs = 0; int job_state = -1; if (job_id != NULL) { pjob = svr_find_job(job_id, FALSE); if (pjob != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); job_state = pjob->ji_qs.ji_state; job_mutex.unlock(); get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs); if ((poll_jobs) && (job_state == JOB_STATE_RUNNING)) { /* we need to throttle the number of outstanding threads are doing job polling. This prevents a problem where pbs_server gets hung waiting on I/O from the mom */ pthread_mutex_lock(poll_job_task_mutex); if (current_poll_job_tasks < max_poll_job_tasks) { current_poll_job_tasks++; pthread_mutex_unlock(poll_job_task_mutex); stat_mom_job(job_id); pthread_mutex_lock(poll_job_task_mutex); current_poll_job_tasks--; } pthread_mutex_unlock(poll_job_task_mutex); /* add another task */ set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE); } } free(job_id); } free(ptask->wt_mutex); free(ptask); } /* END poll_job_task() */
/** * poll_job_task * * The invocation of this routine is triggered from * the pbs_server main_loop code. */ void poll_job_task( struct work_task *ptask) { char *job_id = (char *)ptask->wt_parm1; job *pjob; time_t time_now = time(NULL); long poll_jobs = 0; long job_stat_rate; free(ptask->wt_mutex); free(ptask); if (job_id != NULL) { pjob = svr_find_job(job_id, FALSE); if (pjob != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); int job_state = -1; job_state = pjob->ji_qs.ji_state; // only do things for running jobs if (job_state == JOB_STATE_RUNNING) { job_mutex.unlock(); get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate); if (time(NULL) - pjob->ji_last_reported_time > job_stat_rate) { get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs); if (poll_jobs) stat_mom_job(job_id); } /* add another task */ set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE); } } free(job_id); } } /* END poll_job_task() */
/** * poll _job_task * * The invocation of this routine is triggered from * the pbs_server main_loop code. */ void poll_job_task( struct work_task *ptask) { char *job_id = (char *)ptask->wt_parm1; job *pjob; time_t time_now = time(NULL); int job_state = -1; char log_buf[LOCAL_LOG_BUF_SIZE]; if (job_id != NULL) { pjob = svr_find_job(job_id, FALSE); if (pjob != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); job_state = pjob->ji_qs.ji_state; job_mutex.unlock(); if (job_state == JOB_STATE_RUNNING) { /* we need to throttle the number of outstanding threads are doing job polling. This prevents a problem where pbs_server gets hung waiting on I/O from the mom */ pthread_mutex_lock(poll_job_task_mutex); if (current_poll_job_tasks < max_poll_job_tasks) { if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str)) { pthread_mutex_unlock(poll_job_task_mutex); snprintf(log_buf, sizeof(log_buf), "Job %s missing MOM's information. Skipping polling on this job", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } else { current_poll_job_tasks++; pthread_mutex_unlock(poll_job_task_mutex); stat_mom_job(job_id); pthread_mutex_lock(poll_job_task_mutex); current_poll_job_tasks--; } } pthread_mutex_unlock(poll_job_task_mutex); /* add another task */ set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE); } } free(job_id); } free(ptask->wt_mutex); free(ptask); } /* END poll_job_task() */
static void post_sendmom( struct work_task *pwt) /* I */ { char *id = "post_sendmom"; int newstate; int newsub; int r; int stat; job *jobp = (job *)pwt->wt_parm1; struct batch_request *preq = (struct batch_request *)pwt->wt_parm2; char *MOMName = NULL; int jindex; long DTime = time_now - 10000; if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, "entering post_sendmom"); } stat = pwt->wt_aux; if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); } else { r = 2; /* cannot get child exit status */ sprintf(log_buffer, msg_badexit, stat); strcat(log_buffer, id); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } /* maintain local struct to associate job id with dispatch time */ for (jindex = 0;jindex < 20;jindex++) { if (DispatchJob[jindex] == jobp) { DTime = DispatchTime[jindex]; DispatchJob[jindex] = NULL; MOMName = DispatchNode[jindex]; break; } } if (LOGLEVEL >= 1) { sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d", (r == 0) ? "success" : "failure", time_now - DTime, (MOMName != NULL) ? MOMName : "???", r); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } switch (r) { case 0: /* send to MOM went ok */ jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART; if (preq != NULL) reply_ack(preq); /* record start time for accounting */ jobp->ji_qs.ji_stime = time_now; /* update resource usage attributes */ set_resc_assigned(jobp, INCR); if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) { /* may be EXITING if job finished first */ svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING); /* above saves job structure */ } /* accounting log for start or restart */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint"); else account_jobstr(jobp); /* if any dependencies, see if action required */ if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET) depend_on_exec(jobp); /* * it is unfortunate, but while the job has gone into execution, * there is no way of obtaining the session id except by making * a status request of MOM. (Even if the session id was passed * back to the sending child, it couldn't get up to the parent.) */ jobp->ji_momstat = 0; stat_mom_job(jobp); break; case 10: /* NOTE: if r == 10, connection to mom timed out. Mark node down */ stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0); /* send failed, requeue the job */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, "unable to run job, MOM rejected/timeout"); free_nodes(jobp); if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT) { if (preq != NULL) req_reject(PBSE_MOMREJECT, 0, preq, MOMName, "connection to mom timed out"); svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } else { if (preq != NULL) req_reject(PBSE_BADSTATE, 0, preq, MOMName, "job was aborted by mom"); } break; case 1: /* commit failed */ default: { int JobOK = 0; /* send failed, requeue the job */ sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d", r); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); free_nodes(jobp); if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT) { if (preq != NULL) { char tmpLine[1024]; if (preq->rq_reply.brp_code == PBSE_JOBEXIST) { /* job already running, start request failed but return success since * desired behavior (job is running) is accomplished */ JobOK = 1; } else { sprintf(tmpLine, "cannot send job to %s, state=%s", (MOMName != NULL) ? MOMName : "mom", PJobSubState[jobp->ji_qs.ji_substate]); req_reject(PBSE_MOMREJECT, 0, preq, MOMName, tmpLine); } } if (JobOK == 1) { /* do not re-establish accounting - completed first time job was started */ /* update mom-based job status */ jobp->ji_momstat = 0; stat_mom_job(jobp); } else { svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } } else { if (preq != NULL) req_reject(PBSE_BADSTATE, 0, preq, MOMName, "send failed - abort"); } break; } } /* END switch (r) */ return; } /* END post_sendmom() */