END_TEST START_TEST(task_save_test) { int result = 0; struct task test_task; struct job test_job; const char *file_prefix = "prefix"; memset(&test_task, 0, sizeof(test_task)); memset(&test_job, 0, sizeof(test_job)); result = task_save(NULL); fail_unless(result == PBSE_BAD_PARAMETER, "NULL input fail"); result = task_save(&test_task); fail_unless(result == PBSE_BAD_PARAMETER, "NULL pointer to owning job fail"); test_task.ti_job = &test_job; strncpy(test_job.ji_qs.ji_fileprefix, file_prefix, sizeof(test_job.ji_qs.ji_fileprefix) - 1); result = task_save(&test_task); fail_unless(result == -1, "task_save fail"); }
END_TEST START_TEST(task_save_test) { int result = 0; struct task test_task; struct job test_job; const char *file_prefix = "prefix"; memset(&test_task, 0, sizeof(test_task)); memset(&test_job, 0, sizeof(test_job)); result = task_save(NULL); fail_unless(result == PBSE_BAD_PARAMETER, "NULL input fail"); result = task_save(&test_task); fail_unless(result == PBSE_BAD_PARAMETER, "NULL pointer to owning job fail"); strncpy(test_job.ji_qs.ji_fileprefix, file_prefix, sizeof(test_job.ji_qs.ji_fileprefix) - 1); strcpy(test_task.ti_qs.ti_parentjobid, "jobid"); extern job *mock_mom_find_job_return; mock_mom_find_job_return = NULL; result = task_save(&test_task); fail_unless(result == -1, "task_save fail"); }
void scan_for_terminated(void) { static char id[] = "scan_for_terminated"; int exiteval = 0; pid_t pid; job *pjob; task *ptask = NULL; int statloc; unsigned int momport = 0; int tcount; if (LOGLEVEL >= 7) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, "entered"); } /* update the latest intelligence about the running jobs; */ /* must be done before we reap the zombies, else we lose the info */ termin_child = 0; if (mom_get_sample() == PBSE_NONE) { pjob = (job *)GET_PRIOR(svr_alljobs); while (pjob != NULL) { mom_set_use(pjob); pjob = (job *)GET_PRIOR(pjob->ji_alljobs); } } /* Now figure out which task(s) have terminated (are zombies) */ /* NOTE: does a job's tasks include its epilog? */ while ((pid = waitpid(-1, &statloc, WNOHANG)) > 0) { pjob = (job *)GET_PRIOR(svr_alljobs); while (pjob != NULL) { /* * see if process was a child doing a special * function for MOM */ if (LOGLEVEL >= 7) { snprintf(log_buffer, 1024, "checking job w/subtask pid=%d (child pid=%d)", pjob->ji_momsubt, pid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } if (pid == pjob->ji_momsubt) { if (LOGLEVEL >= 7) { snprintf(log_buffer, 1024, "found match with job subtask for pid=%d", pid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } break; } /* look for task */ ptask = (task *)GET_NEXT(pjob->ji_tasks); /* locate task with associated process id */ tcount = 0; while (ptask != NULL) { if (ptask->ti_qs.ti_sid == pid) { if (LOGLEVEL >= 7) { snprintf(log_buffer, 1024, "found match with job task %d for pid=%d", tcount, pid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } break; } ptask = (task *)GET_NEXT(ptask->ti_jobtask); tcount++; } /* END while (ptask) */ if (ptask != NULL) { /* pid match located - break out of job loop */ break; } pjob = (job *)GET_PRIOR(pjob->ji_alljobs); } /* END while (pjob != NULL) */ if (pjob == NULL) { if (LOGLEVEL >= 1) { sprintf(log_buffer, "pid %d not tracked, 
exitcode=%d", pid, statloc); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } continue; } /* END if (pjob == NULL) */ if (WIFEXITED(statloc)) exiteval = WEXITSTATUS(statloc); else if (WIFSIGNALED(statloc)) exiteval = WTERMSIG(statloc) + 0x100; else exiteval = 1; if (pid == pjob->ji_momsubt) { /* PID matches job mom subtask */ /* NOTE: both ji_momsubt and ji_mompost normally set in routine preobit_reply() after epilog child is successfully forked */ if (pjob->ji_mompost != NULL) { if (pjob->ji_mompost(pjob, exiteval) == 0) { /* success */ pjob->ji_mompost = NULL; } } /* END if (pjob->ji_mompost != NULL) */ else { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "job has no postprocessing routine registered"); } /* clear mom sub-task */ pjob->ji_momsubt = 0; if(multi_mom) { momport = pbs_rm_port; } job_save(pjob, SAVEJOB_QUICK, momport); continue; } /* END if (pid == pjob->ji_momsubt) */ /* what happens if mom PID is reaped before subtask? */ if (LOGLEVEL >= 2) { sprintf(log_buffer, "pid %d harvested for job %s, task %d, exitcode=%d", pid, pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task, exiteval); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } /* where is job purged? How do we keep job from progressing in state until the obit is sent? */ kill_task(ptask, SIGKILL, 0); ptask->ti_qs.ti_exitstat = exiteval; ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); sprintf(log_buffer, "%s: job %s task %d terminated, sid=%d", id, pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); exiting_tasks = 1; } /* END while ((pid = waitpid(-1,&statloc,WNOHANG)) > 0) */ return; } /* END scan_for_terminated() */
/**
 * @brief
 *	Internal session cpu time decoding routine.
 *
 *	For each task of the job, adds up the cpu time of every sampled
 *	process (proc_info[] / proc_status[], nproc entries) whose session id
 *	equals the task's ti_sid.  A task's recorded ti_cput only ever grows:
 *	it is replaced when the freshly computed value is larger.  A task with
 *	ti_sid <= 1 is treated as dead and contributes its stored ti_cput.
 *	A live task for which no process was found is marked
 *	TI_STATE_EXITED with exit status 0, saved with task_save(), and
 *	exiting_tasks is set.
 *
 * @param[in] pjob - a job pointer.
 *
 * @return	ulong
 * @retval	sum of all cpu time consumed for all tasks executed by the job, in seconds,
 *		adjusted by cputfactor (per the original header; the adjustment
 *		is not part of the code shown here).
 *
 */
static unsigned long
cput_sum(job *pjob)
{
	ulong			cputime = 0;
	int			i;
	int			nps = 0;
	int			taskprocs;
	prstatus_t		*ps;
	prpsinfo_t		*pi;
	task			*ptask;
	ulong			tcput;

	for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
		ptask != NULL;
		ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {

		/* DEAD task */
		if (ptask->ti_qs.ti_sid <= 1) {
			cputime += ptask->ti_cput;
			continue;
		}

		tcput = 0;
		taskprocs = 0;
		for (i=0; i<nproc; i++) {
			pi = &proc_info[i];

			/* is this process part of any task? */
			if (ptask->ti_qs.ti_sid != pi->pr_sid)
				continue;

			nps++;
			taskprocs++;

			if (pi->pr_state == SZOMB) {
				/* use zombie's iff top process */
				if ((pi->pr_sid != pi->pr_pid) &&
					(pi->pr_ppid != mom_pid))
					continue;

				tcput += tv(pi->pr_time);
				DBPRT(("%s: task %08.8X ses %d pid %d "
					"(zomb) %lu\n", __func__,
					ptask->ti_qs.ti_task,
					pi->pr_sid, pi->pr_pid, tcput))
			} else {
				ps = &proc_status[i];
				tcput += (tv(ps->pr_utime) +
					tv(ps->pr_stime) +
					tv(ps->pr_cutime) +
					tv(ps->pr_cstime));
			}

			/*
			 * Report the pid through pi: ps is only assigned on the
			 * non-zombie branch, so it must not be read here.
			 */
			DBPRT(("%s: task %08.8X ses %d pid %d cputime %lu\n", __func__,
				ptask->ti_qs.ti_task,
				pi->pr_sid, pi->pr_pid, tcput))
		}

		/* never let a task's recorded cpu time go backwards */
		if (tcput > ptask->ti_cput)
			ptask->ti_cput = tcput;
		cputime += ptask->ti_cput;
		DBPRT(("%s: task %08.8X cput %lu total %lu\n", __func__,
			ptask->ti_qs.ti_task, ptask->ti_cput, cputime))

		if (taskprocs == 0) {
			/* no sampled process belongs to this task any more */
			sprintf(log_buffer,
				"no active process for task %8.8X",
				ptask->ti_qs.ti_task);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			ptask->ti_qs.ti_exitstat = 0;
			ptask->ti_qs.ti_status = TI_STATE_EXITED;
			if (pjob->ji_qs.ji_un.ji_momt.ji_exitstat >= 0)
				pjob->ji_qs.ji_un.ji_momt.ji_exitstat = 0;
			task_save(ptask);
			exiting_tasks = 1;
		}
	}
void scan_for_terminated(void) /* linux */ { int exiteval = 0; pid_t pid; job *pjob = NULL; task *ptask = NULL; int statloc; unsigned int momport = 0; #ifdef USESAVEDRESOURCES int update_stats = TRUE; #endif /* USESAVEDRESOURCES */ int tcount; if (LOGLEVEL >= 9) { log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, "entered"); } /* update the latest intelligence about the running jobs; */ /* must be done before we reap the zombies, else we lose the info */ termin_child = 0; if (mom_get_sample() == PBSE_NONE) { std::list<job *>::reverse_iterator iter; // get a list of jobs in start time order, first to last for (iter = alljobs_list.rbegin(); iter != alljobs_list.rend(); iter++) { pjob = *iter; if ((pjob->ji_stats_done == true) || (pjob->ji_qs.ji_state < JOB_STATE_RUNNING)) continue; #ifdef USESAVEDRESOURCES ptask = (task *)GET_NEXT(pjob->ji_tasks); /* ** check task with associated process id to see if we are recovering ** after a mom restart where process completed while we were gone */ while (ptask != NULL) { if (ptask->ti_flags & TI_FLAGS_RECOVERY) { if (LOGLEVEL >= 7) { snprintf(log_buffer, sizeof(log_buffer), "Found match for recovering job task for sid=%d", ptask->ti_qs.ti_sid); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } update_stats = FALSE; break; } ptask = (task *)GET_NEXT(ptask->ti_jobtask); } /* END while (ptask) */ if (update_stats) { mom_set_use(pjob); } #else mom_set_use(pjob); #endif /* USESAVEDRESOURCES */ } } /* Now figure out which task(s) have terminated (are zombies) */ /* NOTE: does a job's tasks include its epilog? 
*/ while ((pid = waitpid(-1, &statloc, WNOHANG)) > 0) { std::list<job *>::reverse_iterator iter; if (LOGLEVEL >= 8) { sprintf(log_buffer, "Child exited with pid: %d", pid); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } // get a list of jobs in start time order, first to last for (iter = alljobs_list.rbegin(); iter != alljobs_list.rend(); iter++) { pjob = *iter; /* * see if process was a child doing a special * function for MOM */ if (pjob->ji_momsubt != 0) { if (LOGLEVEL >= 9) { snprintf(log_buffer, sizeof(log_buffer), "Checking to see if exiting child pid '%d' is a match for special mom task with pid=%d", pid, pjob->ji_momsubt); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } if (pid == pjob->ji_momsubt) { if (LOGLEVEL >= 9) { snprintf(log_buffer, sizeof(log_buffer), "The exiting child is a match of special subtask with pid=%d for job %s", pid, pjob->ji_qs.ji_jobid); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } break; } } /* look for task */ ptask = (task *)GET_NEXT(pjob->ji_tasks); /* locate task with associated process id */ tcount = 0; while (ptask != NULL) { if ((ptask->ti_qs.ti_sid == pid) && (ptask->ti_qs.ti_status != TI_STATE_EXITED)) { if (LOGLEVEL >= 7) { snprintf(log_buffer, sizeof(log_buffer), "Exiting child matches job task %d for pid=%d", tcount, pid); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } break; } ptask = (task *)GET_NEXT(ptask->ti_jobtask); tcount++; } /* END while (ptask) */ // make sure the task is the top level task for the job to mark the job done if ((ptask != NULL) && (ptask->ti_qs.ti_parenttask == TM_NULL_TASK)) { /* pid match located - break out of job loop */ pjob->ji_stats_done = true; break; } } /* END while (pjob != NULL) */ if (WIFEXITED(statloc)) exiteval = WEXITSTATUS(statloc); else if (WIFSIGNALED(statloc)) exiteval = WTERMSIG(statloc) + 0x100; else exiteval = 1; if (pjob == NULL) { if (LOGLEVEL 
>= 1) { sprintf(log_buffer, "Child pid %d is not part of a job, statloc=%d, exitval=%d", pid, statloc, exiteval); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } continue; } /* END if (pjob == NULL) */ if (pid == pjob->ji_momsubt) { /* PID matches job mom subtask */ /* NOTE: both ji_momsubt and ji_mompost normally set in routine preobit_reply() after epilog child is successfully forked */ if (pjob->ji_mompost != NULL) { if (pjob->ji_mompost(pjob, exiteval) == 0) { /* success */ pjob->ji_mompost = NULL; } } /* END if (pjob->ji_mompost != NULL) */ else if (LOGLEVEL >= 8) // This is a debug statement { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "Job has no postprocessing routine registered"); } /* clear mom sub-task */ pjob->ji_momsubt = 0; if (multi_mom) { momport = pbs_rm_port; } job_save(pjob, SAVEJOB_QUICK, momport); continue; } /* END if (pid == pjob->ji_momsubt) */ if (ptask == NULL) continue; /* what happens if mom PID is reaped before subtask? */ if (LOGLEVEL >= 2) { sprintf(log_buffer, "pid %d harvested for job %s, task %d, exitcode=%d", pid, pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task, exiteval); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } /* where is job purged? How do we keep job from progressing in state until the obit is sent? */ kill_task(pjob, ptask, SIGKILL, 0); ptask->ti_qs.ti_exitstat = exiteval; ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); sprintf(log_buffer, "%s: job %s task %d terminated, sid=%d", __func__, pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); exiting_tasks = 1; } /* END while ((pid = waitpid(-1,&statloc,WNOHANG)) > 0) */ return; } /* END scan_for_terminated() */
/** * @brief wait_action * Wait for a task that has terminated or a socket that is ready to read. * Mark any terminated task as Exiting and do network processing on * any ready socket. * * @return void */ void wait_action(void) { static char id[] = "wait_action"; int rc = 0; int hNum = 0; HANDLE hArray[MAXIMUM_WAIT_OBJECTS+1] = {INVALID_HANDLE_VALUE}; HANDLE hProc = INVALID_HANDLE_VALUE; extern HANDLE hStop; /* mutex: quit when released */ int ecode = -1; job *pjob = NULL; task *ptask = NULL; int waittime = 500; extern int mom_run_state; struct work_task *p_wtask = NULL; HANDLE pid = INVALID_HANDLE_VALUE; /* Check for non job-related tasks like periodic hook tasks */ while (1) { if ((pid = waitpid((HANDLE)-1, &ecode, WNOHANG)) == (HANDLE)-1) { if (errno == EINTR) { continue; } else { break; } } else if (pid == 0) { break; } p_wtask = (struct work_task *)GET_NEXT(task_list_event); while (p_wtask) { if ((p_wtask->wt_type == WORK_Deferred_Child) && ((HANDLE)p_wtask->wt_event == pid)) { p_wtask->wt_type = WORK_Deferred_Cmp; p_wtask->wt_aux = (int)ecode; /* exit status */ svr_delay_entry++; /* see next_task() */ } p_wtask = (struct work_task *)GET_NEXT(p_wtask->wt_linkall); } } for (;;) { hNum = 0; if (mom_run_state && hStop != NULL) /* add mutex to array */ hArray[hNum++] = hStop; pjob = (job *)GET_NEXT(svr_alljobs); while (pjob) { /* * see if process was a child doing a special * function for MOM */ if ((pjob->ji_momsubt != NULL) && (pjob->ji_momsubt != INVALID_HANDLE_VALUE) && (pjob->ji_mompost != NULL)) { hArray[hNum++] = pjob->ji_momsubt; } /* * process tasks */ ptask = (task *)GET_NEXT(pjob->ji_tasks); while (ptask) { if ((ptask->ti_hProc != NULL) && (ptask->ti_hProc != INVALID_HANDLE_VALUE)) hArray[hNum++] = ptask->ti_hProc; if (hNum > MAXIMUM_WAIT_OBJECTS) break; ptask = (task *)GET_NEXT(ptask->ti_jobtask); } if (hNum > MAXIMUM_WAIT_OBJECTS) { DBPRT(("%s: %d more than MAX\n", id, hNum)) hNum = MAXIMUM_WAIT_OBJECTS; break; } pjob = (job 
*)GET_NEXT(pjob->ji_alljobs); } if (hNum == 0) /* nothing to wait for */ break; rc = WaitForMultipleObjects(hNum, hArray, FALSE, waittime); if (rc == WAIT_TIMEOUT) /* nobody is done */ break; else if (rc == WAIT_FAILED) { log_err(-1, id, "WaitForMultipleObjects"); break; } waittime = 0; /* only wait the first time */ rc -= WAIT_OBJECT_0; /* which object was it? */ assert(0 <= rc && rc < hNum); if (rc == 0 && mom_run_state && hStop != NULL) { /* got mutex */ mom_run_state = 0; /* shutdown */ continue; } /* ** It was a process finishing. Find which one. */ hProc = hArray[rc]; rc = GetExitCodeProcess(hProc, &ecode); if (rc == 0) { log_err(-1, id, "GetExitCodeProcess"); ecode = 99; } else if (rc == STILL_ACTIVE) /* shouldn't happen */ break; CloseHandle(hProc); /* find which process finished */ pjob = (job *)GET_NEXT(svr_alljobs); while (pjob) { if (pjob->ji_momsubt == hProc) break; ptask = (task *)GET_NEXT(pjob->ji_tasks); while (ptask) { if (ptask->ti_hProc == hProc) break; ptask = (task *)GET_NEXT(ptask->ti_jobtask); } if (ptask) break; pjob = (job *)GET_NEXT(pjob->ji_alljobs); } assert(pjob != NULL); if (pjob->ji_momsubt == hProc) { pjob->ji_momsubt = NULL; if (pjob->ji_mompost) { pjob->ji_mompost(pjob, ecode); /* After epilogue, get rid of any HOSTFILE */ if (pjob->ji_mompost == send_obit) { char file[MAXPATHLEN+1]; (void)sprintf(file, "%s/aux/%s", pbs_conf.pbs_home_path, pjob->ji_qs.ji_jobid); (void)unlink(file); } pjob->ji_mompost = 0; } (void)job_save(pjob, SAVEJOB_QUICK); continue; } DBPRT(("%s: task %d pid %d exit value %d\n", id, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid, ecode)) ptask->ti_hProc = NULL; ptask->ti_qs.ti_exitstat = ecode; ptask->ti_qs.ti_status = TI_STATE_EXITED; ptask->ti_qs.ti_sid = 0; (void)task_save(ptask); sprintf(log_buffer, "task %d terminated", ptask->ti_qs.ti_task); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG, pjob->ji_qs.ji_jobid, log_buffer); exiting_tasks = 1; } connection_idlecheck(); }
/** * @brief * Internal session cpu time decoding routine. * * @param[in] job - a job pointer. * * @return ulong * @retval sum of all cpu time consumed for all tasks executed by the job, in seconds, * adjusted by cputfactor. * */ static unsigned long cput_sum(job *pjob) { ulong cputime, addtime; int i; int nps = 0; int taskprocs; psinfo_t *pi; task *ptask; ulong tcput; cputime = 0; for (ptask = (task *)GET_NEXT(pjob->ji_tasks); ptask != NULL; ptask = (task *)GET_NEXT(ptask->ti_jobtask)) { /* DEAD task */ if (ptask->ti_qs.ti_sid <= 1) { cputime += ptask->ti_cput; continue; } tcput = 0; taskprocs = 0; for (i=0; i<nproc; i++) { pi = &proc_info[i]; /* is this process part of the task? */ if (ptask->ti_qs.ti_sid != pi->pr_sid) continue; nps++; taskprocs++; if (pi->pr_nlwp == 0) { /* zombie */ if ((pi->pr_sid != pi->pr_pid) && (pi->pr_ppid != mom_pid)) continue; /* top of session/job, record it */ tcput += tv(pi->pr_time); DBPRT(("%s: task %08.8X ses %d pid %d " "(zombie) cputime %lu\n", __func__, ptask->ti_qs.ti_task, pi->pr_sid, pi->pr_pid, tcput)) } else { addtime = tv(pi->pr_time) + tv(pi->pr_ctime); tcput += addtime; DBPRT(("%s: task %08.8X ses %d pid %d " "cputime %lu\n", __func__, ptask->ti_qs.ti_task, pi->pr_sid, pi->pr_pid, tcput)) } } if (tcput > ptask->ti_cput) ptask->ti_cput = tcput; cputime += ptask->ti_cput; DBPRT(("%s: task %08.8X cput %lu total %lu\n", __func__, ptask->ti_qs.ti_task, ptask->ti_cput, cputime)) if (taskprocs == 0) { sprintf(log_buffer, "no active process for task %8.8X", ptask->ti_qs.ti_task); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); exiting_tasks = 1; } }