/**
 * @brief
 *	In response to an unrecoverable error (normally after calling the
 *	offline_job_host() function above), requeue a job - perhaps it
 *	will have better luck running on a set of vnodes other than those
 *	just offlined.
 *
 *	The job is requeued by running "<pbs_exec_path>/bin/qrerun <jobid>"
 *	through system().  The command buffer is allocated once (execmax
 *	bytes) and kept for the life of the process.
 *
 * @param[in]	pjob - pointer to job structure
 *
 * @return	Void
 *		Failures (allocation, command too long, command could not be
 *		run, command exited non-zero) are reported via log_joberr().
 *
 */
void
requeue_job(job *pjob)
{
	char		*jid = pjob->ji_qs.ji_jobid;
	static char	*cmdbuf = NULL;
	int		len;
	int		rc;

	if (cmdbuf == NULL) {
		cmdbuf = malloc(execmax);
		if (cmdbuf == NULL) {
			log_joberr(errno, __func__, "cmdbuf malloc", jid);
			return;
		}
	}

	len = snprintf(cmdbuf, execmax, "%s/bin/%s %s",
		pbs_conf.pbs_exec_path, "qrerun", jid);
	/* a negative return is an output error, not a short string */
	if (len < 0 || len >= execmax) {
		log_joberr(-1, __func__, "cmdbuf overflow", jid);
		return;
	}

	rc = system(cmdbuf);
	if (rc == -1) {
		/* the child could not be created or its status retrieved */
		log_joberr(errno, __func__,
			"attempt to requeue job failed", jid);
	} else if (rc != 0) {
		/*
		 * The command ran (or the shell tried to run it) but did
		 * not terminate with status 0; errno is not meaningful
		 * here, so do not report the job as requeued.
		 */
		log_joberr(-1, __func__,
			"attempt to requeue job failed", jid);
	} else {
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_NOTICE,
			jid, "requeued");
	}
}
/**
 * @brief
 *	In response to an unrecoverable error, derive the list of vnodes
 *	assigned to the given job that belong to this mom and use the list
 *	to construct and issue a command to offline them.
 *
 *	One command of the form
 *		<pbs_exec_path>/bin/qmgr -c 'set node <vnode> state += offline'
 *	is run through system() for every vnode in this mom's host entry
 *	(pjob->ji_hosts[pjob->ji_nodeid]) that has memory or cpus assigned.
 *	The command buffer is allocated once (execmax bytes) and kept for
 *	the life of the process.
 *
 * @param[in]	pjob - pointer to job structure
 *
 * @return	Void
 *		Failures are reported via log_joberr().  A vnode name that
 *		does not fit in the command buffers aborts the whole
 *		operation, leaving any remaining vnodes untouched.
 *
 */
void
offline_job_vnodes(job *pjob)
{
	int		i;
	int		n;
	int		rc;
	hnodent		*hn;
	char		*jid = pjob->ji_qs.ji_jobid;
	static char	*cmdbuf = NULL;
	static char	*cmdprefix = "qmgr -c 'set node ";
	static char	*cmdsuffix = "state += offline'";
	size_t		suffixlen = strlen(cmdsuffix) + 1;	/* incl. NUL */
	size_t		prefixlen;
	char		linebuf[_POSIX_ARG_MAX];
	char		*vnodeptr;	/* vnode within cmdbuf */

	if (cmdbuf == NULL) {
		cmdbuf = malloc(execmax);
		if (cmdbuf == NULL) {
			log_joberr(errno, __func__, "cmdbuf malloc", jid);
			return;
		}
	}

	n = snprintf(cmdbuf, execmax, "%s/bin/%s",
		pbs_conf.pbs_exec_path, cmdprefix);
	/* a negative return is an output error, not a short string */
	if (n < 0 || n >= execmax) {
		log_joberr(-1, __func__, "cmdbuf overflow", jid);
		return;
	}

	/*
	 * cmdbuf is truncated back to the prefix after every vnode, so the
	 * prefix length is invariant across the loop.
	 */
	prefixlen = strlen(cmdbuf);
	vnodeptr = cmdbuf + prefixlen;	/* assume ' ' at cmdprefix end */

	for (i = 0, hn = &pjob->ji_hosts[pjob->ji_nodeid];
		i < hn->hn_vlnum; i++) {
		host_vlist_t	*hv;
		size_t		len;

		hv = &hn->hn_vlist[i];
		/* only vnodes that actually have resources assigned */
		if ((hv->hv_mem <= 0) && (hv->hv_ncpus <= 0))
			continue;

		len = strlen(hv->hv_vname);

		/* linebuf must hold vnode name + ' ' + suffix + NUL */
		if (len + 1 + suffixlen > sizeof(linebuf)) {
			sprintf(log_buffer, "vnode name too long (%lu)",
				(unsigned long) len);
			log_joberr(-1, __func__, log_buffer, jid);
			return;
		}

		/* cmdbuf length + vnode name length + ' ' + suffixlen */
		if (prefixlen + len + 1 + suffixlen > execmax) {
			log_joberr(-1, __func__, "cmdbuf overflow", jid);
			return;
		}

		(void) snprintf(linebuf, sizeof(linebuf), "%s %s",
			hv->hv_vname, cmdsuffix);
		(void) strcat(cmdbuf, linebuf);

		rc = system(cmdbuf);
		if (rc == -1) {
			/* child could not be created or status retrieved */
			log_joberr(errno, __func__,
				"attempt to offline job vnode(s) failed", jid);
		} else if (rc != 0) {
			/*
			 * The command did not terminate with status 0;
			 * errno is not meaningful here, so do not report
			 * the vnode as offlined.
			 */
			log_joberr(-1, __func__,
				"attempt to offline job vnode(s) failed", jid);
		} else {
			sprintf(log_buffer, "vnode %s offlined",
				hv->hv_vname);
			log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB,
				LOG_ALERT, jid, log_buffer);
		}

		*vnodeptr = '\0';	/* truncate to cmdprefix */
	}
}
/** * @brief * Internal session cpu time decoding routine. * * @param[in] job - a job pointer. * * @return ulong * @retval sum of all cpu time consumed for all tasks executed by the job, in seconds, * adjusted by cputfactor. * */ static unsigned long cput_sum(job *pjob) { static char id[] = "cput_sum"; ulong cputime; mtime_t addtime; int i, ret; int nps = 0; int taskprocs; proc_t *pp; pbs_task *ptask; ulong tcput; struct jresourcecpu jcpu; cputime = 0; for (ptask = (pbs_task *)GET_NEXT(pjob->ji_tasks); ptask != NULL; ptask = (pbs_task *)GET_NEXT(ptask->ti_jobtask)) { id_t jid = ptask->ti_qs.ti_u.ti_ext.ti_jid; /* DEAD task */ if (ptask->ti_qs.ti_sid <= 1) { cputime += ptask->ti_cput; continue; } /* check the job time */ if (jid > 0) { ret = getresourcej(jid, CURR_CPU, (long long)&jcpu); if (ret == -1) { if (errno != ESRCH) { sprintf(log_buffer, "getresourcej: jid %d " "task %08.8X", jid, ptask->ti_qs.ti_task); log_joberr(errno, id, log_buffer, pjob->ji_qs.ji_jobid); } } else { tcput = MSEC(jcpu.jr_cpu); if (tcput > ptask->ti_cput) ptask->ti_cput = tcput; cputime += ptask->ti_cput; DBPRT(("%s: task %08.8X jid %d " "jcpu %ld cput %lu total %lu\n", id, ptask->ti_qs.ti_task, jid, tcput, ptask->ti_cput, cputime)) continue; } } tcput = 0; taskprocs = 0; for (i=0; i<nproc; i++) { pp = &proct[i]; if (BOGUS_PROC(pp)) continue; /* is this process part of the task? */ if (ptask->ti_qs.ti_sid != pp->p_sid && ptask->ti_qs.ti_u.ti_ext.ti_parent != pp->p_pid) continue; nps++; taskprocs++; DBPRT(("%s: task %08.8X jid %d ses %d pid %d ppid %d ", id, ptask->ti_qs.ti_task, jid, pp->p_sid, pp->p_pid, pp->p_ppid)) if (pp->p_task.t_stat == SZOMB) { /* get zombie time only if top process */ if ((pp->p_sid == pp->p_pid) || (pp->p_ppid == mom_pid)) { addtime = pp->p_hcutime+pp->p_hcstime; DBPRT(("cput %lu ", CKSEC(addtime))) } DBPRT(("(zombie)\n")) } else {