Exemplo n.º 1
0
/**
 * @brief
 *	In response to an unrecoverable error (normally after calling the
 *	offline_job_host() function above), requeue a job - perhaps it
 *	will have better luck running on a set of vnodes other than those
 *	just offlined.
 *
 *	The requeue is performed by running
 *	"<pbs_exec_path>/bin/qrerun <jobid>" through system().  The command
 *	buffer (execmax bytes) is allocated on first use and kept for
 *	subsequent calls.
 *
 * @param[in] pjob - pointer to job structure
 *
 * @return Void
 *
 * @note
 *	Every failure (buffer allocation, command too long, command could
 *	not be launched, command terminated with a non-zero status) is
 *	reported through log_joberr(); "requeued" is logged only when the
 *	command reports success.
 *
 */
void
requeue_job(job *pjob)
{
	char			*jid = pjob->ji_qs.ji_jobid;
	static char		*cmdbuf = NULL;
	int			len;
	int			rc;

	if (cmdbuf == NULL) {
		cmdbuf = malloc(execmax);
		if (cmdbuf == NULL) {
			log_joberr(errno, __func__, "cmdbuf malloc", jid);
			return;
		}
	}

	/* a negative return is an output error, >= execmax is truncation */
	len = snprintf(cmdbuf, execmax, "%s/bin/%s %s",
		pbs_conf.pbs_exec_path, "qrerun", jid);
	if (len < 0 || len >= execmax) {
		log_joberr(-1, __func__, "cmdbuf overflow", jid);
		return;
	}

	rc = system(cmdbuf);
	if (rc == -1) {
		/* the child could not be created or waited for; errno is set */
		log_joberr(errno, __func__, "attempt to requeue job failed", jid);
	} else if (rc != 0) {
		/*
		 * The command ran but did not exit with status 0 (or the
		 * shell could not execute it); errno carries no information
		 * in this case, so do not report it.
		 */
		log_joberr(-1, __func__, "attempt to requeue job failed", jid);
	} else {
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_NOTICE, jid,
			"requeued");
	}
}
Exemplo n.º 2
0
/**
 * @brief
 *	In response to an unrecoverable error, derive the list of vnodes
 *	assigned to the given job that belong to this mom and use the list
 *	to construct and issue a command to offline them.
 *
 *	For every vnode in this mom's host entry that has memory or cpus
 *	assigned (hv_mem > 0 or hv_ncpus > 0), the command
 *	"<pbs_exec_path>/bin/qmgr -c 'set node <vnode> state += offline'"
 *	is run through system().  The command buffer (execmax bytes) is
 *	allocated on first use and kept for subsequent calls.
 *
 * @param[in] pjob - pointer to job structure
 *
 * @return Void
 *
 * @note
 *	A vnode name that does not fit, or a command that would exceed
 *	execmax, is logged and ends processing of the remaining vnodes.
 *	A command that cannot be launched or that terminates with a
 *	non-zero status is logged and processing continues with the next
 *	vnode.
 *
 */
void
offline_job_vnodes(job *pjob)
{
	int			i;
	int			n;
	int			rc;
	hnodent			*hn;
	char			*jid = pjob->ji_qs.ji_jobid;
	static char		*cmdbuf = NULL;
	static char		*cmdprefix = "qmgr -c 'set node ";
	static char		*cmdsuffix = "state += offline'";
	size_t			suffixlen = strlen(cmdsuffix) + 1; /* incl. NUL */
	size_t			prefixlen;
	char			linebuf[_POSIX_ARG_MAX];
	char			*vnodeptr;	/* vnode within cmdbuf */

	if (cmdbuf == NULL) {
		cmdbuf = malloc(execmax);
		if (cmdbuf == NULL) {
			log_joberr(errno, __func__, "cmdbuf malloc", jid);
			return;
		}
	}

	/* a negative return is an output error, >= execmax is truncation */
	n = snprintf(cmdbuf, execmax, "%s/bin/%s",
		pbs_conf.pbs_exec_path, cmdprefix);
	if (n < 0 || n >= execmax) {
		log_joberr(-1, __func__, "cmdbuf overflow", jid);
		return;
	}
	/* the fixed part of the command never changes inside the loop */
	prefixlen = strlen(cmdbuf);
	vnodeptr = cmdbuf + prefixlen;	/* assume ' ' at cmdprefix end */

	for (i = 0, hn = &pjob->ji_hosts[pjob->ji_nodeid];
		i < hn->hn_vlnum; i++) {
		host_vlist_t	*hv;

		hv = &hn->hn_vlist[i];
		if ((hv->hv_mem > 0) || (hv->hv_ncpus > 0)) {
			size_t	len = strlen(hv->hv_vname);

			/*
			 * linebuf must hold vnode name + ' ' + suffix + NUL,
			 * not just the vnode name.
			 */
			if (len + 1 + suffixlen > sizeof(linebuf)) {
				sprintf(log_buffer, "vnode name too long (%lu)",
					(unsigned long)len);
				log_joberr(-1, __func__, log_buffer, jid);
				return;
			}

			/* prefix length + vnode name length + ' ' + suffixlen */
			if (prefixlen + len + 1 + suffixlen > execmax) {
				log_joberr(-1, __func__, "cmdbuf overflow", jid);
				return;
			}
			(void) snprintf(linebuf, sizeof(linebuf), "%s %s",
				hv->hv_vname, cmdsuffix);
			/* cmdbuf is always cut back to the prefix at this point */
			(void) strcpy(vnodeptr, linebuf);

			rc = system(cmdbuf);
			if (rc == -1) {
				/* child could not be created/waited for */
				log_joberr(errno, __func__,
					"attempt to offline job vnode(s) failed",
					jid);
			} else if (rc != 0) {
				/*
				 * The command ran but did not exit with
				 * status 0; errno is meaningless here.
				 */
				log_joberr(-1, __func__,
					"attempt to offline job vnode(s) failed",
					jid);
			} else {
				/*
				 * NOTE(review): the capacity of log_buffer is
				 * not visible in this block - confirm it can
				 * hold a vnode name of up to
				 * sizeof(linebuf) bytes.
				 */
				sprintf(log_buffer, "vnode %s offlined",
					hv->hv_vname);
				log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB,
					LOG_ALERT, jid, log_buffer);
			}
			*vnodeptr = '\0';	/* truncate to cmdprefix */
		}
	}
}
Exemplo n.º 3
0
/**
 * @brief
 *      Internal session cpu time decoding routine.
 *
 * @param[in] pjob - a job pointer.
 *
 * @return      ulong
 * @retval      sum of all cpu time consumed for all tasks executed by the job, in seconds,
 *              adjusted by cputfactor.
 *
 */
static unsigned long
cput_sum(job *pjob)
{
	static char		id[] = "cput_sum";
	ulong			cputime;
	mtime_t			addtime;
	int			i, ret;
	int			nps = 0;
	int			taskprocs;
	proc_t			*pp;
	pbs_task		*ptask;
	ulong			tcput;
	struct	jresourcecpu	jcpu;

	cputime = 0;
	for (ptask = (pbs_task *)GET_NEXT(pjob->ji_tasks);
		ptask != NULL;
		ptask = (pbs_task *)GET_NEXT(ptask->ti_jobtask)) {
		id_t    jid = ptask->ti_qs.ti_u.ti_ext.ti_jid;

		/* DEAD task */
		if (ptask->ti_qs.ti_sid <= 1) {
			cputime += ptask->ti_cput;
			continue;
		}

		/* check the job time */
		if (jid > 0) {
			ret = getresourcej(jid, CURR_CPU, (long long)&jcpu);
			if (ret == -1) {
				if (errno != ESRCH) {
					sprintf(log_buffer,
						"getresourcej: jid %d "
						"task %08.8X",
						jid, ptask->ti_qs.ti_task);
					log_joberr(errno, id, log_buffer,
						pjob->ji_qs.ji_jobid);
				}
			}
			else {
				tcput = MSEC(jcpu.jr_cpu);
				if (tcput > ptask->ti_cput)
					ptask->ti_cput = tcput;
				cputime += ptask->ti_cput;
				DBPRT(("%s: task %08.8X jid %d "
					"jcpu %ld cput %lu total %lu\n", id,
					ptask->ti_qs.ti_task, jid,
					tcput, ptask->ti_cput, cputime))
				continue;
			}
		}

		tcput = 0;
		taskprocs = 0;
		for (i=0; i<nproc; i++) {
			pp = &proct[i];

			if (BOGUS_PROC(pp))
				continue;

			/* is this process part of the task? */
			if (ptask->ti_qs.ti_sid != pp->p_sid &&
				ptask->ti_qs.ti_u.ti_ext.ti_parent != pp->p_pid)
				continue;

			nps++;
			taskprocs++;
			DBPRT(("%s: task %08.8X jid %d ses %d pid %d ppid %d ",
				id, ptask->ti_qs.ti_task, jid,
				pp->p_sid, pp->p_pid, pp->p_ppid))
			if (pp->p_task.t_stat == SZOMB) {
				/* get zombie time only if top process */
				if ((pp->p_sid == pp->p_pid) ||
					(pp->p_ppid == mom_pid)) {
					addtime = pp->p_hcutime+pp->p_hcstime;
					DBPRT(("cput %lu ", CKSEC(addtime)))
				}
				DBPRT(("(zombie)\n"))
			}
			else {