Esempio n. 1
0
static int
_handle_stat_jobacct(int fd, stepd_step_rec_t *job, uid_t uid)
{
	jobacctinfo_t *jobacct = NULL;
	jobacctinfo_t *temp_jobacct = NULL;
	int i = 0;
	int num_tasks = 0;
	debug("_handle_stat_jobacct for job %u.%u",
	      job->jobid, job->stepid);

	debug3("  uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("stat jobacct from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		/* Send NULL */
		jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
				    SLURM_PROTOCOL_VERSION);
		return SLURM_ERROR;
	}

	jobacct = jobacctinfo_create(NULL);
	debug3("num tasks = %d", job->node_tasks);

	for (i = 0; i < job->node_tasks; i++) {
		temp_jobacct = jobacct_gather_stat_task(job->task[i]->pid);
		if (temp_jobacct) {
			jobacctinfo_aggregate(jobacct, temp_jobacct);
			jobacctinfo_destroy(temp_jobacct);
			num_tasks++;
		}
	}

	jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
			    SLURM_PROTOCOL_VERSION);
	safe_write(fd, &num_tasks, sizeof(int));

	jobacctinfo_destroy(jobacct);

	return SLURM_SUCCESS;

rwfail:
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
Esempio n. 2
0
File: req.c Progetto: birc-aeh/slurm
static int
_handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	char* buf;
	int len;
	Buf buffer;
	int version;	/* For future use */
	bool lock_set = false;

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);

	debug3("  uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &version, sizeof(int));
	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));

	/*
	 * We must not use getinfo over a pipe with slurmd here
	 * Indeed, slurmstepd does a large use of setinfo over a pipe
	 * with slurmd and doing the reverse can result in a deadlock
	 * scenario with slurmd :
	 * slurmd(lockforread,write)/slurmstepd(write,lockforread)
	 * Do pack/unpack instead to be sure of independances of
	 * slurmd and slurmstepd
	 */
	safe_read(fd, &len, sizeof(int));
	buf = xmalloc(len);
	safe_read(fd, buf, len);
	buffer = create_buf(buf, len);
	jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
			   PROTOCOL_TYPE_SLURM, buffer, 1);
	free_buf(buffer);

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 *  without the hostlist from the credential. */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  before bits: %s", bits_string);
#endif
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacctinfo_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;


rwfail:	if (lock_set) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
Esempio n. 3
0
File: req.c Progetto: A1ve5/slurm
/* Wait for the pid given and when it ends get and children it might
 * of left behind and wait on them instead.
 */
static void *_wait_extern_pid(void *args)
{
	extern_pid_t *extern_pid = (extern_pid_t *)args;

	stepd_step_rec_t *job = extern_pid->job;
	pid_t pid = extern_pid->pid;

	jobacctinfo_t *jobacct = NULL;
	pid_t *pids = NULL;
	int npids = 0, i;
	char	proc_stat_file[256];	/* Allow ~20x extra length */
	FILE *stat_fp = NULL;
	int fd;
	char sbuf[256], *tmp, state[1];
	int num_read, ppid;

	xfree(extern_pid);

	//info("waiting on pid %d", pid);
	_block_on_pid(pid);
	//info("done with pid %d %d: %m", pid, rc);
	jobacct = jobacct_gather_remove_task(pid);
	if (jobacct) {
		job->jobacct->energy.consumed_energy = 0;
		jobacctinfo_aggregate(job->jobacct, jobacct);
		jobacctinfo_destroy(jobacct);
	}
	acct_gather_profile_g_task_end(pid);

	/* See if we have any children of init left and add them to track. */
	proctrack_g_get_pids(job->cont_id, &pids, &npids);
	for (i = 0; i < npids; i++) {
		snprintf(proc_stat_file, 256, "/proc/%d/stat", pids[i]);
		if (!(stat_fp = fopen(proc_stat_file, "r")))
			continue;  /* Assume the process went away */
		fd = fileno(stat_fp);
		fcntl(fd, F_SETFD, FD_CLOEXEC);

		num_read = read(fd, sbuf, (sizeof(sbuf) - 1));

		if (num_read <= 0)
			goto next_pid;

		sbuf[num_read] = '\0';

		/* get to the end of cmd name */
		tmp = strrchr(sbuf, ')');
		*tmp = '\0';	/* replace trailing ')' with NULL */
		/* skip space after ')' too */
		sscanf(tmp + 2,	"%c %d ", state, &ppid);

		if (ppid == 1) {
			debug2("adding tracking of orphaned process %d",
			       pids[i]);
			_handle_add_extern_pid_internal(job, pids[i]);
		}
	next_pid:
		fclose(stat_fp);
	}

	return NULL;
}