Example #1
static int _unpack_step_complete_msg(dbd_step_comp_msg_t **msg,
				     uint16_t rpc_version, Buf buffer)
{
	uint32_t uint32_tmp;
	dbd_step_comp_msg_t *msg_ptr = xmalloc(sizeof(dbd_step_comp_msg_t));
	*msg = msg_ptr;

	if (rpc_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack32(&msg_ptr->assoc_id, buffer);
		safe_unpack64(&msg_ptr->db_index, buffer);
		safe_unpack_time(&msg_ptr->end_time, buffer);
		safe_unpack32(&msg_ptr->exit_code, buffer);
		jobacctinfo_unpack((struct jobacctinfo **)&msg_ptr->jobacct,
				   rpc_version, PROTOCOL_TYPE_DBD, buffer, 1);
		safe_unpack32(&msg_ptr->job_id, buffer);
		safe_unpack_time(&msg_ptr->job_submit_time, buffer);
		safe_unpackstr_xmalloc(&msg_ptr->job_tres_alloc_str,
				       &uint32_tmp, buffer);
		safe_unpack32(&msg_ptr->req_uid, buffer);
		safe_unpack_time(&msg_ptr->start_time, buffer);
		safe_unpack16(&msg_ptr->state, buffer);
		safe_unpack32(&msg_ptr->step_id, buffer);
		safe_unpack32(&msg_ptr->total_tasks, buffer);
	} else
		goto unpack_error;

	return SLURM_SUCCESS;

unpack_error:
	debug2("slurmdbd_unpack_step_complete_msg:"
	       "unpack_error: size_buf(buffer) %u",
	       size_buf(buffer));
	slurmdbd_free_step_complete_msg(msg_ptr);
	*msg = NULL;
	return SLURM_ERROR;
}
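The matching pack routine is not shown in this example. Below is a minimal sketch of what it might look like, assuming the fields are serialized in exactly the order the unpacker reads them; the name _pack_step_complete_msg is hypothetical here, and pack32/pack64/pack_time/packstr/pack16 are simply the usual counterparts of the safe_unpack* helpers used above.

static void _pack_step_complete_msg(dbd_step_comp_msg_t *msg,
				    uint16_t rpc_version, Buf buffer)
{
	/* Hypothetical sketch: field order must mirror
	 * _unpack_step_complete_msg() above. */
	pack32(msg->assoc_id, buffer);
	pack64(msg->db_index, buffer);
	pack_time(msg->end_time, buffer);
	pack32(msg->exit_code, buffer);
	jobacctinfo_pack(msg->jobacct, rpc_version,
			 PROTOCOL_TYPE_DBD, buffer);
	pack32(msg->job_id, buffer);
	pack_time(msg->job_submit_time, buffer);
	packstr(msg->job_tres_alloc_str, buffer);
	pack32(msg->req_uid, buffer);
	pack_time(msg->start_time, buffer);
	pack16(msg->state, buffer);
	pack32(msg->step_id, buffer);
	pack32(msg->total_tasks, buffer);
}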
Example #2
extern int jobacctinfo_getinfo(
	jobacctinfo_t *jobacct, enum jobacct_data_type type, void *data,
	uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;
	int *fd = (int *)data;
	uint32_t *uint32 = (uint32_t *) data;
	uint64_t *uint64 = (uint64_t *) data;
	double *dub = (double *) data;
	jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
	struct rusage *rusage = (struct rusage *)data;
	struct jobacctinfo *send = (struct jobacctinfo *) data;
	char *buf = NULL;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	/* jobacct needs to be allocated before this is called.	*/
	xassert(jobacct);

	switch (type) {
	case JOBACCT_DATA_TOTAL:
		memcpy(send, jobacct, sizeof(struct jobacctinfo));
		break;
	case JOBACCT_DATA_PIPE:
		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			int len;
			Buf buffer;

			safe_read(*fd, &len, sizeof(int));
			buf = xmalloc(len);
			safe_read(*fd, buf, len);
			buffer = create_buf(buf, len);
			jobacctinfo_unpack(&jobacct, protocol_version,
					   PROTOCOL_TYPE_SLURM, buffer, 0);
			free_buf(buffer);
		}

		break;
	case JOBACCT_DATA_RUSAGE:
		memset(rusage, 0, sizeof(struct rusage));
		rusage->ru_utime.tv_sec = jobacct->user_cpu_sec;
		rusage->ru_utime.tv_usec = jobacct->user_cpu_usec;
		rusage->ru_stime.tv_sec = jobacct->sys_cpu_sec;
		rusage->ru_stime.tv_usec = jobacct->sys_cpu_usec;
		break;
	case JOBACCT_DATA_MAX_RSS:
		*uint64 = jobacct->max_rss;
		break;
	case JOBACCT_DATA_MAX_RSS_ID:
		*jobacct_id = jobacct->max_rss_id;
		break;
	case JOBACCT_DATA_TOT_RSS:
		*uint64 = jobacct->tot_rss;
		break;
	case JOBACCT_DATA_MAX_VSIZE:
		*uint64 = jobacct->max_vsize;
		break;
	case JOBACCT_DATA_MAX_VSIZE_ID:
		*jobacct_id = jobacct->max_vsize_id;
		break;
	case JOBACCT_DATA_TOT_VSIZE:
		*uint64 = jobacct->tot_vsize;
		break;
	case JOBACCT_DATA_MAX_PAGES:
		*uint64 = jobacct->max_pages;
		break;
	case JOBACCT_DATA_MAX_PAGES_ID:
		*jobacct_id = jobacct->max_pages_id;
		break;
	case JOBACCT_DATA_TOT_PAGES:
		*uint64 = jobacct->tot_pages;
		break;
	case JOBACCT_DATA_MIN_CPU:
		*uint32 = jobacct->min_cpu;
		break;
	case JOBACCT_DATA_MIN_CPU_ID:
		*jobacct_id = jobacct->min_cpu_id;
		break;
	case JOBACCT_DATA_TOT_CPU:
		*dub = jobacct->tot_cpu;
		break;
	case JOBACCT_DATA_ACT_CPUFREQ:
		*uint32 = jobacct->act_cpufreq;
		break;
	case JOBACCT_DATA_CONSUMED_ENERGY:
		*uint64 = jobacct->energy.consumed_energy;
		break;
	case JOBACCT_DATA_MAX_DISK_READ:
		*dub = jobacct->max_disk_read;
		break;
	case JOBACCT_DATA_MAX_DISK_READ_ID:
		*jobacct_id = jobacct->max_disk_read_id;
		break;
	case JOBACCT_DATA_TOT_DISK_READ:
		*dub = jobacct->tot_disk_read;
		break;
	case JOBACCT_DATA_MAX_DISK_WRITE:
		*dub = jobacct->max_disk_write;
		break;
	case JOBACCT_DATA_MAX_DISK_WRITE_ID:
		*jobacct_id = jobacct->max_disk_write_id;
		break;
	case JOBACCT_DATA_TOT_DISK_WRITE:
		*dub = jobacct->tot_disk_write;
		break;
	default:
		debug("jobacct_g_set_getinfo data_type %d invalid", type);
	}
	return rc;

rwfail:
	xfree(buf);
	return SLURM_ERROR;
}
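For context, a minimal caller-side sketch of retrieving individual statistics through jobacctinfo_getinfo(), based on the switch cases above. The wrapper function _log_step_mem_usage and its variables are hypothetical; only the data-type constants and the getinfo call itself come from the example.

static void _log_step_mem_usage(jobacctinfo_t *jobacct)
{
	uint64_t max_rss = 0;
	struct rusage usage;

	/* Fetch one scalar counter and the rusage view of the same record. */
	jobacctinfo_getinfo(jobacct, JOBACCT_DATA_MAX_RSS, &max_rss,
			    SLURM_PROTOCOL_VERSION);
	jobacctinfo_getinfo(jobacct, JOBACCT_DATA_RUSAGE, &usage,
			    SLURM_PROTOCOL_VERSION);

	debug("max RSS %"PRIu64", user cpu %ld sec",
	      max_rss, (long)usage.ru_utime.tv_sec);
}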
Example #3
File: req.c  Project: birc-aeh/slurm
static int
_handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	char* buf;
	int len;
	Buf buffer;
	int version;	/* For future use */
	bool lock_set = false;

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);

	debug3("  uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &version, sizeof(int));
	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));

	/*
	 * We must not use getinfo over a pipe with slurmd here.
	 * slurmstepd already makes heavy use of setinfo over a pipe
	 * with slurmd, and doing the reverse can deadlock:
	 * slurmd(lock for read, write) / slurmstepd(write, lock for read).
	 * Do pack/unpack instead so slurmd and slurmstepd remain
	 * independent of each other.
	 */
	safe_read(fd, &len, sizeof(int));
	buf = xmalloc(len);
	safe_read(fd, buf, len);
	buffer = create_buf(buf, len);
	jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
			   PROTOCOL_TYPE_SLURM, buffer, 1);
	free_buf(buffer);

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 *  without the hostlist from the credential. */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  before bits: %s", bits_string);
#endif
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacctinfo_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;


rwfail:
	if (lock_set) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
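The sending side of this exchange is not shown in the example. Below is a minimal sketch of what it might look like, assuming the same safe_write()/rwfail convention used above and the standard Buf helpers (init_buf, get_buf_offset, get_buf_data); the function name _send_completion is hypothetical. The write order mirrors the safe_read() sequence in _handle_completion().

static int _send_completion(int fd, int first, int last, int step_rc,
			    jobacctinfo_t *jobacct)
{
	int version = SLURM_PROTOCOL_VERSION;
	int len;
	Buf buffer = init_buf(BUF_SIZE);	/* assumed starting size */

	/* Mirror the safe_read() order in _handle_completion() above. */
	safe_write(fd, &version, sizeof(int));
	safe_write(fd, &first, sizeof(int));
	safe_write(fd, &last, sizeof(int));
	safe_write(fd, &step_rc, sizeof(int));

	/* Length-prefixed, packed accounting data, as the receiver expects. */
	jobacctinfo_pack(jobacct, SLURM_PROTOCOL_VERSION,
			 PROTOCOL_TYPE_SLURM, buffer);
	len = get_buf_offset(buffer);
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);
	free_buf(buffer);

	return SLURM_SUCCESS;

rwfail:
	free_buf(buffer);
	return SLURM_ERROR;
}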