Exemple #1
0
static void _send_step_complete_rpc(srun_job_t *srun_job, int step_rc)
{
	slurm_msg_t req;
	step_complete_msg_t msg;
	int rc;

	memset(&msg, 0, sizeof(step_complete_msg_t));
	msg.job_id = srun_job->jobid;
	msg.job_step_id = srun_job->stepid;
	msg.range_first = 0;
	msg.range_last = 0;
	msg.step_rc = step_rc;
	msg.jobacct = jobacctinfo_create(NULL);

	slurm_msg_t_init(&req);
	req.msg_type = REQUEST_STEP_COMPLETE;
	req.data = &msg;
/*	req.address = step_complete.parent_addr; */

	debug3("Sending step complete RPC to slurmctld");
	if (slurm_send_recv_controller_rc_msg(&req, &rc, working_cluster_rec)
	    < 0)
		error("Error sending step complete RPC to slurmctld");
	jobacctinfo_destroy(msg.jobacct);
}
Exemple #2
0
/*
 *
 * Returns jobacctinfo_t struct on success, NULL on error.
 * jobacctinfo_t must be freed after calling this function.
 */
int
stepd_stat_jobacct(int fd, job_step_id_msg_t *sent, job_step_stat_t *resp)
{
	int req = REQUEST_STEP_STAT;
	int rc = SLURM_SUCCESS;
	int tasks = 0;

	debug("Entering stepd_stat_jobacct for job %u.%u",
	      sent->job_id, sent->step_id);
	safe_write(fd, &req, sizeof(int));

	/* Receive the jobacct struct and return */
	resp->jobacct = jobacctinfo_create(NULL);

	/* Do not attempt reading data until there is something to read.
	 * Avoid locking the jobacct_gather plugin early and creating
	 * possible deadlock. */
	if (wait_fd_readable(fd, 300))
		goto rwfail;
	rc = jobacctinfo_getinfo(resp->jobacct, JOBACCT_DATA_PIPE, &fd);

	safe_read(fd, &tasks, sizeof(int));
	resp->num_tasks = tasks;

	return rc;
rwfail:
	error("gathering job accounting: %d", rc);
	jobacctinfo_destroy(resp->jobacct);
	resp->jobacct = NULL;
	return rc;
}
Exemple #3
0
static stepd_step_rec_t *
_step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
{
	stepd_step_rec_t *job = NULL;

	switch (msg->msg_type) {
	case REQUEST_BATCH_JOB_LAUNCH:
		debug2("setup for a batch_job");
		job = mgr_launch_batch_job_setup(msg->data, cli);
		break;
	case REQUEST_LAUNCH_TASKS:
		debug2("setup for a launch_task");
		job = mgr_launch_tasks_setup(msg->data, cli, self,
					     msg->protocol_version);
		break;
	default:
		fatal("handle_launch_message: Unrecognized launch RPC");
		break;
	}

	if (!job) {
		error("_step_setup: no job returned");
		return NULL;
	}

	job->jmgr_pid = getpid();
	job->jobacct = jobacctinfo_create(NULL);

	/* Establish GRES environment variables */
	if (conf->debug_flags & DEBUG_FLAG_GRES) {
		gres_plugin_job_state_log(job->job_gres_list, job->jobid);
		gres_plugin_step_state_log(job->step_gres_list, job->jobid,
					   job->stepid);
	}
	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
		gres_plugin_job_set_env(&job->env, job->job_gres_list, 0);
	else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
		gres_plugin_step_set_env(&job->env, job->step_gres_list, 0);

	/*
	 * Add slurmd node topology informations to job env array
	 */
	env_array_overwrite(&job->env,"SLURM_TOPOLOGY_ADDR",
			    conf->node_topo_addr);
	env_array_overwrite(&job->env,"SLURM_TOPOLOGY_ADDR_PATTERN",
			    conf->node_topo_pattern);

	set_msg_node_id(job);

	return job;
}
Exemple #4
0
static int
_handle_stat_jobacct(int fd, stepd_step_rec_t *job, uid_t uid)
{
	jobacctinfo_t *jobacct = NULL;
	jobacctinfo_t *temp_jobacct = NULL;
	int i = 0;
	int num_tasks = 0;
	debug("_handle_stat_jobacct for job %u.%u",
	      job->jobid, job->stepid);

	debug3("  uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("stat jobacct from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		/* Send NULL */
		jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
				    SLURM_PROTOCOL_VERSION);
		return SLURM_ERROR;
	}

	jobacct = jobacctinfo_create(NULL);
	debug3("num tasks = %d", job->node_tasks);

	for (i = 0; i < job->node_tasks; i++) {
		temp_jobacct = jobacct_gather_stat_task(job->task[i]->pid);
		if (temp_jobacct) {
			jobacctinfo_aggregate(jobacct, temp_jobacct);
			jobacctinfo_destroy(temp_jobacct);
			num_tasks++;
		}
	}

	jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
			    SLURM_PROTOCOL_VERSION);
	safe_write(fd, &num_tasks, sizeof(int));

	jobacctinfo_destroy(jobacct);

	return SLURM_SUCCESS;

rwfail:
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
Exemple #5
0
extern int jobacct_gather_add_task(pid_t pid, jobacct_id_t *jobacct_id,
				   int poll)
{
	struct jobacctinfo *jobacct;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	if (_jobacct_shutdown_test())
		return SLURM_ERROR;

	jobacct = jobacctinfo_create(jobacct_id);

	slurm_mutex_lock(&task_list_lock);
	if (pid <= 0) {
		error("invalid pid given (%d) for task acct", pid);
		goto error;
	} else if (!task_list) {
		error("no task list created!");
		goto error;
	}

	jobacct->pid = pid;
	memcpy(&jobacct->id, jobacct_id, sizeof(jobacct_id_t));
	jobacct->min_cpu = 0;
	debug2("adding task %u pid %d on node %u to jobacct",
	       jobacct_id->taskid, pid, jobacct_id->nodeid);
	list_push(task_list, jobacct);
	slurm_mutex_unlock(&task_list_lock);

	(*(ops.add_task))(pid, jobacct_id);

	if (poll == 1)
		_poll_data(1);

	return SLURM_SUCCESS;
error:
	slurm_mutex_unlock(&task_list_lock);
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
Exemple #6
0
static int _parse_comp_file(
	char *file, step_update_request_msg_t *step_msg)
{
	int i;
	FILE *fd = fopen(file, "r");
	char line[BUFFER_SIZE];
	char *fptr;
	int version;
	char *update[MAX_RECORD_FIELDS+1];    /* End list with null entry and,
						 possibly, more data than we
						 expected */

	enum {	UPDATE_STEP_VERSION,
		UPDATE_STEP_EXTRA,
		UPDATE_STEP_INBLOCKS,
		UPDATE_STEP_OUTBLOCKS,
		UPDATE_STEP_EXITCODE,
		UPDATE_STEP_CPU_ALLOC,
		UPDATE_STEP_START,
		UPDATE_STEP_END,
		UPDATE_STEP_USER_SEC,
		UPDATE_STEP_SYS_SEC,
		UPDATE_STEP_MAX_RSS,
		UPDATE_STEP_UID,
		UPDATE_STEP_STEPNAME,
		UPDATE_STEP_VER1_LENGTH
	};

	if (fd == NULL) {
		perror(file);
		return SLURM_ERROR;
	}

	if (!fgets(line, BUFFER_SIZE, fd)) {
		fprintf(stderr, "Empty step update completion file\n");
		return SLURM_ERROR;
	}

	fptr = line;	/* break the record into NULL-terminated strings */
	for (i = 0; i < MAX_RECORD_FIELDS; i++) {
		update[i] = fptr;
		fptr = strstr(fptr, " ");
		if (fptr == NULL) {
			fptr = strstr(update[i], "\n");
			if (fptr)
				*fptr = 0;
			break;
		} else
			*fptr++ = 0;
	}

	if (i < MAX_RECORD_FIELDS)
		i++;
	update[i] = 0;

	version = atoi(update[UPDATE_STEP_VERSION]);
	switch (version) {
	case 1:
		if (i != UPDATE_STEP_VER1_LENGTH) {
			fprintf(stderr,
				"Bad step update completion file length\n");
			return SLURM_ERROR;
		}
		step_msg->jobacct = jobacctinfo_create(NULL);
		step_msg->exit_code = atoi(update[UPDATE_STEP_EXITCODE]);
		step_msg->start_time = atoi(update[UPDATE_STEP_START]);
		step_msg->end_time = atoi(update[UPDATE_STEP_END]);
		step_msg->jobacct->user_cpu_sec =
			atoi(update[UPDATE_STEP_USER_SEC]);
		step_msg->jobacct->sys_cpu_sec =
			atoi(update[UPDATE_STEP_SYS_SEC]);
		step_msg->jobacct->min_cpu =
			step_msg->jobacct->user_cpu_sec
			+ step_msg->jobacct->sys_cpu_sec;
		step_msg->jobacct->max_rss = atoi(update[UPDATE_STEP_MAX_RSS]);
		step_msg->name =
			xstrdup(xbasename(update[UPDATE_STEP_STEPNAME]));
		break;
	default:
		fprintf(stderr, "Unsupported step update "
			"completion file version: %d\n",
			version);
		return SLURM_ERROR;
		break;
	}


	return SLURM_SUCCESS;
}
Exemple #7
0
/*
 *  This function handles the initialization information from slurmd
 *  sent by _send_slurmstepd_init() in src/slurmd/slurmd/req.c.
 */
static int
_init_from_slurmd(int sock, char **argv,
		  slurm_addr_t **_cli, slurm_addr_t **_self, slurm_msg_t **_msg,
		  int *_ngids, gid_t **_gids)
{
	char *incoming_buffer = NULL;
	Buf buffer;
	int step_type;
	int len, proto;
	slurm_addr_t *cli = NULL;
	slurm_addr_t *self = NULL;
	slurm_msg_t *msg = NULL;
	int ngids = 0;
	gid_t *gids = NULL;
	uint16_t port;
	char buf[16];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/* receive job type from slurmd */
	safe_read(sock, &step_type, sizeof(int));
	debug3("step_type = %d", step_type);

	/* receive reverse-tree info from slurmd */
	slurm_mutex_lock(&step_complete.lock);
	safe_read(sock, &step_complete.rank, sizeof(int));
	safe_read(sock, &step_complete.parent_rank, sizeof(int));
	safe_read(sock, &step_complete.children, sizeof(int));
	safe_read(sock, &step_complete.depth, sizeof(int));
	safe_read(sock, &step_complete.max_depth, sizeof(int));
	safe_read(sock, &step_complete.parent_addr, sizeof(slurm_addr_t));
	step_complete.bits = bit_alloc(step_complete.children);
	step_complete.jobacct = jobacctinfo_create(NULL);
	slurm_mutex_unlock(&step_complete.lock);

	/* receive conf from slurmd */
	if ((conf = read_slurmd_conf_lite (sock)) == NULL)
		fatal("Failed to read conf from slurmd");

	log_alter(conf->log_opts, 0, conf->logfile);
	log_set_timefmt(conf->log_fmt);

	debug2("debug level is %d.", conf->debug_level);

	switch_g_slurmd_step_init();

	slurm_get_ip_str(&step_complete.parent_addr, &port, buf, 16);
	debug3("slurmstepd rank %d, parent address = %s, port = %u",
	       step_complete.rank, buf, port);

	/* receive cli from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer,len);
	cli = xmalloc(sizeof(slurm_addr_t));
	if (slurm_unpack_slurm_addr_no_alloc(cli, buffer) == SLURM_ERROR)
		fatal("slurmstepd: problem with unpack of slurmd_conf");
	free_buf(buffer);

	/* receive self from slurmd */
	safe_read(sock, &len, sizeof(int));
	if (len > 0) {
		/* receive packed self from main slurmd */
		incoming_buffer = xmalloc(sizeof(char) * len);
		safe_read(sock, incoming_buffer, len);
		buffer = create_buf(incoming_buffer,len);
		self = xmalloc(sizeof(slurm_addr_t));
		if (slurm_unpack_slurm_addr_no_alloc(self, buffer)
		    == SLURM_ERROR) {
			fatal("slurmstepd: problem with unpack of "
			      "slurmd_conf");
		}
		free_buf(buffer);
	}

	/* Receive GRES information from slurmd */
	gres_plugin_recv_stepd(sock);

	/* Grab the slurmd's spooldir. Has %n expanded. */
	cpu_freq_init(conf);

	/* Receive cpu_frequency info from slurmd */
	cpu_freq_recv_info(sock);

	/* get the protocol version of the srun */
	safe_read(sock, &proto, sizeof(int));

	/* receive req from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer,len);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);
	msg->protocol_version = (uint16_t)proto;

	switch (step_type) {
	case LAUNCH_BATCH_JOB:
		msg->msg_type = REQUEST_BATCH_JOB_LAUNCH;
		break;
	case LAUNCH_TASKS:
		msg->msg_type = REQUEST_LAUNCH_TASKS;
		break;
	default:
		fatal("%s: Unrecognized launch RPC (%d)", __func__, step_type);
		break;
	}
	if (unpack_msg(msg, buffer) == SLURM_ERROR)
		fatal("slurmstepd: we didn't unpack the request correctly");
	free_buf(buffer);

	/* receive cached group ids array for the relevant uid */
	safe_read(sock, &ngids, sizeof(int));
	if (ngids > 0) {
		int i;
		uint32_t tmp32;

		gids = (gid_t *)xmalloc(sizeof(gid_t) * ngids);
		for (i = 0; i < ngids; i++) {
			safe_read(sock, &tmp32, sizeof(uint32_t));
			gids[i] = (gid_t)tmp32;
			debug2("got gid %d", gids[i]);
		}
	}

	*_cli = cli;
	*_self = self;
	*_msg = msg;
	*_ngids = ngids;
	*_gids = gids;

	return 1;

rwfail:
	fatal("Error reading initialization data from slurmd");
	exit(1);
}