示例#1
0
static void _handle_msg(slurm_msg_t *msg)
{
	static uint32_t slurm_uid = NO_VAL;
	uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
	uid_t uid = getuid();
	job_step_kill_msg_t *ss;
	srun_user_msg_t *um;

	if (slurm_uid == NO_VAL)
		slurm_uid = slurm_get_slurm_user_id();
	if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
		error ("Security violation, slurm message from uid %u",
		       (unsigned int) req_uid);
		return;
	}

	switch (msg->msg_type) {
	case SRUN_PING:
		debug3("slurmctld ping received");
		slurm_send_rc_msg(msg, SLURM_SUCCESS);
		slurm_free_srun_ping_msg(msg->data);
		break;
	case SRUN_JOB_COMPLETE:
		debug("received job step complete message");
		_handle_step_complete(msg->data);
		slurm_free_srun_job_complete_msg(msg->data);
		break;
	case SRUN_USER_MSG:
		um = msg->data;
		info("%s", um->msg);
		slurm_free_srun_user_msg(msg->data);
		break;
	case SRUN_TIMEOUT:
		debug2("received job step timeout message");
		_handle_timeout(msg->data);
		slurm_free_srun_timeout_msg(msg->data);
		break;
	case SRUN_STEP_SIGNAL:
		ss = msg->data;
		debug("received step signal %u RPC", ss->signal);
		if (ss->signal)
			launch_p_fwd_signal(ss->signal);
		slurm_free_job_step_kill_msg(msg->data);
		break;
	default:
		debug("received spurious message type: %u",
		      msg->msg_type);
		break;
	}
	return;
}
示例#2
0
static void _handle_timeout(srun_timeout_msg_t *timeout_msg)
{
	time_t now = time(NULL);
	char time_str[24];

	if (now < timeout_msg->timeout) {
		slurm_make_time_str(&timeout_msg->timeout,
				    time_str, sizeof(time_str));
		debug("step %u.%u will timeout at %s",
		      timeout_msg->job_id, timeout_msg->step_id, time_str);
		return;
	}

	slurm_make_time_str(&now, time_str, sizeof(time_str));
	error("*** STEP %u.%u CANCELLED AT %s DUE TO TIME LIMIT ***",
	      timeout_msg->job_id, timeout_msg->step_id, time_str);
	launch_p_fwd_signal(SIGKILL);
	return;
}
示例#3
0
static void _handle_timeout(srun_timeout_msg_t *timeout_msg)
{
	time_t now = time(NULL);
	char time_str[24];

	/* It turns out if we wait for this to happen it will never
	   happen if srun is the caller without being in an
	   allocation.  So just exit instead of wait.
	*/
	/* if (now < timeout_msg->timeout) { */
	/* 	slurm_make_time_str(&timeout_msg->timeout, */
	/* 			    time_str, sizeof(time_str)); */
	/* 	debug("step %u.%u will timeout at %s", */
	/* 	      timeout_msg->job_id, timeout_msg->step_id, time_str); */
	/* 	return; */
	/* } */

	slurm_make_time_str(&now, time_str, sizeof(time_str));

	error("*** STEP %u.%u CANCELLED AT %s DUE TO TIME LIMIT ***",
	      timeout_msg->job_id, timeout_msg->step_id, time_str);
	launch_p_fwd_signal(SIGKILL);
	return;
}
示例#4
0
static void _handle_step_complete(srun_job_complete_msg_t *comp_msg)
{
	launch_p_fwd_signal(SIGKILL);
	return;
}