Example #1
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		xfree(nodename);
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(),
				    pmixp_info_stepid(), SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Wrong collective seq. #%d from %s:%u, current is %d, skip this message",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		xfree(nodename);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
Example #2
extern int launch_p_step_terminate(void)
{
	info("Terminating job step %u.%u",
	     local_srun_job->jobid, local_srun_job->stepid);
	return slurm_kill_job_step(local_srun_job->jobid,
				   local_srun_job->stepid, SIGKILL);
}
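Every listing on this page ultimately funnels into the same library call. As a minimal, self-contained sketch (an illustration, not code from the Slurm tree: it assumes the classic three-argument slurm_kill_job_step(job_id, step_id, signal) from <slurm/slurm.h>, while newer Slurm releases add a fourth flags argument), a standalone tool that force-kills one step might look like this:

/* kill_step.c - hedged sketch, not part of the Slurm tree.
 * Build (assumption): gcc kill_step.c -lslurm
 * Assumes the classic 3-argument slurm_kill_job_step(); newer
 * Slurm releases take an extra flags argument. */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <job_id> <step_id>\n", argv[0]);
		return 1;
	}
	uint32_t job_id  = (uint32_t) strtoul(argv[1], NULL, 10);
	uint32_t step_id = (uint32_t) strtoul(argv[2], NULL, 10);

	/* ask slurmctld to deliver SIGKILL to every task of the step */
	if (slurm_kill_job_step(job_id, step_id, SIGKILL) != SLURM_SUCCESS) {
		slurm_perror("slurm_kill_job_step");
		return 1;
	}
	return 0;
}

Invoked as, say, ./kill_step 1234 0, it does from the command line what the examples below do from inside plugins and srun.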
Example #3
File: pmi2.c Project: cread/slurm
static int
_handle_kvs_fence(int fd, int lrank, client_req_t *req)
{
	int rc = 0;

	debug3("mpi/pmi2: in _handle_kvs_fence, from task %d",
	       job_info.gtids[lrank]);
	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	tasks_to_wait --;

	/* mutex protection is not required */
	if (tasks_to_wait == 0 && children_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			error("mpi/pmi2: failed to send temp kvs to %s",
			      tree_info.parent_node ?: "srun");
			send_kvs_fence_resp_to_clients(
				rc,
				"mpi/pmi2: failed to send temp kvs");
			/* cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
			/* sent OK; wait for the fence response
			 * (closing braces reconstructed; the source
			 * listing is truncated here) */
			waiting_kvs_resp = 1;
		}
	}

	return rc;
}
Example #4
static int
_handle_abort(int fd, int lrank, client_req_t *req)
{
    debug3("mpi/pmi2: in _handle_abort");
    /* no response needed. just cancel the job */
    slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL);
    debug3("mpi/pmi2: out _handle_abort");
    return SLURM_SUCCESS;
}
Example #5
static void errhandler(pmix_status_t status,
		       pmix_proc_t proc[], size_t nproc,
		       pmix_info_t info[], size_t ninfo)
{
	/* TODO: do something more sophisticated here */
	/* FIXME: use a proper format specifier for nranges */
	PMIXP_ERROR_STD("Error handler invoked: status = %d, nranges = %d",
			status, (int) nproc);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
Example #6
static int
_handle_kvs_fence(int fd, Buf buf)
{
	uint32_t from_nodeid, num_children, temp32, seq;
	char *from_node = NULL;
	int rc = SLURM_SUCCESS;

	safe_unpack32(&from_nodeid, buf);
	safe_unpackstr_xmalloc(&from_node, &temp32, buf);
	safe_unpack32(&num_children, buf);
	safe_unpack32(&seq, buf);

	debug3("mpi/pmi2: in _handle_kvs_fence, from node %u(%s) representing"
	       " %u offspring, seq=%u", from_nodeid, from_node, num_children,
	       seq);
	if (seq != kvs_seq) {
		error("mpi/pmi2: invalid kvs seq from node %u(%s) ignored, "
		      "expect %u got %u",
		      from_nodeid, from_node, kvs_seq, seq);
		goto out;
	}
	if (seq == tree_info.children_kvs_seq[from_nodeid]) {
		info("mpi/pmi2: duplicate KVS_FENCE request from node %u(%s) "
		      "ignored, seq=%u", from_nodeid, from_node, seq);
		goto out;
	}
	tree_info.children_kvs_seq[from_nodeid] = seq;

	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	children_to_wait -= num_children;

	temp_kvs_merge(buf);

	if (children_to_wait == 0 && tasks_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			if (in_stepd()) {
				error("mpi/pmi2: failed to send temp kvs"
				      " to %s",
				      tree_info.parent_node ?: "srun");
				send_kvs_fence_resp_to_clients(
					rc,
					"mpi/pmi2: failed to send temp kvs");
			} else {
				error("mpi/pmi2: failed to send temp kvs"
				      " to compute nodes");
			}
			/* cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
			/* sent OK; wait for the fence response
			 * (closing braces reconstructed; the source
			 * listing is truncated here) */
			waiting_kvs_resp = 1;
		}
	}

out:
	xfree(from_node);
	return rc;

unpack_error:
	error("mpi/pmi2: failed to unpack kvs fence message");
	rc = SLURM_ERROR;
	goto out;
}
Example #7
static pmix_status_t abort_fn(const pmix_proc_t *proc, void *server_object,
			      int status, const char msg[], pmix_proc_t procs[],
			      size_t nprocs, pmix_op_cbfunc_t cbfunc, void *cbdata)
{
	/* Just kill this stepid for now. TODO: what can we do for fault tolerance here? */
	PMIXP_DEBUG("called: status = %d, msg = %s", status, msg);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);

	if (NULL != cbfunc) {
		cbfunc(PMIX_SUCCESS, cbdata);
	}
	return PMIX_SUCCESS;
}
Example #8
static void _errhandler(size_t evhdlr_registration_id,
			pmix_status_t status,
			const pmix_proc_t *source,
			pmix_info_t info[], size_t ninfo,
			pmix_info_t *results, size_t nresults,
			pmix_event_notification_cbfunc_fn_t cbfunc,
			void *cbdata)
{
	/* TODO: do something more sophisticated here */
	PMIXP_ERROR_STD("Error handler invoked: status = %d",
			status);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
Example #9
File: pmi2.c Project: cread/slurm
static int
_handle_abort(int fd, int lrank, client_req_t *req)
{
	int rc = SLURM_SUCCESS;
	bool is_world = false;

	debug3("mpi/pmi2: in _handle_abort");
	client_req_parse_body(req);
	client_req_get_bool(req, ISWORLD_KEY, &is_world);
	/* no response needed. just cancel the job step if required */
	if (is_world) {
		slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL);
	}
	return rc;
}
Example #10
File: ring.c Project: A1ve5/slurm
/* send message defined by buf and size to given rank stepd */
static int pmix_stepd_send(const char* buf, uint32_t size, int rank)
{
	int rc = SLURM_SUCCESS;

	/* map rank to host name */
	char* host = hostlist_nth(pmix_stepd_hostlist, rank); /* strdup-ed */

	/* delay to sleep between retries in seconds;
	 * if there are multiple retries, we'll grow this delay
	 * using exponential backoff, doubling it each time */
	unsigned int delay = 1;

	/* we'll try multiple times to send the message to the stepd;
	 * we retry in case the stepd is just slow to start up */
	int retries = 0;
	while (1) {
		/* attempt to send message */
		rc = slurm_forward_data(&host, tree_sock_addr, size, buf);
		if (rc == SLURM_SUCCESS) {
			/* message sent successfully, we're done */
			break;
		}

		/* check whether we've exceeded our retry count */
		retries++;
		if (retries >= MAX_RETRIES) {
			/* give up: cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
			break;
		}

		/* didn't succeed, but we'll retry,
		 * sleep for a bit first */
		sleep(delay);
		delay *= 2;
	}

	/* free host name */
	free(host); /* strdup-ed */

	return rc;
} 
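The retry loop above is a plain exponential backoff with the step kill as a last resort. A sketch of the same pattern factored into a reusable helper (hypothetical names throughout; MAX_RETRIES is an assumed constant not shown in the listing, and the callback is assumed to return SLURM_SUCCESS on success):

#include <unistd.h>
#include <slurm/slurm_errno.h>

#define MAX_RETRIES 5	/* assumed value; not shown in the listing above */

/* hypothetical helper: retry try_send() with a doubling delay */
static int send_with_backoff(int (*try_send)(void *arg), void *arg)
{
	unsigned int delay = 1;	/* seconds between attempts */
	int i;

	for (i = 0; i < MAX_RETRIES; i++) {
		if (try_send(arg) == SLURM_SUCCESS)
			return SLURM_SUCCESS;
		sleep(delay);
		delay *= 2;	/* exponential backoff */
	}
	return SLURM_ERROR;	/* caller decides whether to kill the step */
}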
Example #11
static void
_handle_openmpi_port_error(const char *tasks, const char *hosts,
			   slurm_step_ctx_t *step_ctx)
{
	uint32_t job_id, step_id;
	char *msg = "retrying";

	if (!retry_step_begin) {
		retry_step_begin = true;
		retry_step_cnt++;
	}

	if (retry_step_cnt >= MAX_STEP_RETRIES)
		msg = "aborting";
	error("%s: tasks %s unable to claim reserved port, %s.",
	      hosts, tasks, msg);

	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
	info("Terminating job step %u.%u", job_id, step_id);
	slurm_kill_job_step(job_id, step_id, SIGKILL);
}
Example #12
void
job_force_termination(srun_job_t *job)
{
	static int kill_sent = 0;
	static time_t last_msg = 0;

	if (kill_sent == 0) {
		info("forcing job termination");
		/* Sends SIGKILL to tasks directly */
		update_job_state(job, SRUN_JOB_FORCETERM);
	} else {
		time_t now = time(NULL);
		if (last_msg != now) {
			info("job abort in progress");
			last_msg = now;
		}
		if (kill_sent == 1) {
			/* Try sending SIGKILL through slurmctld */
			slurm_kill_job_step(job->jobid, job->stepid, SIGKILL);
		}
	}
	kill_sent++;
}
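Note the escalation order in job_force_termination(): the first call signals tasks directly through srun's state machine, only the second call falls back to slurmctld via slurm_kill_job_step(), and later calls just log progress. A condensed, hypothetical sketch of that two-stage pattern (the update_job_state() step is stubbed out as a comment):

#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <slurm/slurm.h>

/* hypothetical wrapper illustrating the escalation above */
static void force_terminate(uint32_t jobid, uint32_t stepid)
{
	static int attempts = 0;

	if (attempts == 0) {
		/* first attempt: tear down via the local state machine
		 * (update_job_state(job, SRUN_JOB_FORCETERM) in srun) */
		fprintf(stderr, "forcing job termination\n");
	} else if (attempts == 1) {
		/* second attempt: escalate through slurmctld */
		slurm_kill_job_step(jobid, stepid, SIGKILL);
	} else {
		/* nothing stronger left to try; just report progress */
		fprintf(stderr, "job abort in progress\n");
	}
	attempts++;
}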
Example #13
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application
			 * hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain encapsulation - in general `buf` is
		 * not a pointer, but an opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the pingpong mode was activated,
		 * node 0 sends ping requests and the
		 * receiver is assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
Example #14
static void *
_cancel_step_id (void *ci)
{
	int error_code = SLURM_SUCCESS, i;
	job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
	uint32_t job_id  = cancel_info->job_id;
	uint32_t step_id = cancel_info->step_id;
	bool sig_set = true;
	DEF_TIMERS;

	if (cancel_info->sig == (uint16_t) NO_VAL) {
		cancel_info->sig = SIGKILL;
		sig_set = false;
	}

	if (!cancel_info->job_id_str) {
		if (cancel_info->array_job_id &&
		    (cancel_info->array_task_id == INFINITE)) {
			xstrfmtcat(cancel_info->job_id_str, "%u_*",
				   cancel_info->array_job_id);
		} else if (cancel_info->array_job_id) {
			xstrfmtcat(cancel_info->job_id_str, "%u_%u",
				   cancel_info->array_job_id,
				   cancel_info->array_task_id);
		} else {
			xstrfmtcat(cancel_info->job_id_str, "%u",
				   cancel_info->job_id);
		}
	}

	for (i = 0; i < MAX_CANCEL_RETRY; i++) {
		if (cancel_info->sig == SIGKILL) {
			verbose("Terminating step %s.%u",
				cancel_info->job_id_str, step_id);
		} else {
			verbose("Signal %u to step %s.%u",
				cancel_info->sig,
				cancel_info->job_id_str, step_id);
		}

		_add_delay();
		START_TIMER;
		if ((!sig_set) || opt.ctld)
			error_code = slurm_kill_job_step(job_id, step_id,
							 cancel_info->sig);
		else if (cancel_info->sig == SIGKILL)
			error_code = slurm_terminate_job_step(job_id, step_id);
		else
			error_code = slurm_signal_job_step(job_id, step_id,
							   cancel_info->sig);
		END_TIMER;
		slurm_mutex_lock(&max_delay_lock);
		max_resp_time = MAX(max_resp_time, DELTA_TIMER);
		slurm_mutex_unlock(&max_delay_lock);

		if ((error_code == 0) ||
		    ((errno != ESLURM_TRANSITION_STATE_NO_UPDATE) &&
		     (errno != ESLURM_JOB_PENDING)))
			break;
		verbose("Job is in transistional state, retrying");
		sleep(5 + i);
	}
	if (error_code) {
		error_code = slurm_get_errno();
		if ((opt.verbose > 0) || (error_code != ESLURM_ALREADY_DONE))
			error("Kill job error on job step id %s: %s",
		 	      cancel_info->job_id_str,
			      slurm_strerror(slurm_get_errno()));

		if ((error_code == ESLURM_ALREADY_DONE) &&
		    (cancel_info->sig == SIGKILL)) {
			error_code = 0;	/* Ignore error if job done */
		}
	}

	/* Purposely free the struct passed in here, so the caller doesn't have
	 * to keep track of it, but don't destroy the mutex and condition
	 * variables contained. */
	slurm_mutex_lock(cancel_info->num_active_threads_lock);
	*(cancel_info->rc) = MAX(*(cancel_info->rc), error_code);
	(*(cancel_info->num_active_threads))--;
	slurm_cond_signal(cancel_info->num_active_threads_cond);
	slurm_mutex_unlock(cancel_info->num_active_threads_lock);

	xfree(cancel_info->job_id_str);
	xfree(cancel_info);
	return NULL;
}
Example #15
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s", nodename);
			free_buf(buf);
			xfree(nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application
			 * hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
					    SIGKILL);
			free_buf(buf);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}
	xfree(nodename);
}