示例#1
0
文件: pmi2.c 项目: cread/slurm
static int
_handle_kvs_fence(int fd, int lrank, client_req_t *req)
{
	int rc = 0;

	debug3("mpi/pmi2: in _handle_kvs_fence, from task %d",
	       job_info.gtids[lrank]);
	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	tasks_to_wait --;

	/* mutex protection is not required */
	if (tasks_to_wait == 0 && children_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			error("mpi/pmi2: failed to send temp kvs to %s",
			      tree_info.parent_node ?: "srun");
			send_kvs_fence_resp_to_clients(
				rc,
				"mpi/pmi2: failed to send temp kvs");
			/* cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
示例#2
0
static int
_handle_kvs_fence(int fd, Buf buf)
{
	uint32_t from_nodeid, num_children, temp32, seq;
	char *from_node = NULL;
	int rc = SLURM_SUCCESS;

	safe_unpack32(&from_nodeid, buf);
	safe_unpackstr_xmalloc(&from_node, &temp32, buf);
	safe_unpack32(&num_children, buf);
	safe_unpack32(&seq, buf);

	debug3("mpi/pmi2: in _handle_kvs_fence, from node %u(%s) representing"
	       " %u offspring, seq=%u", from_nodeid, from_node, num_children,
	       seq);
	if (seq != kvs_seq) {
		error("mpi/pmi2: invalid kvs seq from node %u(%s) ignored, "
		      "expect %u got %u",
		      from_nodeid, from_node, kvs_seq, seq);
		goto out;
	}
	if (seq == tree_info.children_kvs_seq[from_nodeid]) {
		info("mpi/pmi2: duplicate KVS_FENCE request from node %u(%s) "
		      "ignored, seq=%u", from_nodeid, from_node, seq);
		goto out;
	}
	tree_info.children_kvs_seq[from_nodeid] = seq;

	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	children_to_wait -= num_children;

	temp_kvs_merge(buf);

	if (children_to_wait == 0 && tasks_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			if (in_stepd()) {
				error("mpi/pmi2: failed to send temp kvs"
				      " to %s",
				      tree_info.parent_node ?: "srun");
				send_kvs_fence_resp_to_clients(
					rc,
					"mpi/pmi2: failed to send temp kvs");
			} else {
				error("mpi/pmi2: failed to send temp kvs"
				      " to compute nodes");
			}
			/* cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
示例#3
0
static int
_handle_barrier_in(int fd, int lrank, client_req_t *req)
{
    int rc = 0;

    debug3("mpi/pmi2: in _handle_barrier_in, from task %d",
           job_info.gtids[lrank]);
    if (tasks_to_wait == 0 && children_to_wait == 0) {
        tasks_to_wait = job_info.ltasks;
        children_to_wait = tree_info.num_children;
    }
    tasks_to_wait --;

    /* mutex protection is not required */
    if (tasks_to_wait == 0 && children_to_wait == 0) {
        rc = temp_kvs_send();
    }
    debug3("mpi/pmi2: out _handle_barrier_in, tasks_to_wait=%d, "
           "children_to_wait=%d", tasks_to_wait, children_to_wait);
    return rc;
}