static int _handle_kvs_fence(int fd, int lrank, client_req_t *req) { int rc = 0; debug3("mpi/pmi2: in _handle_kvs_fence, from task %d", job_info.gtids[lrank]); if (tasks_to_wait == 0 && children_to_wait == 0) { tasks_to_wait = job_info.ltasks; children_to_wait = tree_info.num_children; } tasks_to_wait --; /* mutex protection is not required */ if (tasks_to_wait == 0 && children_to_wait == 0) { rc = temp_kvs_send(); if (rc != SLURM_SUCCESS) { error("mpi/pmi2: failed to send temp kvs to %s", tree_info.parent_node ?: "srun"); send_kvs_fence_resp_to_clients( rc, "mpi/pmi2: failed to send temp kvs"); /* cancel the step to avoid tasks hang */ slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL); } else {
static int _handle_kvs_fence(int fd, Buf buf) { uint32_t from_nodeid, num_children, temp32, seq; char *from_node = NULL; int rc = SLURM_SUCCESS; safe_unpack32(&from_nodeid, buf); safe_unpackstr_xmalloc(&from_node, &temp32, buf); safe_unpack32(&num_children, buf); safe_unpack32(&seq, buf); debug3("mpi/pmi2: in _handle_kvs_fence, from node %u(%s) representing" " %u offspring, seq=%u", from_nodeid, from_node, num_children, seq); if (seq != kvs_seq) { error("mpi/pmi2: invalid kvs seq from node %u(%s) ignored, " "expect %u got %u", from_nodeid, from_node, kvs_seq, seq); goto out; } if (seq == tree_info.children_kvs_seq[from_nodeid]) { info("mpi/pmi2: duplicate KVS_FENCE request from node %u(%s) " "ignored, seq=%u", from_nodeid, from_node, seq); goto out; } tree_info.children_kvs_seq[from_nodeid] = seq; if (tasks_to_wait == 0 && children_to_wait == 0) { tasks_to_wait = job_info.ltasks; children_to_wait = tree_info.num_children; } children_to_wait -= num_children; temp_kvs_merge(buf); if (children_to_wait == 0 && tasks_to_wait == 0) { rc = temp_kvs_send(); if (rc != SLURM_SUCCESS) { if (in_stepd()) { error("mpi/pmi2: failed to send temp kvs" " to %s", tree_info.parent_node ?: "srun"); send_kvs_fence_resp_to_clients( rc, "mpi/pmi2: failed to send temp kvs"); } else { error("mpi/pmi2: failed to send temp kvs" " to compute nodes"); } /* cancel the step to avoid tasks hang */ slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL); } else {
/*
 * Handle a barrier-in request coming from one local task.
 *
 * When both wait counters are zero, they are first re-armed from
 * job_info.ltasks and tree_info.num_children. The task counter is then
 * decremented for the requesting task. Once neither local tasks nor
 * children remain outstanding, the temporary KVS is sent via
 * temp_kvs_send().
 *
 * fd    - not used by this function
 * lrank - local rank of the requesting task; indexes job_info.gtids
 * req   - not used by this function
 *
 * Returns 0 if nothing was sent, otherwise the value returned by
 * temp_kvs_send().
 */
static int _handle_barrier_in(int fd, int lrank, client_req_t *req)
{
	int status = 0;

	debug3("mpi/pmi2: in _handle_barrier_in, from task %d",
	       job_info.gtids[lrank]);

	/* Both counters at zero: re-arm them before counting this task. */
	if (!(tasks_to_wait || children_to_wait)) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}

	/* mutex protection is not required */
	tasks_to_wait -= 1;

	/* Nothing left to wait for: send the temporary KVS. */
	if (!tasks_to_wait && !children_to_wait)
		status = temp_kvs_send();

	debug3("mpi/pmi2: out _handle_barrier_in, tasks_to_wait=%d, "
	       "children_to_wait=%d", tasks_to_wait, children_to_wait);

	return status;
}