int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		xfree(nodename);
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
				    SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		PMIXP_ERROR("Wrong collective seq. #%d from nodeid %u, current is %d, skip this message",
			    hdr->seq, hdr->nodeid, coll->seq);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
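/*
 * The sequence-number checks in these handlers all follow the same
 * three-way contract: proceed, skip a stale duplicate, or treat the
 * message as fatal desynchronization. A hedged sketch of what such a
 * comparison could look like; the names, enum values, and tolerance
 * window are illustrative, not the actual plugin definitions:
 */
#include <stdint.h>

enum coll_req_status { COLL_REQ_PROGRESS, COLL_REQ_SKIP, COLL_REQ_FAILURE };

/* Compare an incoming message's sequence number with the collective's:
 * a match means process it; one round behind means it is a retransmit
 * of the previous fence, so skip it; anything else means the state
 * machines have diverged and the step cannot recover. */
static enum coll_req_status check_seq(uint32_t current, uint32_t incoming)
{
	if (incoming == current)
		return COLL_REQ_PROGRESS;
	if (incoming + 1 == current)
		return COLL_REQ_SKIP;	/* stale: previous round */
	return COLL_REQ_FAILURE;	/* unrecoverable desynchronization */
}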
extern int launch_p_step_terminate(void)
{
	info("Terminating job step %u.%u",
	     local_srun_job->jobid, local_srun_job->stepid);
	return slurm_kill_job_step(local_srun_job->jobid,
				   local_srun_job->stepid, SIGKILL);
}
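/*
 * Every call site in this collection funnels into the same libslurm
 * entry point. A minimal standalone sketch of that call, assuming the
 * public slurm.h API of this era; kill_step_example() and its
 * arguments are hypothetical wrappers, not Slurm code:
 */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

/* Send SIGKILL to a single job step and report the outcome. */
static int kill_step_example(uint32_t job_id, uint32_t step_id)
{
	int rc = slurm_kill_job_step(job_id, step_id, SIGKILL);

	if (rc != SLURM_SUCCESS) {
		/* slurm_kill_job_step() sets errno on failure */
		fprintf(stderr, "kill of %u.%u failed: %s\n",
			job_id, step_id, slurm_strerror(slurm_get_errno()));
	}
	return rc;
}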
static int _handle_kvs_fence(int fd, int lrank, client_req_t *req)
{
	int rc = 0;

	debug3("mpi/pmi2: in _handle_kvs_fence, from task %d",
	       job_info.gtids[lrank]);
	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	tasks_to_wait--;	/* mutex protection is not required */

	if (tasks_to_wait == 0 && children_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			error("mpi/pmi2: failed to send temp kvs to %s",
			      tree_info.parent_node ?: "srun");
			send_kvs_fence_resp_to_clients(
				rc, "mpi/pmi2: failed to send temp kvs");
			/* cancel the step to avoid tasks hanging */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
static int _handle_abort(int fd, int lrank, client_req_t *req)
{
	debug3("mpi/pmi2: in _handle_abort");
	/* no response needed. just cancel the job */
	slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL);
	debug3("mpi/pmi2: out _handle_abort");

	return SLURM_SUCCESS;
}
static void errhandler(pmix_status_t status, pmix_proc_t proc[],
		       size_t nproc, pmix_info_t info[], size_t ninfo)
{
	/* TODO: do something more sophisticated here */
	/* FIXME: use a proper format specifier for nproc */
	PMIXP_ERROR_STD("Error handler invoked: status = %d, nranges = %d",
			status, (int)nproc);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
static int _handle_kvs_fence(int fd, Buf buf)
{
	uint32_t from_nodeid, num_children, temp32, seq;
	char *from_node = NULL;
	int rc = SLURM_SUCCESS;

	safe_unpack32(&from_nodeid, buf);
	safe_unpackstr_xmalloc(&from_node, &temp32, buf);
	safe_unpack32(&num_children, buf);
	safe_unpack32(&seq, buf);
	debug3("mpi/pmi2: in _handle_kvs_fence, from node %u(%s) representing"
	       " %u offspring, seq=%u",
	       from_nodeid, from_node, num_children, seq);
	if (seq != kvs_seq) {
		error("mpi/pmi2: invalid kvs seq from node %u(%s) ignored, "
		      "expected %u, got %u",
		      from_nodeid, from_node, kvs_seq, seq);
		goto out;
	}
	if (seq == tree_info.children_kvs_seq[from_nodeid]) {
		info("mpi/pmi2: duplicate KVS_FENCE request from node %u(%s) "
		     "ignored, seq=%u", from_nodeid, from_node, seq);
		goto out;
	}
	tree_info.children_kvs_seq[from_nodeid] = seq;

	if (tasks_to_wait == 0 && children_to_wait == 0) {
		tasks_to_wait = job_info.ltasks;
		children_to_wait = tree_info.num_children;
	}
	children_to_wait -= num_children;

	temp_kvs_merge(buf);

	if (children_to_wait == 0 && tasks_to_wait == 0) {
		rc = temp_kvs_send();
		if (rc != SLURM_SUCCESS) {
			if (in_stepd()) {
				error("mpi/pmi2: failed to send temp kvs"
				      " to %s",
				      tree_info.parent_node ?: "srun");
				send_kvs_fence_resp_to_clients(
					rc,
					"mpi/pmi2: failed to send temp kvs");
			} else {
				error("mpi/pmi2: failed to send temp kvs"
				      " to compute nodes");
			}
			/* cancel the step to avoid tasks hanging */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
		} else {
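/*
 * The fence bookkeeping above counts down two independent quantities,
 * local tasks and child nodes in the fan-in tree, and forwards the
 * merged KVS only when both reach zero. A self-contained sketch of
 * that counting scheme; forward_kvs() and fence_contrib() are
 * hypothetical stand-ins, not the plugin's functions:
 */
#include <stdbool.h>
#include <stdio.h>

static int tasks_left;
static int children_left;

static void forward_kvs(void) { puts("sending merged KVS upstream"); }

/* Called once per local-task contribution and once per child-node
 * contribution; the last arrival triggers the upstream send. */
static void fence_contrib(bool from_child, int represented)
{
	if (from_child)
		children_left -= represented; /* a child may speak for a subtree */
	else
		tasks_left--;

	if (tasks_left == 0 && children_left == 0)
		forward_kvs();
}

int main(void)
{
	tasks_left = 2;
	children_left = 1;
	fence_contrib(false, 0);
	fence_contrib(false, 0);
	fence_contrib(true, 1);	/* last contribution -> forward */
	return 0;
}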
static pmix_status_t abort_fn(const pmix_proc_t *proc, void *server_object,
			      int status, const char msg[],
			      pmix_proc_t procs[], size_t nprocs,
			      pmix_op_cbfunc_t cbfunc, void *cbdata)
{
	/* Just kill this stepid for now.
	 * TODO: think about what we can do for fault tolerance here.
	 */
	PMIXP_DEBUG("called: status = %d, msg = %s", status, msg);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);

	if (NULL != cbfunc) {
		cbfunc(PMIX_SUCCESS, cbdata);
	}
	return PMIX_SUCCESS;
}
static void _errhandler(size_t evhdlr_registration_id, pmix_status_t status,
			const pmix_proc_t *source,
			pmix_info_t info[], size_t ninfo,
			pmix_info_t *results, size_t nresults,
			pmix_event_notification_cbfunc_fn_t cbfunc,
			void *cbdata)
{
	/* TODO: do something more sophisticated here */
	PMIXP_ERROR_STD("Error handler invoked: status = %d", status);
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
static int _handle_abort(int fd, int lrank, client_req_t *req)
{
	int rc = SLURM_SUCCESS;
	bool is_world = false;

	debug3("mpi/pmi2: in _handle_abort");
	client_req_parse_body(req);
	client_req_get_bool(req, ISWORLD_KEY, &is_world);
	/* no response needed. just cancel the job step if required */
	if (is_world) {
		slurm_kill_job_step(job_info.jobid, job_info.stepid, SIGKILL);
	}
	return rc;
}
/* send the message defined by buf and size to the stepd of the given rank */
static int pmix_stepd_send(const char *buf, uint32_t size, int rank)
{
	int rc = SLURM_SUCCESS;

	/* map rank to host name */
	char *host = hostlist_nth(pmix_stepd_hostlist, rank); /* strdup-ed */

	/* delay to sleep between retries in seconds;
	 * if there are multiple retries, we'll grow this delay
	 * using exponential backoff, doubling it each time */
	unsigned int delay = 1;

	/* we'll try multiple times to send the message to the stepd;
	 * we retry in case the stepd is just slow to get started */
	int retries = 0;
	while (1) {
		/* attempt to send the message */
		rc = slurm_forward_data(&host, tree_sock_addr, size, buf);
		if (rc == SLURM_SUCCESS) {
			/* message sent successfully, we're done */
			break;
		}

		/* check whether we've exceeded our retry count */
		retries++;
		if (retries >= MAX_RETRIES) {
			/* cancel the step to avoid tasks hanging,
			 * and give up retrying */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
			break;
		}

		/* didn't succeed, but we'll retry;
		 * sleep for a bit first */
		sleep(delay);
		delay *= 2;
	}

	/* free the host name */
	free(host); /* strdup-ed */

	return rc;
}
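/*
 * The retry loop above is a capped exponential backoff. The same
 * pattern in isolation, as a self-contained sketch; try_send() is a
 * hypothetical stand-in for slurm_forward_data(), here stubbed to
 * succeed on the third attempt so the loop can be exercised:
 */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_ATTEMPTS 5

/* stub transport call: fails twice, then succeeds */
static bool try_send(void)
{
	static int calls = 0;
	return ++calls >= 3;
}

/* Retry try_send() with sleeps of 1, 2, 4, ... seconds between
 * attempts; return true on success, false once the cap is hit. */
static bool send_with_backoff(void)
{
	unsigned int delay = 1;

	for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
		if (try_send())
			return true;
		if (attempt == MAX_ATTEMPTS)
			break;	/* no point sleeping after the last try */
		sleep(delay);
		delay *= 2;	/* exponential backoff */
	}
	return false;
}

int main(void)
{
	printf("send %s\n", send_with_backoff() ? "succeeded" : "failed");
	return 0;
}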
static void _handle_openmpi_port_error(const char *tasks, const char *hosts,
				       slurm_step_ctx_t *step_ctx)
{
	uint32_t job_id, step_id;
	char *msg = "retrying";

	if (!retry_step_begin) {
		retry_step_begin = true;
		retry_step_cnt++;
	}

	if (retry_step_cnt >= MAX_STEP_RETRIES)
		msg = "aborting";
	error("%s: tasks %s unable to claim reserved port, %s.",
	      hosts, tasks, msg);

	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
	info("Terminating job step %u.%u", job_id, step_id);
	slurm_kill_job_step(job_id, step_id, SIGKILL);
}
void job_force_termination(srun_job_t *job)
{
	static int kill_sent = 0;
	static time_t last_msg = 0;

	if (kill_sent == 0) {
		info("forcing job termination");
		/* sends SIGKILL to tasks directly */
		update_job_state(job, SRUN_JOB_FORCETERM);
	} else {
		time_t now = time(NULL);
		if (last_msg != now) {
			info("job abort in progress");
			last_msg = now;
		}
		if (kill_sent == 1) {
			/* try sending SIGKILL through slurmctld */
			slurm_kill_job_step(job->jobid, job->stepid, SIGKILL);
		}
	}
	kill_sent++;
}
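/*
 * job_force_termination() escalates across calls: the first signals
 * the tasks directly, the second falls back to routing the kill
 * through slurmctld, and later calls only log progress. A minimal
 * sketch of that escalation pattern in isolation; kill_directly() and
 * kill_via_controller() are hypothetical helpers, not srun code:
 */
#include <stdio.h>

static void kill_directly(void)       { puts("SIGKILL sent to tasks"); }
static void kill_via_controller(void) { puts("SIGKILL routed via slurmctld"); }

/* Each invocation takes the next, more drastic step. */
static void force_termination(void)
{
	static int attempts = 0;

	if (attempts == 0)
		kill_directly();	/* first: signal tasks directly */
	else if (attempts == 1)
		kill_via_controller();	/* second: go through the controller */
	else
		puts("abort in progress"); /* afterwards: just report */
	attempts++;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		force_termination();
	return 0;
}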
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
			     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, "
				    "current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from "
				    "nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}
		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback, so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain encapsulation: in general `buf` is
		 * not a pointer but an opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the ping-pong mode was activated,
		 * node 0 sends ping requests and the receiver
		 * is assumed to respond back to node 0 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
static void *_cancel_step_id(void *ci)
{
	int error_code = SLURM_SUCCESS, i;
	job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
	uint32_t job_id = cancel_info->job_id;
	uint32_t step_id = cancel_info->step_id;
	bool sig_set = true;
	DEF_TIMERS;

	if (cancel_info->sig == (uint16_t)NO_VAL) {
		cancel_info->sig = SIGKILL;
		sig_set = false;
	}

	if (!cancel_info->job_id_str) {
		if (cancel_info->array_job_id &&
		    (cancel_info->array_task_id == INFINITE)) {
			xstrfmtcat(cancel_info->job_id_str, "%u_*",
				   cancel_info->array_job_id);
		} else if (cancel_info->array_job_id) {
			xstrfmtcat(cancel_info->job_id_str, "%u_%u",
				   cancel_info->array_job_id,
				   cancel_info->array_task_id);
		} else {
			xstrfmtcat(cancel_info->job_id_str, "%u",
				   cancel_info->job_id);
		}
	}

	for (i = 0; i < MAX_CANCEL_RETRY; i++) {
		if (cancel_info->sig == SIGKILL) {
			verbose("Terminating step %s.%u",
				cancel_info->job_id_str, step_id);
		} else {
			verbose("Signal %u to step %s.%u", cancel_info->sig,
				cancel_info->job_id_str, step_id);
		}

		_add_delay();
		START_TIMER;
		if ((!sig_set) || opt.ctld)
			error_code = slurm_kill_job_step(job_id, step_id,
							 cancel_info->sig);
		else if (cancel_info->sig == SIGKILL)
			error_code = slurm_terminate_job_step(job_id,
							      step_id);
		else
			error_code = slurm_signal_job_step(job_id, step_id,
							   cancel_info->sig);
		END_TIMER;
		slurm_mutex_lock(&max_delay_lock);
		max_resp_time = MAX(max_resp_time, DELTA_TIMER);
		slurm_mutex_unlock(&max_delay_lock);

		if ((error_code == 0) ||
		    ((errno != ESLURM_TRANSITION_STATE_NO_UPDATE) &&
		     (errno != ESLURM_JOB_PENDING)))
			break;
		verbose("Job is in transitional state, retrying");
		sleep(5 + i);
	}
	if (error_code) {
		error_code = slurm_get_errno();
		if ((opt.verbose > 0) || (error_code != ESLURM_ALREADY_DONE))
			error("Kill job error on job step id %s: %s",
			      cancel_info->job_id_str,
			      slurm_strerror(error_code));
		if ((error_code == ESLURM_ALREADY_DONE) &&
		    (cancel_info->sig == SIGKILL)) {
			error_code = 0;	/* ignore error if job is done */
		}
	}

	/* Purposely free the struct passed in here, so the caller doesn't
	 * have to keep track of it, but don't destroy the mutex and
	 * condition variables contained. */
	slurm_mutex_lock(cancel_info->num_active_threads_lock);
	*(cancel_info->rc) = MAX(*(cancel_info->rc), error_code);
	(*(cancel_info->num_active_threads))--;
	slurm_cond_signal(cancel_info->num_active_threads_cond);
	slurm_mutex_unlock(cancel_info->num_active_threads_lock);

	xfree(cancel_info->job_id_str);
	xfree(cancel_info);

	return NULL;
}
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", "
			    "type = %s, seq = %d",
			    nodename,
			    (PMIXP_MSG_FAN_IN == hdr->type) ?
			    "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, "
				    "current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, "
				    "current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}
		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just a health ping.
		 * TODO: can we do something more sophisticated? */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

	xfree(nodename);
}