/*
 * Validate an incoming ring-collective message against the collective
 * state: the sender must be our ring predecessor and the sequence number
 * must match the current collective.
 *
 * Returns SLURM_SUCCESS if the message should be processed (or if the
 * step is being killed due to a fatal sequence mismatch — matching the
 * original control flow), SLURM_ERROR if the message must be dropped.
 */
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		/* FIX: nodename was leaked on this path (xmalloc'd by
		 * pmixp_info_job_host, never freed) */
		xfree(nodename);
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
				    SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Wrong collective seq. #%d from nodeid %u, current is %d, skip this message",
			    hdr->seq, hdr->nodeid, coll->seq);
		/* FIX: nodename was allocated but leaked in debug builds
		 * (and is not even used by the message above) */
		xfree(nodename);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
/*
 * PMIx (v1-style) error-handler callback: on any reported PMIx error we
 * simply terminate the whole step — no recovery is attempted yet.
 */
static void errhandler(pmix_status_t status, pmix_proc_t proc[], size_t nproc,
		       pmix_info_t info[], size_t ninfo)
{
	/* TODO: do something more sophisticated here */
	/* FIXME: use proper specificator for nranges */
	PMIXP_ERROR_STD("Error handler invoked: status = %d, nranges = %d",
			status, (int) nproc);

	/* no fault tolerance for now: bring the step down */
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
/*
 * PMIx server "abort" callback: a client asked to abort the application.
 * We kill the whole step and acknowledge the request.
 *
 * Returns PMIX_SUCCESS always; the completion callback (if provided) is
 * invoked with PMIX_SUCCESS as well.
 */
static pmix_status_t abort_fn(const pmix_proc_t *proc, void *server_object,
			      int status, const char msg[],
			      pmix_proc_t procs[], size_t nprocs,
			      pmix_op_cbfunc_t cbfunc, void *cbdata)
{
	/* Just kill this stepid for now. Think what we can do for FT here? */
	/* FIX: msg may be NULL (the PMIx abort message is optional);
	 * passing NULL to a "%s" conversion is undefined behavior */
	PMIXP_DEBUG("called: status = %d, msg = %s", status,
		    msg ? msg : "(none)");
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);

	if (NULL != cbfunc) {
		cbfunc(PMIX_SUCCESS, cbdata);
	}
	return PMIX_SUCCESS;
}
/*
 * PMIx (v2-style) event-notification handler: any reported event kills
 * the step — no finer-grained fault handling is implemented yet.
 */
static void _errhandler(size_t evhdlr_registration_id, pmix_status_t status,
			const pmix_proc_t *source, pmix_info_t info[],
			size_t ninfo, pmix_info_t *results, size_t nresults,
			pmix_event_notification_cbfunc_fn_t cbfunc,
			void *cbdata)
{
	/* TODO: do something more sophisticated here */
	/* FIXME: use proper specificator for nranges */
	PMIXP_ERROR_STD("Error handler invoked: status = %d", status);

	/* no fault tolerance for now: bring the step down */
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
}
int pmixp_info_set(const stepd_step_rec_t *job, char ***env) { int i, rc; size_t msize; memset(&_pmixp_job_info, 0, sizeof(_pmixp_job_info)); #ifndef NDEBUG _pmixp_job_info.magic = PMIX_INFO_MAGIC; #endif /* security info */ _pmixp_job_info.uid = job->uid; _pmixp_job_info.gid = job->gid; /* This node info */ _pmixp_job_info.jobid = job->jobid; _pmixp_job_info.stepid = job->stepid; _pmixp_job_info.node_id = job->nodeid; _pmixp_job_info.node_tasks = job->node_tasks; /* Global info */ _pmixp_job_info.ntasks = job->ntasks; _pmixp_job_info.nnodes = job->nnodes; msize = sizeof(*_pmixp_job_info.task_cnts) * job->nnodes; _pmixp_job_info.task_cnts = xmalloc(msize); for (i = 0; i < job->nnodes; i++) { _pmixp_job_info.task_cnts[i] = job->task_cnts[i]; } msize = _pmixp_job_info.node_tasks * sizeof(uint32_t); _pmixp_job_info.gtids = xmalloc(msize); for (i = 0; i < job->node_tasks; i++) { _pmixp_job_info.gtids[i] = job->task[i]->gtid; } /* Setup hostnames and job-wide info */ if ((rc = _resources_set(env))) { return rc; } if ((rc = _env_set(env))) { return rc; } snprintf(_pmixp_job_info.nspace, PMIX_MAX_NSLEN, "slurm.pmix.%d.%d", pmixp_info_jobid(), pmixp_info_stepid()); return SLURM_SUCCESS; }
/*
 * Dispatch one inbound server-to-server message (direct-connect
 * protocol) according to its header type.
 *
 * Buffer ownership: `buf` is released at the `exit` label by default.
 * The DMDX handler takes ownership of the buffer's data, so that branch
 * replaces `buf` with an empty one before falling through to the
 * common free (see the comment in that case).
 */
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT:
	{
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		/* decode the collective descriptor that prefixes the
		 * contribution payload */
		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
			     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application
			 * hang, so kill the step outright.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			/* stale/duplicate message: drop it silently */
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		/* fan-in = contribution travelling up the tree,
		 * fan-out = result travelling back down */
		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX:
	{
		/* direct-modex (key-value) request/response */
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain encapsulation - in general `buf` is
		 * not a pointer, but an opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		/* handshake only; nothing to consume from the payload */
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG:
	{
		/* if the pingpong mode was activated -
		 * node 0 sends ping requests
		 * and receiver assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				/* start timing once the warmup rounds
				 * are done */
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	/* common cleanup: releases the (possibly voided) buffer */
	free_buf(buf);
}
static int _env_set(char ***env) { char *p = NULL; /* ----------- Temp directories settings ------------- */ /* * FIXME: This is dangerous to set this from the user environment. * I was using this to debug in linux containers * On real hardware each node has it's own separate /tmp directory */ /* set server temp directory - change this process environment */ p = getenvp(*env, PMIXP_TMPDIR_SRV); if (NULL != p) { setenv(PMIXP_OS_TMPDIR_ENV, p, 1); } p = getenv(PMIXP_OS_TMPDIR_ENV); if (NULL == p) { p = PMIXP_TMPDIR_DEFAULT; } _pmixp_job_info.lib_tmpdir = xstrdup_printf("%s/pmix.%d.%d/", p, pmixp_info_jobid(), pmixp_info_stepid()); /* save client temp directory if requested * TODO: We want to get TmpFS value as well if exists. * Need to sync with SLURM developers. */ p = getenvp(*env, PMIXP_TMPDIR_CLI); if (NULL != p) { _pmixp_job_info.cli_tmpdir = xstrdup(p); } else { p = slurm_get_tmp_fs(); if (NULL != p) { _pmixp_job_info.cli_tmpdir = p; } } /* ----------- Timeout setting ------------- */ /* TODO: also would be nice to have a cluster-wide setting in SLURM */ _pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT; p = getenvp(*env, PMIXP_TIMEOUT); if (NULL != p) { int tmp; tmp = atoi(p); if (tmp > 0) { _pmixp_job_info.timeout = tmp; } } /* ----------- Forward PMIX settings ------------- */ /* FIXME: this may be intrusive as well as PMIx library will create * lots of output files in /tmp by default. * somebody can use this or annoyance */ p = getenvp(*env, PMIXP_PMIXLIB_DEBUG); if (NULL != p) { setenv(PMIXP_PMIXLIB_DEBUG, p, 1); /* output into the file since we are in slurmstepd * and stdout is muted. * One needs to check TMPDIR for the results */ setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1); } return SLURM_SUCCESS; }
/*
 * Dispatch one inbound server-to-server message (hostname-based
 * protocol) according to its header type.
 *
 * Buffer ownership: `buf` wraps the caller-supplied payload; each branch
 * is responsible for freeing it (or handing it to a consumer that will),
 * matching the convention used on the success paths below.
 */
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT:
	{
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			/* FIX: the original `return` leaked both buf and
			 * nodename */
			free_buf(buf);
			xfree(nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename,
			    (PMIXP_MSG_FAN_IN == hdr->type) ?
			    "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application
			 * hang, so kill the step outright.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			/* FIX: release the buffer; it is unused past
			 * this point */
			free_buf(buf);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}
		break;
	}
	case PMIXP_MSG_DMDX:
	{
		/* buf ownership passes to the DMDX machinery */
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK:
	{
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		/* FIX: unknown messages leaked the buffer */
		free_buf(buf);
		break;
	}

	xfree(nodename);
}
static int _env_set(char ***env) { char *p = NULL; xassert(_pmixp_job_info.hostname); _pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL); _pmixp_job_info.lib_tmpdir = slurm_conf_expand_slurmd_path( _pmixp_job_info.server_addr_unfmt, _pmixp_job_info.hostname); xstrfmtcat(_pmixp_job_info.server_addr_unfmt, "/stepd.slurm.pmix.%d.%d", pmixp_info_jobid(), pmixp_info_stepid()); _pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir); /* ----------- Temp directories settings ------------- */ xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/", pmixp_info_jobid(), pmixp_info_stepid()); /* save client temp directory if requested * TODO: We want to get TmpFS value as well if exists. * Need to sync with SLURM developers. */ p = getenvp(*env, PMIXP_TMPDIR_CLI); if (p) _pmixp_job_info.cli_tmpdir_base = xstrdup(p); else _pmixp_job_info.cli_tmpdir_base = slurm_get_tmp_fs( _pmixp_job_info.hostname); _pmixp_job_info.cli_tmpdir = xstrdup_printf("%s/spmix_appdir_%d.%d", _pmixp_job_info.cli_tmpdir_base, pmixp_info_jobid(), pmixp_info_stepid()); /* ----------- Timeout setting ------------- */ /* TODO: also would be nice to have a cluster-wide setting in SLURM */ _pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT; p = getenvp(*env, PMIXP_TIMEOUT); if (NULL != p) { int tmp; tmp = atoi(p); if (tmp > 0) { _pmixp_job_info.timeout = tmp; } } /* ----------- Forward PMIX settings ------------- */ /* FIXME: this may be intrusive as well as PMIx library will create * lots of output files in /tmp by default. * somebody can use this or annoyance */ p = getenvp(*env, PMIXP_PMIXLIB_DEBUG); if (NULL != p) { setenv(PMIXP_PMIXLIB_DEBUG, p, 1); /* output into the file since we are in slurmstepd * and stdout is muted. * One needs to check TMPDIR for the results */ setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1); } return SLURM_SUCCESS; }
/*
 * information about relative ranks as assigned by the RM
 */
/*
 * Append job-level keys (PMIX_JOBID, PMIX_NODEID) and one
 * PMIX_PROC_DATA composite key per task to `lresp`.
 *
 * Each task's per-rank keys are first collected in a temporary
 * `rankinfo` list and then shallow-copied into a pmix_info_t array that
 * becomes the PMIX_PROC_DATA value (see the ownership note below).
 */
static void _set_procdatas(List lresp)
{
	pmixp_namespace_t *nsptr = pmixp_nspaces_local();
	pmix_info_t *kvp, *tkvp;
	char *p = NULL;
	int i;

	/* (char*) jobid assigned by scheduler */
	xstrfmtcat(p, "%d.%d", pmixp_info_jobid(), pmixp_info_stepid());
	PMIXP_ALLOC_KEY(kvp, PMIX_JOBID);
	PMIX_VAL_SET(&kvp->value, string, p);
	xfree(p);
	list_append(lresp, kvp);

	PMIXP_ALLOC_KEY(kvp, PMIX_NODEID);
	PMIX_VAL_SET(&kvp->value, uint32_t, nsptr->node_id);
	list_append(lresp, kvp);

	/* store information about local processes */
	for (i = 0; i < pmixp_info_tasks(); i++) {
		List rankinfo;
		ListIterator it;
		int count, j, localid, nodeid;
		char *nodename;
		pmix_info_t *info;

		rankinfo = list_create(pmixp_xfree_xmalloced);

		PMIXP_ALLOC_KEY(kvp, PMIX_RANK);
		PMIX_VAL_SET(&kvp->value, int, i);
		list_append(rankinfo, kvp);

		/* TODO: always use 0 so far. this is not the general case
		 * though (see SLURM MIMD: man srun, section MULTIPLE PROGRAM
		 * CONFIGURATION)
		 */
		PMIXP_ALLOC_KEY(kvp, PMIX_APPNUM);
		PMIX_VAL_SET(&kvp->value, int, 0);
		list_append(rankinfo, kvp);

		/* TODO: the same as for previous here */
		PMIXP_ALLOC_KEY(kvp, PMIX_APPLDR);
		PMIX_VAL_SET(&kvp->value, int, 0);
		list_append(rankinfo, kvp);

		/* TODO: fix when several apps will appear */
		PMIXP_ALLOC_KEY(kvp, PMIX_GLOBAL_RANK);
		PMIX_VAL_SET(&kvp->value, uint32_t, i);
		list_append(rankinfo, kvp);

		/* TODO: fix when several apps will appear */
		PMIXP_ALLOC_KEY(kvp, PMIX_APP_RANK);
		PMIX_VAL_SET(&kvp->value, uint32_t, i);
		list_append(rankinfo, kvp);

		/* negative localid means the task runs on another node */
		localid = pmixp_info_taskid2localid(i);
		/* this rank is local, store local info ab't it!
		 */
		if (0 <= localid) {
			PMIXP_ALLOC_KEY(kvp, PMIX_LOCAL_RANK);
			PMIX_VAL_SET(&kvp->value, uint16_t, localid);
			list_append(rankinfo, kvp);

			/* TODO: fix when several apps will appear */
			PMIXP_ALLOC_KEY(kvp, PMIX_NODE_RANK);
			PMIX_VAL_SET(&kvp->value, uint16_t, localid);
			list_append(rankinfo, kvp);
		}

		/* map the task to its host via the namespace task map */
		nodeid = nsptr->task_map[i];
		/* hostlist_nth() returns malloc'd memory, hence free()
		 * (not xfree) below */
		nodename = hostlist_nth(nsptr->hl, nodeid);
		PMIXP_ALLOC_KEY(kvp, PMIX_HOSTNAME);
		PMIX_VAL_SET(&kvp->value, string, nodename);
		list_append(rankinfo, kvp);
		free(nodename);

		/* merge rankinfo into one PMIX_PROC_DATA key */
		count = list_count(rankinfo);
		PMIXP_ALLOC_KEY(kvp, PMIX_PROC_DATA);
		kvp->value.type = PMIX_INFO_ARRAY;
		kvp->value.data.array.size = count;
		PMIX_INFO_CREATE(info, count);
		it = list_iterator_create(rankinfo);
		j = 0;
		while (NULL != (tkvp = list_next(it))) {
			/* Just copy all the fields here. We will free
			 * original kvp's using list_destroy without free'ing
			 * their fields so it is safe to do so.
			 */
			info[j] = *tkvp;
			j++;
		}
		/* NOTE(review): shallow copy above transfers ownership of
		 * each kvp's internal pointers to `info`;
		 * pmixp_xfree_xmalloced must only free the kvp containers.
		 * The iterator `it` is presumably reclaimed by
		 * list_destroy() — TODO confirm against SLURM's list API. */
		list_destroy(rankinfo);
		kvp->value.data.array.array = (pmix_info_t *)info;
		info = NULL;

		/* put the complex key to the list */
		list_append(lresp, kvp);
	}
}