int pmixp_stepd_finalize(void)
{
	char *path;

	if (!_was_initialized) {
		/* nothing to do */
		return 0;
	}

	pmixp_libpmix_finalize();
	pmixp_dmdx_finalize();
	pmixp_conn_fini();
	pmixp_dconn_fini();
	pmixp_state_finalize();
	pmixp_nspaces_finalize();

	/* cleanup the UNIX socket */
	PMIXP_DEBUG("Remove PMIx plugin usock");
	close(pmixp_info_srv_usock_fd());
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	unlink(path);
	xfree(path);

	/* free the information */
	pmixp_info_free();
	return SLURM_SUCCESS;
}
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution", pmixp_info_namespace(),
		    pmixp_info_nodeid());

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* change the collective state if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* save & mark the local contribution */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* check if the collective is ready to progress */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
		    pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	xassert(PMIXP_COLL_FAN_OUT == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* update the database */
	if (NULL != coll->cbfunc) {
		void *data = get_buf_data(buf) + get_buf_offset(buf);
		size_t size = remaining_buf(buf);
		PMIXP_DEBUG("%s:%d: use the callback", pmixp_info_namespace(),
			    pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     pmixp_free_Buf, (void *)buf);
	}

	/* Prepare for the next collective operation */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
		    pmixp_info_namespace(), pmixp_info_nodeid());
}
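/*
 * Hedged sketch, not part of the original listing: pmixp_free_Buf() is passed
 * to coll->cbfunc above as the release callback. It is assumed to simply free
 * the Buf once libpmix has consumed the payload; illustrative assumption only.
 */
void pmixp_free_Buf(void *x)
{
	Buf buf = (Buf)x;

	free_buf(buf);
}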
void pmixp_coll_bcast(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	_progres_fan_out(coll, buf);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* We may have already started the next collective. Try to progress!
	 * It is OK if we are in SYNC - this will be a no-op. */
	_progress_fan_in(coll);
}
static int _pmixp_server_cperf_iter(char *data, int ndata)
{
	pmixp_coll_t *coll;
	pmixp_proc_t procs;
	int cur_count = _pmixp_server_cperf_count();

	strncpy(procs.nspace, pmixp_info_namespace(), PMIXP_MAX_NSLEN);
	procs.rank = pmixp_lib_get_wildcard();

	coll = pmixp_state_coll_get(PMIXP_COLL_TYPE_FENCE, &procs, 1);
	xassert(!pmixp_coll_contrib_local(coll, data, ndata,
					  _pmixp_cperf_cbfunc, NULL));

	/* wait for the collective to complete (the counter changes) */
	while (cur_count == _pmixp_server_cperf_count()) {
		usleep(1);
	}
	return 0;
}
static void _respond_with_error(int seq_num, char *sender_host,
				char *sender_ns, int status)
{
	Buf buf = create_buf(NULL, 0);
	char *addr;
	int rc;

	/* rank doesn't matter here, don't send it */
	_setup_header(buf, DMDX_RESPONSE, pmixp_info_namespace(), -1, status);

	/* generate namespace usocket name */
	addr = pmixp_info_nspace_usock(sender_ns);

	/* send the response */
	rc = pmixp_server_send(sender_host, PMIXP_MSG_DMDX, seq_num, addr,
			       get_buf_data(buf), get_buf_offset(buf), 1);
	if (SLURM_SUCCESS != rc) {
		PMIXP_ERROR("Cannot send direct modex error response to %s",
			    sender_host);
	}
	xfree(addr);
	free_buf(buf);
}
static void _setup_header(Buf buf, dmdx_type_t t, const char *nspace, int rank,
			  int status)
{
	char *str;

	/* 1. pack the message type */
	unsigned char type = (char)t;
	grow_buf(buf, sizeof(char));
	pack8(type, buf);

	/* 2. pack the namespace _with_ the terminating '\0'
	 * (i.e. strlen(nspace) + 1 bytes) */
	packmem((char *)nspace, strlen(nspace) + 1, buf);

	/* 3. pack the rank */
	grow_buf(buf, sizeof(int));
	pack32((uint32_t)rank, buf);

	/* 4. pack my rendezvous point - the local namespace,
	 * again _with_ the terminating '\0' (strlen(str) + 1 bytes) */
	str = pmixp_info_namespace();
	packmem(str, strlen(str) + 1, buf);

	/* 5. pack the status */
	pack32((uint32_t)status, buf);
}
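/*
 * Hedged sketch, not part of the original listing: _read_info() (used by
 * _dmdx_req() below) is assumed to unpack the fields in the same order that
 * _setup_header() packs them, with the message-type byte already consumed by
 * the dispatcher. The helper name and error handling here are illustrative
 * assumptions only.
 */
static int _read_info_sketch(Buf buf, char **ns, int *rank, char **sender_ns,
			     int *status)
{
	uint32_t cnt, tmp;

	/* target namespace, packed together with its '\0' */
	if (SLURM_SUCCESS != unpackmem_ptr(ns, &cnt, buf))
		return SLURM_ERROR;
	/* requested rank */
	if (SLURM_SUCCESS != unpack32(&tmp, buf))
		return SLURM_ERROR;
	*rank = (int)tmp;
	/* sender namespace (rendezvous point), also '\0'-terminated */
	if (SLURM_SUCCESS != unpackmem_ptr(sender_ns, &cnt, buf))
		return SLURM_ERROR;
	/* status */
	if (SLURM_SUCCESS != unpack32(&tmp, buf))
		return SLURM_ERROR;
	*status = (int)tmp;

	return SLURM_SUCCESS;
}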
static void _dmdx_req(Buf buf, char *sender_host, uint32_t seq_num)
{
	int rank, rc;
	int status;
	char *ns = NULL, *sender_ns = NULL;
	pmixp_namespace_t *nsptr;
	dmdx_caddy_t *caddy = NULL;

	rc = _read_info(buf, &ns, &rank, &sender_ns, &status);
	if (SLURM_SUCCESS != rc) {
		/* there is not much we can do here,
		 * but data corruption shouldn't happen */
		PMIXP_ERROR("Failed to unpack header data in request from %s, rc = %d",
			    sender_host, rc);
		goto exit;
	}

	if (0 != xstrcmp(ns, pmixp_info_namespace())) {
		/* A request for a namespace that is not controlled by this
		 * daemon is considered an error. This may change in the
		 * future. */
		PMIXP_ERROR("Bad request from %s: asked for nspace = %s, mine is %s",
			    sender_host, ns, pmixp_info_namespace());
		_respond_with_error(seq_num, sender_host, sender_ns,
				    PMIX_ERR_INVALID_NAMESPACE);
		goto exit;
	}

	nsptr = pmixp_nspaces_local();
	if (nsptr->ntasks <= rank) {
		PMIXP_ERROR("Bad request from %s: nspace \"%s\" has only %d ranks, asked for %d",
			    sender_host, ns, nsptr->ntasks, rank);
		_respond_with_error(seq_num, sender_host, sender_ns,
				    PMIX_ERR_BAD_PARAM);
		goto exit;
	}

	/* setup a temporary structure to carry the information to
	 * _dmdx_pmix_cb */
	caddy = xmalloc(sizeof(dmdx_caddy_t));
	caddy->seq_num = seq_num;

	/* ns is a pointer inside the incoming buffer */
	strncpy(caddy->proc.nspace, ns, PMIX_MAX_NSLEN);
	ns = NULL; /* protect the data */
	caddy->proc.rank = rank;

	/* sender_host was passed from outside - copy it */
	caddy->sender_host = xstrdup(sender_host);
	sender_host = NULL; /* protect the data */

	/* sender_ns is a pointer inside the incoming buffer */
	caddy->sender_ns = xstrdup(sender_ns);
	sender_ns = NULL;

	rc = PMIx_server_dmodex_request(&caddy->proc, _dmdx_pmix_cb,
					(void *)caddy);
	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Can't request modex data from libpmix-server, "
			    "requesting host = %s, nspace = %s, rank = %d, rc = %d",
			    caddy->sender_host, caddy->proc.nspace,
			    caddy->proc.rank, rc);
		_respond_with_error(seq_num, caddy->sender_host,
				    caddy->sender_ns, rc);
		_dmdx_free_caddy(caddy);
	}

exit:
	/* we don't need this buffer anymore */
	free_buf(buf);

	/* No sense in returning errors: the engine can't do anything anyway.
	 * We've notified libpmix, and that's enough. */
}
int pmixp_stepd_init(const stepd_step_rec_t *job, char ***env)
{
	char *path;
	int fd, rc;

	if (SLURM_SUCCESS != (rc = pmixp_info_set(job, env))) {
		PMIXP_ERROR("pmixp_info_set(job, env) failed");
		goto err_info;
	}

	/* Create UNIX socket for slurmd communication */
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	if (NULL == path) {
		PMIXP_ERROR("pmixp_info_nspace_usock: out-of-memory");
		rc = SLURM_ERROR;
		goto err_path;
	}
	if ((fd = pmixp_usock_create_srv(path)) < 0) {
		PMIXP_ERROR("pmixp_usock_create_srv");
		rc = SLURM_ERROR;
		goto err_usock;
	}
	fd_set_close_on_exec(fd);
	pmixp_info_srv_usock_set(path, fd);

	if (!pmixp_info_same_arch()) {
		_direct_proto.hdr_unpack_cb = _direct_hdr_unpack_portable;
		_direct_hdr_pack = _direct_hdr_pack_portable;
	}

	pmixp_conn_init(_slurm_proto, _direct_proto);

	if ((rc = pmixp_dconn_init(pmixp_info_nodes_uni(), _direct_proto))) {
		PMIXP_ERROR("pmixp_dconn_init() failed");
		goto err_dconn;
	}

	if ((rc = pmixp_nspaces_init())) {
		PMIXP_ERROR("pmixp_nspaces_init() failed");
		goto err_nspaces;
	}

	if (SLURM_SUCCESS != (rc = pmixp_state_init())) {
		PMIXP_ERROR("pmixp_state_init() failed");
		goto err_state;
	}

	if (SLURM_SUCCESS != (rc = pmixp_dmdx_init())) {
		PMIXP_ERROR("pmixp_dmdx_init() failed");
		goto err_dmdx;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_init())) {
		PMIXP_ERROR("pmixp_libpmix_init() failed");
		goto err_lib;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_job_set())) {
		PMIXP_ERROR("pmixp_libpmix_job_set() failed");
		goto err_job;
	}

	pmixp_server_init_pp(env);
	pmixp_server_init_cperf(env);

	xfree(path);
	_was_initialized = 1;
	return SLURM_SUCCESS;

err_job:
	pmixp_libpmix_finalize();
err_lib:
	pmixp_dmdx_finalize();
err_dmdx:
	pmixp_state_finalize();
err_state:
	pmixp_nspaces_finalize();
err_nspaces:
	pmixp_dconn_fini();
err_dconn:
	pmixp_conn_fini();
	close(pmixp_info_srv_usock_fd());
err_usock:
	xfree(path);
err_path:
	pmixp_info_free();
err_info:
	return rc;
}
int pmixp_stepd_init(const stepd_step_rec_t *job, char ***env)
{
	char *path;
	int fd, rc;

	if (SLURM_SUCCESS != (rc = pmixp_info_set(job, env))) {
		PMIXP_ERROR("pmixp_info_set(job, env) failed");
		return rc;
	}

	/* Create UNIX socket for slurmd communication */
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	if (NULL == path) {
		PMIXP_ERROR("Out-of-memory");
		rc = SLURM_ERROR;
		goto err_path;
	}
	if ((fd = pmixp_usock_create_srv(path)) < 0) {
		rc = SLURM_ERROR;
		goto err_usock;
	}
	fd_set_close_on_exec(fd);
	pmixp_info_srv_contacts(path, fd);

	if (SLURM_SUCCESS != (rc = pmixp_nspaces_init())) {
		PMIXP_ERROR("pmixp_nspaces_init() failed");
		goto err_usock;
	}

	if (SLURM_SUCCESS != (rc = pmixp_state_init())) {
		PMIXP_ERROR("pmixp_state_init() failed");
		goto err_state;
	}

	if (SLURM_SUCCESS != (rc = pmixp_dmdx_init())) {
		PMIXP_ERROR("pmixp_dmdx_init() failed");
		goto err_dmdx;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_init())) {
		PMIXP_ERROR("pmixp_libpmix_init() failed");
		goto err_lib;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_job_set())) {
		PMIXP_ERROR("pmixp_libpmix_job_set() failed");
		goto err_job;
	}

	xfree(path);
	_was_initialized = 1;
	return SLURM_SUCCESS;

err_job:
	pmixp_libpmix_finalize();
err_lib:
	pmixp_dmdx_finalize();
err_dmdx:
	pmixp_state_finalize();
err_state:
	pmixp_nspaces_finalize();
err_usock:
	xfree(path);
err_path:
	pmixp_info_free();
	return rc;
}
int pmixp_libpmix_job_set(void)
{
	List lresp;
	pmix_info_t *info;
	int ninfo;
	ListIterator it;
	pmix_info_t *kvp;
	int i, rc;
	uid_t uid = pmixp_info_jobuid();
	gid_t gid = pmixp_info_jobgid();
	_register_caddy_t *register_caddy;

	register_caddy = xmalloc(sizeof(_register_caddy_t) *
				 (pmixp_info_tasks_loc() + 1));
	pmixp_debug_hang(0);

	/* Use a list to safely expand/reduce the set of key-value pairs. */
	lresp = list_create(pmixp_xfree_xmalloced);

	_general_proc_info(lresp);
	_set_tmpdirs(lresp);
	_set_procdatas(lresp);
	_set_sizeinfo(lresp);

	if (SLURM_SUCCESS != _set_mapsinfo(lresp)) {
		list_destroy(lresp);
		PMIXP_ERROR("Can't build nodemap");
		return SLURM_ERROR;
	}

	_set_localinfo(lresp);

	ninfo = list_count(lresp);
	PMIX_INFO_CREATE(info, ninfo);

	it = list_iterator_create(lresp);
	i = 0;
	while (NULL != (kvp = list_next(it))) {
		info[i] = *kvp;
		i++;
	}
	list_destroy(lresp);

	/* register the namespace */
	register_caddy[0].active = 1;
	rc = PMIx_server_register_nspace(pmixp_info_namespace(),
					 pmixp_info_tasks_loc(), info, ninfo,
					 _release_cb, &register_caddy[0]);
	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Cannot register namespace %s, nlocalproc=%d, ninfo = %d",
			    pmixp_info_namespace(), pmixp_info_tasks_loc(),
			    ninfo);
		return SLURM_ERROR;
	}

	PMIXP_DEBUG("task initialization");

	/* register each local client (task) */
	for (i = 0; i < pmixp_info_tasks_loc(); i++) {
		pmix_proc_t proc;
		register_caddy[i + 1].active = 1;
		strncpy(proc.nspace, pmixp_info_namespace(), PMIX_MAX_NSLEN);
		proc.rank = pmixp_info_taskid(i);
		rc = PMIx_server_register_client(&proc, uid, gid, NULL,
						 _release_cb,
						 &register_caddy[i + 1]);
		if (PMIX_SUCCESS != rc) {
			PMIXP_ERROR("Cannot register client %d(%d) in namespace %s",
				    pmixp_info_taskid(i), i,
				    pmixp_info_namespace());
			return SLURM_ERROR;
		}
	}

	/* wait for all registration actions to finish */
	while (1) {
		int exit_flag = 1;
		struct timespec ts;
		ts.tv_sec = 0;
		ts.tv_nsec = 100;

		for (i = 0; i < pmixp_info_tasks_loc() + 1; i++) {
			if (register_caddy[i].active) {
				exit_flag = 0;
			}
		}
		if (exit_flag) {
			break;
		}
		nanosleep(&ts, NULL);
	}

	PMIX_INFO_FREE(info, ninfo);
	xfree(register_caddy);
	return SLURM_SUCCESS;
}
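/*
 * Hedged sketch, not part of the original listing: the _register_caddy_t
 * structure and _release_cb() referenced above are assumed to form a simple
 * completion flag that the PMIx operation callback clears once the
 * corresponding registration finishes, which is what the busy-wait loop in
 * pmixp_libpmix_job_set() polls for. Illustrative assumption only.
 */
typedef struct {
	volatile int active;
} _register_caddy_t;

static void _release_cb(pmix_status_t status, void *cbdata)
{
	_register_caddy_t *caddy = (_register_caddy_t *)cbdata;

	/* registration for this nspace/client is done, let the waiter exit */
	caddy->active = 0;
}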
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
		    pmixp_info_namespace(), pmixp_info_nodeid(),
		    coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of a race condition between the libpmix and
		 * slurm threads, progress_fan_in can be called
		 * after we moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local ||
	    coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
				coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
		    pmixp_info_nodeid(), hostlist);

	/* Check for the singleton case */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where the parent
			 * is not ready to receive the messages.
			 * Use a zero-size message to check the parent status
			 * first and then send the full message. */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				       get_buf_data(coll->buf),
				       get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), to hostlist:\n%s",
				    (uint64_t) get_buf_offset(coll->buf),
				    hostlist);
			/* Return an error indication to PMIx. Nodes that
			 * haven't received the data will exit by a timeout.
			 * FIXME: do we need to do something with the nodes
			 * that finished successfully? */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* If we are the root - push the data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename was
	 * part of the hostlist. However this may lead to undesired side
	 * effects: we are blocked here sending data and cannot receive (it
	 * will be triggered in this thread after we leave this callback), so
	 * we would have to rely on buffering on the SLURM side.
	 * Better not to do so. */
	if (NULL == coll->parent_host) {
		/* If I am the root - pass the data to PMIx and reset the
		 * collective here. The payload (root_buf) excludes the
		 * reserved server header copied out above. */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	/* fix the collective status if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if (PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN (next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we can receive
	 * the same contribution a second time. Avoid duplicates by checking
	 * our records. */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselves if we are running with no asserts */
		goto proceed;
	}
	if (0 < coll->ch_contribs[nodeid]) {
		/* Should be 0 or 1. If it is greater - transmission skew,
		 * ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
			    nodeid, nodename);
		/* this is a duplicate, skip it */
		goto proceed;
	}

	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase the number of individual contributions */
	coll->ch_contribs[nodeid]++;

	/* increase the total number of contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	if (PMIXP_COLL_FAN_IN == coll->state) {
		/* make progress if we are in the fan-in state */
		_progress_fan_in(coll);
	}

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	}

	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
		    state);

	return SLURM_SUCCESS;
}