int pmixp_server_pp_send(int nodeid, int size)
{
	Buf buf = pmixp_server_buf_new();
	int rc;
	pmixp_ep_t ep;
	struct pp_cbdata *cbdata = xmalloc(sizeof(*cbdata));

	grow_buf(buf, size);
	ep.type = PMIXP_EP_NOIDEID;
	ep.ep.nodeid = nodeid;
	cbdata->buf = buf;
	cbdata->size = size;
	set_buf_offset(buf, get_buf_offset(buf) + size);
	rc = pmixp_server_send_nb(&ep, PMIXP_MSG_PINGPONG,
				  _pmixp_pp_count, buf,
				  pingpong_complete, (void *)cbdata);
	if (SLURM_SUCCESS != rc) {
		char *nodename = pmixp_info_job_host(nodeid);
		PMIXP_ERROR("Was unable to send the ping-pong message to %s",
			    nodename);
		xfree(nodename);
	}
	return rc;
}
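
/*
 * A minimal sketch (an assumption, not the upstream implementation) of the
 * completion path that pmixp_server_pp_send() above relies on: the cbdata
 * carries the buffer and the payload size, and the send-completion callback
 * releases both once the non-blocking send has finished.  The simplified
 * callback signature (rc, user data) and the buffer-ownership policy are
 * assumptions inferred from how pmixp_server_send_nb() is invoked here.
 */
struct pp_cbdata {
	Buf buf;	/* message buffer, owned until the send completes */
	int size;	/* payload size, kept for accounting */
};

static void pingpong_complete(int rc, void *_vcbdata)
{
	struct pp_cbdata *d = (struct pp_cbdata *)_vcbdata;

	if (SLURM_SUCCESS != rc) {
		PMIXP_ERROR("ping-pong send completion failed: %d", rc);
	}
	/* assumption: the callback owns the buffer once the engine is done */
	free_buf(d->buf);
	xfree(d);
}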
static int _ring_forward_data(pmixp_coll_ring_ctx_t *coll_ctx,
			      uint32_t contrib_id, uint32_t hop_seq,
			      void *data, size_t size)
{
	pmixp_coll_ring_msg_hdr_t hdr;
	pmixp_coll_t *coll = _ctx_get_coll(coll_ctx);
	pmixp_coll_ring_t *ring = &coll->state.ring;
	hdr.nodeid = coll->my_peerid;
	hdr.msgsize = size;
	hdr.seq = coll_ctx->seq;
	hdr.hop_seq = hop_seq;
	hdr.contrib_id = contrib_id;
	pmixp_ep_t *ep = (pmixp_ep_t *)xmalloc(sizeof(*ep));
	pmixp_coll_ring_cbdata_t *cbdata = NULL;
	uint32_t offset = 0;
	Buf buf = _get_fwd_buf(coll_ctx);
	int rc = SLURM_SUCCESS;

	pmixp_coll_ring_ctx_sanity_check(coll_ctx);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: transit data to nodeid=%d, seq=%d, hop=%d, size=%lu, contrib=%d",
		    coll_ctx, _ring_next_id(coll), hdr.seq, hdr.hop_seq,
		    hdr.msgsize, hdr.contrib_id);
#endif
	if (!buf) {
		rc = SLURM_ERROR;
		goto exit;
	}
	ep->type = PMIXP_EP_NOIDEID;
	ep->ep.nodeid = ring->next_peerid;

	/* pack ring info */
	_pack_coll_ring_info(coll, &hdr, buf);

	/* insert payload to buf */
	offset = get_buf_offset(buf);
	pmixp_server_buf_reserve(buf, size);
	memcpy(get_buf_data(buf) + offset, data, size);
	set_buf_offset(buf, offset + size);

	cbdata = xmalloc(sizeof(pmixp_coll_ring_cbdata_t));
	cbdata->buf = buf;
	cbdata->coll = coll;
	cbdata->coll_ctx = coll_ctx;
	cbdata->seq = coll_ctx->seq;
	rc = pmixp_server_send_nb(ep, PMIXP_MSG_RING, coll_ctx->seq, buf,
				  _ring_sent_cb, cbdata);
exit:
	return rc;
}
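
/*
 * A sketch (assumed layout, not the authoritative definition) of the ring
 * message header filled in by _ring_forward_data() above.  The field set is
 * taken from the assignments in that function; the exact field types, their
 * ordering, and the wire format produced by _pack_coll_ring_info() are
 * assumptions made for illustration only.
 */
typedef struct {
	uint32_t nodeid;	/* peer id of the sending node */
	uint32_t seq;		/* ring collective sequence number */
	uint32_t hop_seq;	/* hop counter along the ring */
	uint32_t contrib_id;	/* id of the original contributor */
	size_t msgsize;		/* payload size following the header */
} pmixp_coll_ring_msg_hdr_t;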
static int _progress_ufwd(pmixp_coll_t *coll)
{
	pmixp_ep_t ep[coll->chldrn_cnt];
	int ep_cnt = 0;
	int rc, i;
	char *nodename = NULL;
	pmixp_coll_cbdata_t *cbdata = NULL;

	xassert(PMIXP_COLL_UPFWD == coll->state);
	/* for some reason it doesn't switch to downfwd */

	switch (coll->ufwd_status) {
	case PMIXP_COLL_SND_FAILED:
		/* something went wrong with upward send.
		 * notify libpmix about that and abort
		 * collective */
		if (coll->cbfunc) {
			coll->cbfunc(PMIX_ERROR, NULL, 0, coll->cbdata,
				     NULL, NULL);
		}
		_reset_coll(coll);
		/* Don't need to do anything else */
		return false;
	case PMIXP_COLL_SND_ACTIVE:
		/* still waiting for the send completion */
		return false;
	case PMIXP_COLL_SND_DONE:
		if (coll->contrib_prnt) {
			/* all-set to go to the next stage */
			break;
		}
		return false;
	default:
		/* Should not happen, fatal error */
		abort();
	}

	/* We can now reset the upward part for the next collective */
	_reset_coll_ufwd(coll);

	/* move to the next state */
	coll->state = PMIXP_COLL_DOWNFWD;
	coll->dfwd_status = PMIXP_COLL_SND_ACTIVE;
	if (!pmixp_info_srv_direct_conn()) {
		/* only root of the tree should get here */
		xassert(0 > coll->prnt_peerid);
		if (coll->chldrn_cnt) {
			/* We can run on just one node */
			ep[ep_cnt].type = PMIXP_EP_HLIST;
			ep[ep_cnt].ep.hostlist = coll->chldrn_str;
			ep_cnt++;
		}
	} else {
		for (i = 0; i < coll->chldrn_cnt; i++) {
			ep[i].type = PMIXP_EP_NOIDEID;
			ep[i].ep.nodeid = coll->chldrn_ids[i];
			ep_cnt++;
		}
	}

	/* We need to wait for ep_cnt send completions + the local callback */
	coll->dfwd_cb_wait = ep_cnt;

	if (ep_cnt || coll->cbfunc) {
		/* allocate the callback data */
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = ep_cnt;
		if (coll->cbfunc) {
			cbdata->refcntr++;
		}
	}

	for (i = 0; i < ep_cnt; i++) {
		rc = pmixp_server_send_nb(&ep[i], PMIXP_MSG_FAN_OUT,
					  coll->seq, coll->dfwd_buf,
					  _dfwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			if (PMIXP_EP_NOIDEID == ep[i].type) {
				nodename = pmixp_info_job_host(ep[i].ep.nodeid);
				PMIXP_ERROR("Cannot send data (size = %lu), "
					    "to %s:%d",
					    (uint64_t) get_buf_offset(coll->dfwd_buf),
					    nodename, ep[i].ep.nodeid);
				xfree(nodename);
			} else {
				PMIXP_ERROR("Cannot send data (size = %lu), "
					    "to %s",
					    (uint64_t) get_buf_offset(coll->dfwd_buf),
					    ep[i].ep.hostlist);
			}
			coll->dfwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		if (PMIXP_EP_NOIDEID == ep[i].type) {
			nodename = pmixp_info_job_host(ep[i].ep.nodeid);
			PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
				    coll, nodename, ep[i].ep.nodeid,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
			xfree(nodename);
		} else {
			PMIXP_DEBUG("%p: fwd to %s, size = %lu",
				    coll, ep[i].ep.hostlist,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
		}
#endif
	}

	if (coll->cbfunc) {
		char *data = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		size_t size = get_buf_offset(coll->dfwd_buf) -
			coll->dfwd_offset;
		coll->dfwd_cb_wait++;
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     _libpmix_cb, (void *)cbdata);
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: local delivery, size = %lu",
			    coll, (uint64_t)size);
#endif
	}

	/* events observed - need another iteration */
	return true;
}
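
/*
 * A sketch (assumption) of how the shared pmixp_coll_cbdata_t allocated in
 * _progress_ufwd() might be released: the same cbdata is handed both to the
 * network completions (_dfwd_sent_cb) and to the local libpmix delivery
 * (_libpmix_cb), so refcntr is set to ep_cnt (+1 when coll->cbfunc is set)
 * and the last completion frees it.  _coll_cbdata_release() is a
 * hypothetical helper name; in real code the decrement would need to happen
 * under the collective lock or as an atomic operation.
 */
static void _coll_cbdata_release(pmixp_coll_cbdata_t *cbdata)
{
	xassert(cbdata->refcntr > 0);
	if (0 == --cbdata->refcntr) {
		/* all send completions and the local callback are done */
		xfree(cbdata);
	}
}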
static int _progress_collect(pmixp_coll_t *coll)
{
	pmixp_ep_t ep = {0};
	int rc;

	xassert(PMIXP_COLL_COLLECT == coll->state);

	ep.type = PMIXP_EP_NONE;
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: state=%s, local=%d, child_cntr=%d",
		    coll, pmixp_coll_state2str(coll->state),
		    (int)coll->contrib_local, coll->contrib_children);
#endif
	/* lock the collective */
	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_COLLECT != coll->state) {
		/* In case of a race condition between the libpmix and
		 * slurm threads we can be called
		 * after we moved to the next step. */
		return 0;
	}

	if (!coll->contrib_local ||
	    coll->contrib_children != coll->chldrn_cnt) {
		/* Not yet ready to go to the next step */
		return 0;
	}

	if (pmixp_info_srv_direct_conn()) {
		/* We will need to forward the aggregated
		 * message back to our children */
		coll->state = PMIXP_COLL_UPFWD;
	} else {
		/* If we use the SLURM API (SAPI), intermediate nodes
		 * don't need to forward data as the root will do
		 * a SAPI broadcast.
		 * So, only the root has to go through the full UPFWD
		 * state and send the message back.
		 * Other procs have to go through another route because
		 * some of our children can receive the bcast message
		 * early and initiate the next collective. We need to
		 * handle that properly. */
		if (0 > coll->prnt_peerid) {
			coll->state = PMIXP_COLL_UPFWD;
		} else {
			coll->state = PMIXP_COLL_UPFWD_WSC;
		}
	}

	/* The root of the collective will have prnt_host == NULL */
	if (NULL != coll->prnt_host) {
		ep.type = PMIXP_EP_NOIDEID;
		ep.ep.nodeid = coll->prnt_peerid;
		coll->ufwd_status = PMIXP_COLL_SND_ACTIVE;
		PMIXP_DEBUG("%p: send data to %s:%d",
			    coll, coll->prnt_host, coll->prnt_peerid);
	} else {
		/* move data from the input buffer to the output buffer */
		char *dst, *src = get_buf_data(coll->ufwd_buf) +
			coll->ufwd_offset;
		size_t size = get_buf_offset(coll->ufwd_buf) -
			coll->ufwd_offset;
		pmixp_server_buf_reserve(coll->dfwd_buf, size);
		dst = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		memcpy(dst, src, size);
		set_buf_offset(coll->dfwd_buf, coll->dfwd_offset + size);
		/* no need to send */
		coll->ufwd_status = PMIXP_COLL_SND_DONE;
		/* this is the root */
		coll->contrib_prnt = true;
	}

	if (PMIXP_EP_NONE != ep.type) {
		pmixp_coll_cbdata_t *cbdata;
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = 1;
		char *nodename = coll->prnt_host;
		rc = pmixp_server_send_nb(&ep, PMIXP_MSG_FAN_IN,
					  coll->seq, coll->ufwd_buf,
					  _ufwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), to %s:%d",
				    (uint64_t) get_buf_offset(coll->ufwd_buf),
				    nodename, ep.ep.nodeid);
			coll->ufwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
			    coll, nodename, ep.ep.nodeid,
			    (uint64_t) get_buf_offset(coll->dfwd_buf));
#endif
	}

	/* events observed - need another iteration */
	return true;
}
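
/*
 * A minimal sketch (not the upstream driver) of how the progress handlers
 * above could be iterated: each one returns true when it observed events and
 * advanced the collective state machine, so the caller keeps looping until
 * no further progress is made.  _progress_coll_sketch() is a hypothetical
 * name, and handling of the remaining states (e.g. PMIXP_COLL_DOWNFWD,
 * PMIXP_COLL_UPFWD_WSC) is intentionally omitted.
 */
static void _progress_coll_sketch(pmixp_coll_t *coll)
{
	int ret;

	do {
		ret = 0;
		switch (coll->state) {
		case PMIXP_COLL_COLLECT:
			ret = _progress_collect(coll);
			break;
		case PMIXP_COLL_UPFWD:
			ret = _progress_ufwd(coll);
			break;
		default:
			/* other states are driven by their own handlers */
			break;
		}
	} while (ret);
}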