/*
 * Forward one ring-collective contribution to the next peer in the ring.
 *
 * Builds a message header for this hop, packs it together with the payload
 * into a forward buffer and hands the message to the non-blocking server
 * send path.
 *
 * coll_ctx   - ring collective context (must pass sanity check)
 * contrib_id - id of the peer that originated this contribution
 * hop_seq    - hop number of this contribution within the ring pass
 * data/size  - payload to forward
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR (no forward buffer / send failure).
 */
static int _ring_forward_data(pmixp_coll_ring_ctx_t *coll_ctx,
			      uint32_t contrib_id, uint32_t hop_seq,
			      void *data, size_t size)
{
	pmixp_coll_ring_msg_hdr_t hdr;
	pmixp_coll_t *coll = _ctx_get_coll(coll_ctx);
	pmixp_coll_ring_t *ring = &coll->state.ring;
	pmixp_ep_t *ep = NULL;
	pmixp_coll_ring_cbdata_t *cbdata = NULL;
	uint32_t offset = 0;
	Buf buf = _get_fwd_buf(coll_ctx);
	int rc = SLURM_SUCCESS;

	/* describe this hop of the ring exchange */
	hdr.nodeid = coll->my_peerid;
	hdr.msgsize = size;
	hdr.seq = coll_ctx->seq;
	hdr.hop_seq = hop_seq;
	hdr.contrib_id = contrib_id;

	pmixp_coll_ring_ctx_sanity_check(coll_ctx);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: transit data to nodeid=%d, seq=%d, hop=%d, size=%lu, contrib=%d",
		    coll_ctx, _ring_next_id(coll), hdr.seq, hdr.hop_seq,
		    hdr.msgsize, hdr.contrib_id);
#endif
	if (!buf) {
		rc = SLURM_ERROR;
		goto exit;
	}

	/* Allocate the endpoint only once we know we have a buffer;
	 * previously it was allocated up-front and leaked on the
	 * error path above. */
	ep = xmalloc(sizeof(*ep));
	ep->type = PMIXP_EP_NOIDEID;
	ep->ep.nodeid = ring->next_peerid;

	/* pack ring info */
	_pack_coll_ring_info(coll, &hdr, buf);

	/* insert payload to buf */
	offset = get_buf_offset(buf);
	pmixp_server_buf_reserve(buf, size);
	memcpy(get_buf_data(buf) + offset, data, size);
	set_buf_offset(buf, offset + size);

	cbdata = xmalloc(sizeof(pmixp_coll_ring_cbdata_t));
	cbdata->buf = buf;
	cbdata->coll = coll;
	cbdata->coll_ctx = coll_ctx;
	cbdata->seq = coll_ctx->seq;
	/* NOTE(review): assumes pmixp_server_send_nb() takes ownership of
	 * ep/cbdata/buf even when it fails — confirm against its contract */
	rc = pmixp_server_send_nb(ep, PMIXP_MSG_RING, coll_ctx->seq, buf,
				  _ring_sent_cb, cbdata);
exit:
	return rc;
}
/*
 * Accept the local process-set contribution to a tree-based collective.
 *
 * The contribution is appended to the upward-forward buffer (ufwd_buf) and
 * the libpmix callback info is stored so it can be invoked when the
 * collective completes.  Runs under coll->lock for the whole body.
 *
 * coll   - collective descriptor
 * data   - contribution payload (copied into ufwd_buf)
 * size   - payload size in bytes
 * cbfunc - libpmix modex callback to invoke on completion
 * cbdata - opaque argument for cbfunc
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR on a double contribution or when
 * the previous collective's upward phase is still in flight.
 */
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size,
			     pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;

	pmixp_debug_hang(0);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%s, size=%zd",
		    coll, coll->seq, pmixp_coll_state2str(coll->state), size);
#endif
	/* Only a subset of states may legally accept a local
	 * contribution; reject or abort on the others. */
	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state: first contribution starts the
		 * collective and timestamps it */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* We are waiting for some send requests
		 * to be finished, but local node has started
		 * the next contribution.
		 * This is an OK situation, go ahead and store
		 * it, the buffer with the contribution is not used
		 * now.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: next coll!", coll);
#endif
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
	case PMIXP_COLL_UPFWD_WPC:
		/* this is not a correct behavior, respond with an error.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: before prev coll is finished!",
			    coll);
#endif
		ret = SLURM_ERROR;
		goto exit;
	default:
		/* FATAL: should not happen in normal workflow */
		PMIXP_ERROR("%p: local contrib while active collective, "
			    "state = %s",
			    coll, pmixp_coll_state2str(coll->state));
		xassert(0);
		abort();
	}

	if (coll->contrib_local) {
		/* Double contribution - reject */
		ret = SLURM_ERROR;
		goto exit;
	}

	/* save & mark local contribution */
	coll->contrib_local = true;
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	/* append payload at the current write offset of ufwd_buf */
	memcpy(get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf),
	       data, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	/* check if the collective is ready to progress */
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish, state=%s",
		    coll, pmixp_coll_state2str(coll->state));
#endif

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);
	return ret;
}
/*
 * Progress the COLLECT phase of a tree-based collective.
 *
 * Once the local contribution and all children contributions have arrived,
 * transition to the appropriate UPFWD state and either send the aggregated
 * buffer to the parent or, on the root, move it straight into the
 * downward-forward buffer.
 *
 * Returns non-zero (true) when a state transition happened and the progress
 * loop should iterate again, 0 when nothing changed.
 */
static int _progress_collect(pmixp_coll_t *coll)
{
	pmixp_ep_t ep = {0};
	int rc;

	xassert(PMIXP_COLL_COLLECT == coll->state);

	ep.type = PMIXP_EP_NONE;
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: state=%s, local=%d, child_cntr=%d",
		    coll, pmixp_coll_state2str(coll->state),
		    (int)coll->contrib_local, coll->contrib_children);
#endif
	/* lock the collective */
	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_COLLECT != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads we can be called
		 * after we moved to the next step. */
		return 0;
	}

	if (!coll->contrib_local ||
	    coll->contrib_children != coll->chldrn_cnt) {
		/* Not yet ready to go to the next step */
		return 0;
	}

	if (pmixp_info_srv_direct_conn()) {
		/* We will need to forward aggregated
		 * message back to our children */
		coll->state = PMIXP_COLL_UPFWD;
	} else {
		/* If we use SLURM API (SAPI) - intermediate nodes
		 * don't need to forward data as the root will do
		 * SAPI broadcast.
		 * So, only root has to go through the full UPFWD
		 * state and send the message back.
		 * Other procs have to go through other route. The reason for
		 * that is the fact that som of out children can receive bcast
		 * message early and initiate next collective. We need to handle
		 * that properly.
		 */
		if (0 > coll->prnt_peerid) {
			coll->state = PMIXP_COLL_UPFWD;
		} else {
			coll->state = PMIXP_COLL_UPFWD_WSC;
		}
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->prnt_host) {
		ep.type = PMIXP_EP_NOIDEID;
		ep.ep.nodeid = coll->prnt_peerid;
		coll->ufwd_status = PMIXP_COLL_SND_ACTIVE;
		PMIXP_DEBUG("%p: send data to %s:%d",
			    coll, coll->prnt_host, coll->prnt_peerid);
	} else {
		/* move data from input buffer to the output */
		char *dst, *src = get_buf_data(coll->ufwd_buf) +
			coll->ufwd_offset;
		size_t size = get_buf_offset(coll->ufwd_buf) -
			coll->ufwd_offset;
		pmixp_server_buf_reserve(coll->dfwd_buf, size);
		dst = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		memcpy(dst, src, size);
		set_buf_offset(coll->dfwd_buf, coll->dfwd_offset + size);
		/* no need to send */
		coll->ufwd_status = PMIXP_COLL_SND_DONE;
		/* this is root */
		coll->contrib_prnt = true;
	}

	if (PMIXP_EP_NONE != ep.type) {
		pmixp_coll_cbdata_t *cbdata;
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = 1;
		char *nodename = coll->prnt_host;
		rc = pmixp_server_send_nb(&ep, PMIXP_MSG_FAN_IN, coll->seq,
					  coll->ufwd_buf,
					  _ufwd_sent_cb, cbdata);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s:%d",
				    (uint64_t) get_buf_offset(coll->ufwd_buf),
				    nodename, ep.ep.nodeid);
			coll->ufwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		/* Fix: report the size of the buffer that was actually
		 * sent (ufwd_buf); it previously printed dfwd_buf's
		 * offset, which is unrelated on this path. */
		PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
			    coll, nodename, ep.ep.nodeid,
			    (uint64_t) get_buf_offset(coll->ufwd_buf));
#endif
	}

	/* events observed - need another iteration */
	return true;
}
int pmixp_coll_contrib_parent(pmixp_coll_t *coll, uint32_t peerid, uint32_t seq, Buf buf) { #ifdef PMIXP_COLL_DEBUG char *nodename = NULL; int lpeerid = -1; #endif char *data_src = NULL, *data_dst = NULL; uint32_t size; int expected_peerid; /* lock the structure */ slurm_mutex_lock(&coll->lock); if (pmixp_info_srv_direct_conn()) { expected_peerid = coll->prnt_peerid; } else { expected_peerid = coll->root_peerid; } /* Sanity check */ pmixp_coll_sanity_check(coll); if (expected_peerid != peerid) { char *nodename = pmixp_info_job_host(peerid); /* protect ourselfs if we are running with no asserts */ PMIXP_ERROR("%p: parent contrib from bad nodeid=%s:%u, " "expect=%d", coll, nodename, peerid, expected_peerid); xfree(nodename); goto proceed; } #ifdef PMIXP_COLL_DEBUG nodename = pmixp_info_job_host(peerid); lpeerid = hostlist_find(coll->peers_hl, nodename); /* Mark this event */ PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d): state=%s, size=%u", coll, nodename, peerid, lpeerid, pmixp_coll_state2str(coll->state), remaining_buf(buf)); #endif switch (coll->state) { case PMIXP_COLL_SYNC: case PMIXP_COLL_COLLECT: /* It looks like a retransmission attempt when remote side * identified transmission failure, but we actually successfuly * received the message */ #ifdef PMIXP_COLL_DEBUG PMIXP_DEBUG("%p: prev contrib from %s:%d(%d): " "seq=%u, cur_seq=%u, state=%s", coll, nodename, peerid, lpeerid, seq, coll->seq, pmixp_coll_state2str(coll->state)); #endif /* sanity check */ if ((coll->seq - 1) != seq) { /* FATAL: should not happen in normal workflow */ char *nodename = pmixp_info_job_host(peerid); PMIXP_ERROR("%p: unexpected contrib from %s:%d: " "contrib_seq = %d, coll->seq = %d, " "state=%s", coll, nodename, peerid, seq, coll->seq, pmixp_coll_state2str(coll->state)); xfree(nodename); xassert((coll->seq - 1) == seq); abort(); } goto proceed; case PMIXP_COLL_UPFWD_WSC:{ /* we are not actually ready to receive this contribution as * the upward portion of the collective wasn't received 
yet. * This should not happen as SAPI (SLURM API) is blocking and * we chould transit to PMIXP_COLL_UPFWD_WPC immediately */ /* FATAL: should not happen in normal workflow */ char *nodename = pmixp_info_job_host(peerid); PMIXP_ERROR("%p: unexpected contrib from %s:%d: " "contrib_seq = %d, coll->seq = %d, " "state=%s", coll, nodename, peerid, seq, coll->seq, pmixp_coll_state2str(coll->state)); xfree(nodename); xassert((coll->seq - 1) == seq); abort(); } case PMIXP_COLL_UPFWD: case PMIXP_COLL_UPFWD_WPC: /* we were waiting for this */ break; case PMIXP_COLL_DOWNFWD: /* It looks like a retransmission attempt when remote side * identified transmission failure, but we actually successfuly * received the message */ #ifdef PMIXP_COLL_DEBUG PMIXP_DEBUG("%p: double contrib from %s:%d(%d) " "seq=%u, cur_seq=%u, state=%s", coll, nodename, peerid, lpeerid, seq, coll->seq, pmixp_coll_state2str(coll->state)); #endif /* sanity check */ if (coll->seq != seq) { char *nodename = pmixp_info_job_host(peerid); /* FATAL: should not happen in normal workflow */ PMIXP_ERROR("%p: unexpected contrib from %s:%d: " "seq = %d, coll->seq = %d, state=%s", coll, nodename, peerid, seq, coll->seq, pmixp_coll_state2str(coll->state)); xassert((coll->seq - 1) == seq); xfree(nodename); abort(); } goto proceed; default: /* should not happen in normal workflow */ PMIXP_ERROR("%p: unknown collective state %s", coll, pmixp_coll_state2str(coll->state)); abort(); } /* Because of possible timeouts/delays in transmission we * can receive a contribution second time. Avoid duplications * by checking our records. */ if (coll->contrib_prnt) { char *nodename = pmixp_info_job_host(peerid); /* May be 0 or 1. If grater - transmission skew, ignore. * NOTE: this output is not on the critical path - * don't preprocess it out */ PMIXP_DEBUG("%p: multiple contributions from parent %s:%d", coll, nodename, peerid); xfree(nodename); /* this is duplication, skip. 
*/ goto proceed; } coll->contrib_prnt = true; data_src = get_buf_data(buf) + get_buf_offset(buf); size = remaining_buf(buf); pmixp_server_buf_reserve(coll->dfwd_buf, size); data_dst = get_buf_data(coll->dfwd_buf) + get_buf_offset(coll->dfwd_buf); memcpy(data_dst, data_src, size); set_buf_offset(coll->dfwd_buf, get_buf_offset(coll->dfwd_buf) + size); proceed: _progress_coll(coll); #ifdef PMIXP_COLL_DEBUG if (nodename) { PMIXP_DEBUG("%p: finish: node=%s:%d(%d), state=%s", coll, nodename, peerid, lpeerid, pmixp_coll_state2str(coll->state)); xfree(nodename); } #endif /* unlock the structure */ slurm_mutex_unlock(&coll->lock); return SLURM_SUCCESS; }
int pmixp_coll_contrib_child(pmixp_coll_t *coll, uint32_t peerid, uint32_t seq, Buf buf) { char *data_src = NULL, *data_dst = NULL; uint32_t size; int chld_id; /* lock the structure */ slurm_mutex_lock(&coll->lock); pmixp_coll_sanity_check(coll); if (0 > (chld_id = _chld_id(coll, peerid))) { char *nodename = pmixp_info_job_host(peerid); char *avail_ids = _chld_ids_str(coll); PMIXP_DEBUG("%p: contribution from the non-child node " "%s:%d, acceptable ids: %s", coll, nodename, peerid, avail_ids); xfree(nodename); xfree(avail_ids); } #ifdef PMIXP_COLL_DEBUG char *nodename = pmixp_info_job_host(peerid); int lpeerid = hostlist_find(coll->peers_hl, nodename); PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d:%d):, state=%s, size=%u", coll, nodename, peerid, lpeerid, chld_id, pmixp_coll_state2str(coll->state), remaining_buf(buf)); #endif switch (coll->state) { case PMIXP_COLL_SYNC: /* change the state */ coll->ts = time(NULL); /* fall-thru */ case PMIXP_COLL_COLLECT: /* sanity check */ if (coll->seq != seq) { char *nodename = pmixp_info_job_host(peerid); /* FATAL: should not happen in normal workflow */ PMIXP_ERROR("%p: unexpected contrib from %s:%d " "(child #%d) seq = %d, coll->seq = %d, " "state=%s", coll, nodename, peerid, chld_id, seq, coll->seq, pmixp_coll_state2str(coll->state)); xassert(coll->seq == seq); abort(); } break; case PMIXP_COLL_UPFWD: case PMIXP_COLL_UPFWD_WSC: /* FATAL: should not happen in normal workflow */ PMIXP_ERROR("%p: unexpected contrib from %s:%d, state = %s", coll, nodename, peerid, pmixp_coll_state2str(coll->state)); xassert(0); abort(); case PMIXP_COLL_UPFWD_WPC: case PMIXP_COLL_DOWNFWD: #ifdef PMIXP_COLL_DEBUG /* It looks like a retransmission attempt when remote side * identified transmission failure, but we actually successfuly * received the message */ PMIXP_DEBUG("%p: contrib for the next collective " "from=%s:%d(%d:%d) contrib_seq=%u, coll->seq=%u, " "state=%s", coll, nodename, peerid, lpeerid, chld_id, seq, coll->seq, 
pmixp_coll_state2str(coll->state)); #endif if ((coll->seq +1) != seq) { char *nodename = pmixp_info_job_host(peerid); /* should not happen in normal workflow */ PMIXP_ERROR("%p: unexpected contrib from %s:%d(x:%d) " "seq = %d, coll->seq = %d, " "state=%s", coll, nodename, peerid, chld_id, seq, coll->seq, pmixp_coll_state2str(coll->state)); xfree(nodename); xassert((coll->seq +1) == seq); abort(); } break; default: /* should not happen in normal workflow */ PMIXP_ERROR("%p: unknown collective state %s", coll, pmixp_coll_state2str(coll->state)); abort(); } /* Because of possible timeouts/delays in transmission we * can receive a contribution second time. Avoid duplications * by checking our records. */ if (coll->contrib_chld[chld_id]) { char *nodename = pmixp_info_job_host(peerid); /* May be 0 or 1. If grater - transmission skew, ignore. * NOTE: this output is not on the critical path - * don't preprocess it out */ PMIXP_DEBUG("%p: multiple contribs from %s:%d(x:%d)", coll, nodename, peerid, chld_id); /* this is duplication, skip. */ xfree(nodename); goto proceed; } data_src = get_buf_data(buf) + get_buf_offset(buf); size = remaining_buf(buf); pmixp_server_buf_reserve(coll->ufwd_buf, size); data_dst = get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf); memcpy(data_dst, data_src, size); set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size); /* increase number of individual contributions */ coll->contrib_chld[chld_id] = true; /* increase number of total contributions */ coll->contrib_children++; proceed: _progress_coll(coll); #ifdef PMIXP_COLL_DEBUG PMIXP_DEBUG("%p: finish: node=%s:%d(%d:%d), state=%s", coll, nodename, peerid, lpeerid, chld_id, pmixp_coll_state2str(coll->state)); xfree(nodename); #endif /* unlock the structure */ slurm_mutex_unlock(&coll->lock); return SLURM_SUCCESS; }