static inline void pmixp_coll_ring_ctx_sanity_check(
	pmixp_coll_ring_ctx_t *coll_ctx)
{
	xassert(NULL != coll_ctx);
	xassert(coll_ctx->in_use);
	pmixp_coll_sanity_check(coll_ctx->coll);
}
int pmixp_coll_ring_local(pmixp_coll_t *coll, char *data, size_t size,
			  void *cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;
	pmixp_coll_ring_ctx_t *coll_ctx = NULL;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	coll_ctx = pmixp_coll_ring_ctx_new(coll);
	if (!coll_ctx) {
		PMIXP_ERROR("Cannot get new ring collective context, seq=%u",
			    coll->seq);
		ret = SLURM_ERROR;
		goto exit;
	}

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%d, size=%lu",
		    coll_ctx, coll_ctx->seq, coll_ctx->state, size);
#endif

	if (_pmixp_coll_contrib(coll_ctx, coll->my_peerid, 0, data, size)) {
		ret = SLURM_ERROR;
		goto exit;
	}

	/* mark local contribution */
	coll_ctx->contrib_local = true;
	_progress_coll_ring(coll_ctx);

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return ret;
}
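/*
 * Illustrative sketch (not part of the original source): the
 * lock -> check -> record -> progress discipline that
 * pmixp_coll_ring_local() follows, reduced to a standalone form.
 * All demo_* names are hypothetical, not Slurm APIs.
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct {
	pthread_mutex_t lock;
	bool contrib_local;
} demo_coll_t;

/* stand-in for _progress_coll_ring(): would advance the state machine */
static void demo_progress(demo_coll_t *c) { (void)c; }

static int demo_contrib_local(demo_coll_t *c)
{
	int ret = 0;

	pthread_mutex_lock(&c->lock);	/* all state changes under the lock */
	if (c->contrib_local) {
		ret = -1;		/* double contribution - reject */
		goto exit;
	}
	c->contrib_local = true;	/* record the contribution... */
	demo_progress(c);		/* ...then try to make progress */
exit:
	pthread_mutex_unlock(&c->lock);
	return ret;
}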
static void _libpmix_cb(void *_vcbdata)
{
	pmixp_coll_ring_cbdata_t *cbdata = (pmixp_coll_ring_cbdata_t*)_vcbdata;
	pmixp_coll_t *coll = cbdata->coll;
	Buf buf = cbdata->buf;

	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* reset buf */
	buf->processed = 0;
	/* push it back to pool for reuse */
	list_push(coll->state.ring.ring_buf_pool, buf);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);
	xfree(cbdata);
}
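/*
 * Illustrative sketch (not part of the original source): the buffer-pool
 * recycling pattern behind _libpmix_cb(), reduced to a simple free list.
 * Instead of freeing a transfer buffer once libpmix is done with it, the
 * callback resets it and pushes it back for reuse. Names are hypothetical.
 */
#include <stddef.h>
#include <stdlib.h>

typedef struct demo_buf {
	struct demo_buf *next;
	size_t processed;	/* read cursor, reset before reuse */
	char data[4096];
} demo_buf_t;

static demo_buf_t *demo_pool;	/* head of the free list */

static demo_buf_t *demo_buf_get(void)
{
	demo_buf_t *b = demo_pool;
	if (b)
		demo_pool = b->next;	/* reuse a recycled buffer */
	else
		b = calloc(1, sizeof(*b));	/* pool empty: allocate */
	return b;
}

static void demo_buf_put(demo_buf_t *b)
{
	b->processed = 0;	/* reset so the next user starts clean */
	b->next = demo_pool;
	demo_pool = b;
}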
void pmixp_coll_free(pmixp_coll_t *coll)
{
	pmixp_coll_sanity_check(coll);

	if (NULL != coll->pset.procs) {
		xfree(coll->pset.procs);
	}
#ifdef PMIXP_COLL_DEBUG
	hostlist_destroy(coll->peers_hl);
#endif
	/* a collective that is not in the SYNC state means
	 * something went wrong */
	switch(coll->type) {
	case PMIXP_COLL_TYPE_FENCE_TREE:
		if (PMIXP_COLL_TREE_SYNC != coll->state.tree.state)
			pmixp_coll_log(coll);
		pmixp_coll_tree_free(&coll->state.tree);
		break;
	case PMIXP_COLL_TYPE_FENCE_RING:
	{
		int i, ctx_in_use = 0;
		for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) {
			pmixp_coll_ring_ctx_t *coll_ctx =
				&coll->state.ring.ctx_array[i];
			if (coll_ctx->in_use)
				ctx_in_use++;
		}
		if (ctx_in_use)
			pmixp_coll_log(coll);
		pmixp_coll_ring_free(&coll->state.ring);
		break;
	}
	default:
		PMIXP_ERROR("Unknown coll type");
		break;
	}
	xfree(coll);
}
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution",
		    pmixp_info_namespace(), pmixp_info_nodeid());

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* change the collective state if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* save & mark local contribution */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf),
	       data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* check if the collective is ready to progress */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
		    pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
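/*
 * Illustrative sketch (not part of the original source): the grow/copy/
 * advance idiom used above to append a contribution to coll->buf.
 * demo_* names are hypothetical stand-ins for Slurm's Buf accessors
 * (grow_buf, get_buf_data, get_buf_offset, set_buf_offset).
 */
#include <string.h>
#include <stdlib.h>

typedef struct {
	char *data;
	size_t offset;	/* write position, like get_buf_offset() */
	size_t size;	/* allocated capacity */
} demo_growbuf_t;

static void demo_append(demo_growbuf_t *b, const char *src, size_t len)
{
	if (b->offset + len > b->size) {	/* grow_buf() equivalent */
		size_t nsize = (b->offset + len) * 2;
		char *ndata = realloc(b->data, nsize);
		if (!ndata)
			abort();	/* mirrors Slurm's fail-fast xmalloc */
		b->data = ndata;
		b->size = nsize;
	}
	memcpy(b->data + b->offset, src, len);	/* copy at current offset */
	b->offset += len;			/* set_buf_offset() equivalent */
}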
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start",
		    pmixp_info_namespace(), pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	xassert(PMIXP_COLL_FAN_OUT == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* update the database */
	if (NULL != coll->cbfunc) {
		void *data = get_buf_data(buf) + get_buf_offset(buf);
		size_t size = remaining_buf(buf);
		PMIXP_DEBUG("%s:%d: use the callback",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     pmixp_free_Buf, (void *)buf);
	}
	/* prepare for the next collective operation */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
		    pmixp_info_namespace(), pmixp_info_nodeid());
}
static void _ring_sent_cb(int rc, pmixp_p2p_ctx_t ctx, void *_cbdata)
{
	pmixp_coll_ring_cbdata_t *cbdata = (pmixp_coll_ring_cbdata_t*)_cbdata;
	pmixp_coll_ring_ctx_t *coll_ctx = cbdata->coll_ctx;
	pmixp_coll_t *coll = cbdata->coll;
	Buf buf = cbdata->buf;

	pmixp_coll_sanity_check(coll);

	if (PMIXP_P2P_REGULAR == ctx) {
		/* lock the collective */
		slurm_mutex_lock(&coll->lock);
	}
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: called %d", coll_ctx, coll_ctx->seq);
#endif
	if (cbdata->seq != coll_ctx->seq) {
		/* It seems like this collective was reset since the time
		 * we initiated this send.
		 * Just exit to avoid data corruption.
		 */
		PMIXP_DEBUG("%p: collective was reset!", coll_ctx);
		goto exit;
	}

	coll_ctx->forward_cnt++;
	_progress_coll_ring(coll_ctx);

exit:
	pmixp_server_buf_reset(buf);
	list_push(coll->state.ring.fwrd_buf_pool, buf);

	if (PMIXP_P2P_REGULAR == ctx) {
		/* unlock the collective */
		slurm_mutex_unlock(&coll->lock);
	}
	xfree(cbdata);
}
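/*
 * Illustrative sketch (not part of the original source): the stale-callback
 * guard used in _ring_sent_cb(). The sequence number captured when the send
 * was posted is compared against the live context; a mismatch means the
 * collective was reset in the meantime and the callback must not touch its
 * state. Names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t seq;		/* live sequence of the collective context */
	int forward_cnt;
} demo_ctx_t;

typedef struct {
	demo_ctx_t *ctx;
	uint32_t seq;		/* sequence captured at send time */
} demo_cbdata_t;

static void demo_sent_cb(demo_cbdata_t *cb)
{
	if (cb->seq != cb->ctx->seq) {
		/* collective was reset since the send was posted */
		fprintf(stderr, "stale callback, skipping\n");
		return;
	}
	cb->ctx->forward_cnt++;	/* safe: still the same collective round */
}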
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size,
			     pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;

	pmixp_debug_hang(0);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%s, size=%zu",
		    coll, coll->seq, pmixp_coll_state2str(coll->state), size);
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* We are waiting for some send requests to finish,
		 * but the local node has already started the next
		 * contribution.
		 * This is an OK situation: go ahead and store it,
		 * the buffer for the contribution is not in use now.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: next coll!", coll);
#endif
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
	case PMIXP_COLL_UPFWD_WPC:
		/* this is not correct behavior, respond with an error */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: before prev coll is finished!",
			    coll);
#endif
		ret = SLURM_ERROR;
		goto exit;
	default:
		/* FATAL: should not happen in normal workflow */
		PMIXP_ERROR("%p: local contrib while active collective, "
			    "state = %s",
			    coll, pmixp_coll_state2str(coll->state));
		xassert(0);
		abort();
	}

	if (coll->contrib_local) {
		/* double contribution - reject */
		ret = SLURM_ERROR;
		goto exit;
	}

	/* save & mark local contribution */
	coll->contrib_local = true;
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	memcpy(get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf),
	       data, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	/* check if the collective is ready to progress */
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish, state=%s",
		    coll, pmixp_coll_state2str(coll->state));
#endif

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);
	return ret;
}
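/*
 * Illustrative sketch (not part of the original source): the acceptance
 * policy that pmixp_coll_contrib_local() implements, as a standalone
 * decision function. SYNC/COLLECT accept, DOWNFWD stashes the data for the
 * next round, and the UPFWD states reject. Enum and names are hypothetical.
 */
typedef enum {
	DEMO_SYNC, DEMO_COLLECT, DEMO_UPFWD, DEMO_UPFWD_WSC,
	DEMO_UPFWD_WPC, DEMO_DOWNFWD
} demo_state_t;

typedef enum { DEMO_ACCEPT, DEMO_ACCEPT_NEXT, DEMO_REJECT } demo_verdict_t;

static demo_verdict_t demo_local_contrib_verdict(demo_state_t s)
{
	switch (s) {
	case DEMO_SYNC:		/* idle: start a new round */
	case DEMO_COLLECT:	/* already collecting: join it */
		return DEMO_ACCEPT;
	case DEMO_DOWNFWD:	/* prior round draining: buffer for next */
		return DEMO_ACCEPT_NEXT;
	case DEMO_UPFWD:
	case DEMO_UPFWD_WSC:
	case DEMO_UPFWD_WPC:	/* mid upward phase: caller is too early */
	default:
		return DEMO_REJECT;
	}
}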
static int _progress_collect(pmixp_coll_t *coll)
{
	pmixp_ep_t ep = {0};
	int rc;

	xassert(PMIXP_COLL_COLLECT == coll->state);

	ep.type = PMIXP_EP_NONE;
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: state=%s, local=%d, child_cntr=%d",
		    coll, pmixp_coll_state2str(coll->state),
		    (int)coll->contrib_local, coll->contrib_children);
#endif
	/* sanity check (the caller holds the collective lock) */
	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_COLLECT != coll->state) {
		/* In case of a race condition between the libpmix and
		 * slurm threads we can be called after we moved to
		 * the next step.
		 */
		return 0;
	}

	if (!coll->contrib_local ||
	    coll->contrib_children != coll->chldrn_cnt) {
		/* not yet ready to go to the next step */
		return 0;
	}

	if (pmixp_info_srv_direct_conn()) {
		/* We will need to forward the aggregated
		 * message back to our children.
		 */
		coll->state = PMIXP_COLL_UPFWD;
	} else {
		/* If we use the SLURM API (SAPI), intermediate nodes
		 * don't need to forward data as the root will do a
		 * SAPI broadcast.
		 * So only the root has to go through the full UPFWD
		 * state and send the message back.
		 * Other procs have to take a different route: some of
		 * our children may receive the bcast message early and
		 * initiate the next collective. We need to handle
		 * that properly.
		 */
		if (0 > coll->prnt_peerid) {
			coll->state = PMIXP_COLL_UPFWD;
		} else {
			coll->state = PMIXP_COLL_UPFWD_WSC;
		}
	}

	/* the root of the collective will have prnt_host == NULL */
	if (NULL != coll->prnt_host) {
		ep.type = PMIXP_EP_NOIDEID;
		ep.ep.nodeid = coll->prnt_peerid;
		coll->ufwd_status = PMIXP_COLL_SND_ACTIVE;
		PMIXP_DEBUG("%p: send data to %s:%d",
			    coll, coll->prnt_host, coll->prnt_peerid);
	} else {
		/* move data from the input buffer to the output buffer */
		char *dst, *src = get_buf_data(coll->ufwd_buf) +
			coll->ufwd_offset;
		size_t size = get_buf_offset(coll->ufwd_buf) -
			coll->ufwd_offset;
		pmixp_server_buf_reserve(coll->dfwd_buf, size);
		dst = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		memcpy(dst, src, size);
		set_buf_offset(coll->dfwd_buf, coll->dfwd_offset + size);
		/* no need to send */
		coll->ufwd_status = PMIXP_COLL_SND_DONE;
		/* this is the root */
		coll->contrib_prnt = true;
	}

	if (PMIXP_EP_NONE != ep.type) {
		pmixp_coll_cbdata_t *cbdata;
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = 1;
		char *nodename = coll->prnt_host;
		rc = pmixp_server_send_nb(&ep, PMIXP_MSG_FAN_IN, coll->seq,
					  coll->ufwd_buf,
					  _ufwd_sent_cb, cbdata);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu) to %s:%d",
				    (uint64_t) get_buf_offset(coll->ufwd_buf),
				    nodename, ep.ep.nodeid);
			coll->ufwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
			    coll, nodename, ep.ep.nodeid,
			    (uint64_t) get_buf_offset(coll->ufwd_buf));
#endif
	}

	/* events observed - need another iteration */
	return true;
}
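/*
 * Illustrative sketch (not part of the original source): the next-state
 * choice made in _progress_collect(). With direct connections every node
 * forwards downward itself (UPFWD); over the Slurm API only the root does,
 * while intermediate nodes wait for the SAPI broadcast (UPFWD_WSC). Names
 * are hypothetical; is_root mirrors the "prnt_peerid < 0" test above.
 */
#include <stdbool.h>

typedef enum { DEMO2_UPFWD, DEMO2_UPFWD_WSC } demo2_state_t;

static demo2_state_t demo_next_state(bool direct_conn, bool is_root)
{
	if (direct_conn)
		return DEMO2_UPFWD;	/* everyone forwards to children */
	/* SAPI: the root broadcasts, intermediates wait for that bcast */
	return is_root ? DEMO2_UPFWD : DEMO2_UPFWD_WSC;
}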
/*
 * Used by the internal collective performance evaluation tool.
 */
pmixp_coll_t *pmixp_coll_from_cbdata(void *cbdata)
{
	pmixp_coll_cbdata_t *ptr = (pmixp_coll_cbdata_t*)cbdata;
	pmixp_coll_sanity_check(ptr->coll);
	return ptr->coll;
}
int pmixp_coll_contrib_parent(pmixp_coll_t *coll, uint32_t peerid,
			      uint32_t seq, Buf buf)
{
#ifdef PMIXP_COLL_DEBUG
	char *nodename = NULL;
	int lpeerid = -1;
#endif
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int expected_peerid;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	if (pmixp_info_srv_direct_conn()) {
		expected_peerid = coll->prnt_peerid;
	} else {
		expected_peerid = coll->root_peerid;
	}

	/* sanity check */
	pmixp_coll_sanity_check(coll);
	if (expected_peerid != peerid) {
		char *nodename = pmixp_info_job_host(peerid);
		/* protect ourselves if we are running with no asserts */
		PMIXP_ERROR("%p: parent contrib from bad nodeid=%s:%u, "
			    "expect=%d",
			    coll, nodename, peerid, expected_peerid);
		xfree(nodename);
		goto proceed;
	}

#ifdef PMIXP_COLL_DEBUG
	nodename = pmixp_info_job_host(peerid);
	lpeerid = hostlist_find(coll->peers_hl, nodename);
	/* mark this event */
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d): state=%s, size=%u",
		    coll, nodename, peerid, lpeerid,
		    pmixp_coll_state2str(coll->state), remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
	case PMIXP_COLL_COLLECT:
		/* It looks like a retransmission attempt: the remote side
		 * assumed a transmission failure, but we actually
		 * successfully received the message.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: prev contrib from %s:%d(%d): "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if ((coll->seq - 1) != seq) {
			/* FATAL: should not happen in normal workflow */
			char *nodename = pmixp_info_job_host(peerid);
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "contrib_seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq - 1) == seq);
			abort();
		}
		goto proceed;
	case PMIXP_COLL_UPFWD_WSC:
	{
		/* We are not actually ready to receive this contribution
		 * as the upward portion of the collective wasn't received
		 * yet. This should not happen as SAPI (the SLURM API) is
		 * blocking and we should transition to
		 * PMIXP_COLL_UPFWD_WPC immediately.
		 */
		/* FATAL: should not happen in normal workflow */
		char *nodename = pmixp_info_job_host(peerid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
			    "contrib_seq = %d, coll->seq = %d, state=%s",
			    coll, nodename, peerid, seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
		xassert((coll->seq - 1) == seq);
		abort();
	}
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WPC:
		/* we were waiting for this */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* It looks like a retransmission attempt: the remote side
		 * assumed a transmission failure, but we actually
		 * successfully received the message.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: double contrib from %s:%d(%d) "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "seq = %d, coll->seq = %d, state=%s",
				    coll, nodename, peerid, seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert(coll->seq == seq);
			xfree(nodename);
			abort();
		}
		goto proceed;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we can
	 * receive a contribution a second time. Avoid duplication by
	 * checking our records.
	 */
	if (coll->contrib_prnt) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If greater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out.
		 */
		PMIXP_DEBUG("%p: multiple contributions from parent %s:%d",
			    coll, nodename, peerid);
		xfree(nodename);
		/* this is a duplicate, skip */
		goto proceed;
	}
	coll->contrib_prnt = true;

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->dfwd_buf, size);
	data_dst = get_buf_data(coll->dfwd_buf) +
		get_buf_offset(coll->dfwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->dfwd_buf, get_buf_offset(coll->dfwd_buf) + size);

proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	if (nodename) {
		PMIXP_DEBUG("%p: finish: node=%s:%d(%d), state=%s",
			    coll, nodename, peerid, lpeerid,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
	}
#endif

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
int pmixp_coll_contrib_child(pmixp_coll_t *coll, uint32_t peerid,
			     uint32_t seq, Buf buf)
{
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int chld_id;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);
	if (0 > (chld_id = _chld_id(coll, peerid))) {
		char *nodename = pmixp_info_job_host(peerid);
		char *avail_ids = _chld_ids_str(coll);
		PMIXP_DEBUG("%p: contribution from a non-child node "
			    "%s:%d, acceptable ids: %s",
			    coll, nodename, peerid, avail_ids);
		xfree(nodename);
		xfree(avail_ids);
		/* protect ourselves if we are running with no asserts */
		slurm_mutex_unlock(&coll->lock);
		return SLURM_ERROR;
	}

#ifdef PMIXP_COLL_DEBUG
	char *nodename = pmixp_info_job_host(peerid);
	int lpeerid = hostlist_find(coll->peers_hl, nodename);
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d:%d): state=%s, size=%u",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state), remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d "
				    "(child #%d) seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert(coll->seq == seq);
			abort();
		}
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
	{
		/* FATAL: should not happen in normal workflow */
		char *nodename = pmixp_info_job_host(peerid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%d, state = %s",
			    coll, nodename, peerid,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
		xassert(0);
		abort();
	}
	case PMIXP_COLL_UPFWD_WPC:
	case PMIXP_COLL_DOWNFWD:
#ifdef PMIXP_COLL_DEBUG
		/* It looks like a retransmission attempt: the remote side
		 * assumed a transmission failure, but we actually
		 * successfully received the message.
		 */
		PMIXP_DEBUG("%p: contrib for the next collective "
			    "from=%s:%d(%d:%d) contrib_seq=%u, coll->seq=%u, "
			    "state=%s",
			    coll, nodename, peerid, lpeerid, chld_id,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
#endif
		if ((coll->seq + 1) != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d(x:%d) "
				    "seq = %d, coll->seq = %d, state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq + 1) == seq);
			abort();
		}
		break;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we can
	 * receive a contribution a second time. Avoid duplication by
	 * checking our records.
	 */
	if (coll->contrib_chld[chld_id]) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If greater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out.
		 */
		PMIXP_DEBUG("%p: multiple contribs from %s:%d(x:%d)",
			    coll, nodename, peerid, chld_id);
		/* this is a duplicate, skip */
		xfree(nodename);
		goto proceed;
	}

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	data_dst = get_buf_data(coll->ufwd_buf) +
		get_buf_offset(coll->ufwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* mark this individual contribution */
	coll->contrib_chld[chld_id] = true;
	/* increase the number of total contributions */
	coll->contrib_children++;

proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish: node=%s:%d(%d:%d), state=%s",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state));
	xfree(nodename);
#endif

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
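/*
 * Illustrative sketch (not part of the original source): the duplicate
 * filter and contribution counting used by pmixp_coll_contrib_child().
 * A per-child boolean suppresses retransmitted contributions; the total
 * counter is what the progress function compares against the number of
 * expected children. Names are hypothetical.
 */
#include <stdbool.h>

#define DEMO_MAX_CHILDREN 64

typedef struct {
	bool contrib_chld[DEMO_MAX_CHILDREN];
	int contrib_children;	/* distinct children heard from */
	int chldrn_cnt;		/* children expected */
} demo_tree_t;

/* returns true if the contribution is new, false if it is a duplicate */
static bool demo_record_child(demo_tree_t *t, int chld_id)
{
	if (t->contrib_chld[chld_id])
		return false;	/* retransmission: skip the payload */
	t->contrib_chld[chld_id] = true;
	t->contrib_children++;
	return true;
}

static bool demo_tree_ready(const demo_tree_t *t)
{
	return t->contrib_children == t->chldrn_cnt;
}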
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
		    pmixp_info_namespace(), pmixp_info_nodeid(),
		    coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of a race condition between the libpmix and
		 * slurm threads, _progress_fan_in can be called after
		 * we moved to the next step.
		 */
		goto unlock;
	}

	if (!coll->contrib_local ||
	    coll->contrib_cntr != coll->children_cnt) {
		/* not yet ready to go to the next step */
		goto unlock;
	}

	/* the root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
				coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
		    pmixp_info_nodeid(), hostlist);

	/* check for the singleton case */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where the parent
			 * is not ready to receive messages.
			 * Use a zero-size message to check the parent status
			 * first and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				       get_buf_data(coll->buf),
				       get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu) to hostlist:\n%s",
				    (uint64_t) get_buf_offset(coll->buf),
				    hostlist);
			/* Return an error indication to PMIx. Nodes that
			 * haven't received the data will exit by timeout.
			 * FIXME: do we need to do something with the nodes
			 * that finished successfully?
			 */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* If we are the root - push data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename
	 * was in the hostlist. However this may lead to undesired side
	 * effects: we are blocked here sending data and cannot receive
	 * (receiving will be triggered in this thread after we leave this
	 * callback), so we would have to rely on buffering on the SLURM
	 * side. Better not to do so.
	 */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset
		 * the collective here */
		/* copy payload excluding the reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
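/*
 * Illustrative sketch (not part of the original source): the first-message
 * handshake used in _progress_fan_in(). On the very first round (seq == 0)
 * the parent may not be listening yet, so a zero-size health check is sent
 * before the real payload. demo_probe()/demo_send() are hypothetical
 * stand-ins for pmixp_server_health_chk()/pmixp_server_send().
 */
#include <stdint.h>
#include <stddef.h>

static int demo_probe(const char *host) { (void)host; return 0; }
static int demo_send(const char *host, const void *p, uint64_t n)
{
	(void)host; (void)p; (void)n;
	return 0;
}

static int demo_send_to_parent(const char *parent, uint32_t seq,
			       const void *payload, uint64_t len)
{
	if (0 == seq && NULL != parent) {
		/* first round: make sure the parent is ready to receive */
		int rc = demo_probe(parent);
		if (rc)
			return rc;
	}
	return demo_send(parent, payload, len);
}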
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	/* fix the collective status if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: "
			    "switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if (PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: "
			    "switch to PMIXP_COLL_FAN_OUT_IN "
			    "(next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we can
	 * receive a contribution a second time. Avoid duplication by
	 * checking our records.
	 */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselves if we are running with no asserts */
		goto proceed;
	}

	if (0 < coll->ch_contribs[nodeid]) {
		/* May be 0 or 1. If greater - transmission skew, ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, "
			    "hostname=%s", nodeid, nodename);
		/* this is a duplicate, skip */
		goto proceed;
	}

	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf),
	       data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase the number of individual contributions */
	coll->ch_contribs[nodeid]++;

	/* increase the number of total contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	if (PMIXP_COLL_FAN_IN == coll->state) {
		/* make progress if we are in the fan-in state */
		_progress_fan_in(coll);
	}

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	}

	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(),
		    nodename, state);

	return SLURM_SUCCESS;
}
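/*
 * Illustrative sketch (not part of the original source): the inline switch
 * at the end of pmixp_coll_contrib_node() as a reusable helper, in the
 * spirit of the pmixp_coll_state2str() used elsewhere in this code.
 * demo3_* names are hypothetical.
 */
typedef enum {
	DEMO3_SYNC, DEMO3_FAN_IN, DEMO3_FAN_OUT, DEMO3_FAN_OUT_IN
} demo3_state_t;

static const char *demo_state2str(demo3_state_t s)
{
	switch (s) {
	case DEMO3_SYNC:	return "sync";
	case DEMO3_FAN_IN:	return "fan-in";
	case DEMO3_FAN_OUT:	return "fan-out";
	case DEMO3_FAN_OUT_IN:	return "fan-out-in";
	default:		return "unknown";
	}
}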