int pmixp_coll_init(pmixp_coll_t *coll, pmixp_coll_type_t type, const pmixp_proc_t *procs, size_t nprocs) { int rc = SLURM_SUCCESS; hostlist_t hl; coll->seq = 0; #ifndef NDEBUG coll->magic = PMIXP_COLL_STATE_MAGIC; #endif coll->type = type; coll->pset.procs = xmalloc(sizeof(*procs) * nprocs); coll->pset.nprocs = nprocs; memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs); if (SLURM_SUCCESS != pmixp_hostset_from_ranges(procs, nprocs, &hl)) { /* TODO: provide ranges output routine */ PMIXP_ERROR("Bad ranges information"); rc = SLURM_ERROR; goto exit; } coll->peers_cnt = hostlist_count(hl); coll->my_peerid = hostlist_find(hl, pmixp_info_hostname()); #ifdef PMIXP_COLL_DEBUG /* if we debug collectives - store a copy of a full * hostlist to resolve participant id to the hostname */ coll->peers_hl = hostlist_copy(hl); #endif switch(type) { case PMIXP_COLL_TYPE_FENCE_TREE: rc = pmixp_coll_tree_init(coll, &hl); break; case PMIXP_COLL_TYPE_FENCE_RING: rc = pmixp_coll_ring_init(coll, &hl); break; default: PMIXP_ERROR("Unknown coll type"); rc = SLURM_ERROR; } hostlist_destroy(hl); if (rc) { goto exit; } exit: return rc; }
void pmixp_coll_ring_log(pmixp_coll_t *coll) { int i; pmixp_coll_ring_t *ring = &coll->state.ring; char *nodename, *next, *prev; char *out_str = NULL; PMIXP_ERROR("%p: %s state seq=%d", coll, pmixp_coll_type2str(coll->type), coll->seq); nodename = pmixp_info_job_host(coll->my_peerid); PMIXP_ERROR("my peerid: %d:%s", coll->my_peerid, nodename); xfree(nodename); next = pmixp_info_job_host(_ring_next_id(coll)); prev = pmixp_info_job_host(_ring_prev_id(coll)); xstrfmtcat(out_str,"neighbor id: next %d:%s, prev %d:%s", _ring_next_id(coll), next, _ring_prev_id(coll), prev); PMIXP_ERROR("%s", out_str); xfree(next); xfree(prev); xfree(out_str); for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) { pmixp_coll_ring_ctx_t *coll_ctx = &ring->ctx_array[i]; PMIXP_ERROR("Context ptr=%p, #%d, in-use=%d", coll_ctx, i, coll_ctx->in_use); if (coll_ctx->in_use) { int id; char *done_contrib, *wait_contrib; hostlist_t hl_done_contrib, hl_wait_contrib; pmixp_hostset_from_ranges(coll->pset.procs, coll->pset.nprocs, &hl_done_contrib); hl_wait_contrib = hostlist_copy(hl_done_contrib); PMIXP_ERROR("\t seq=%d contribs: loc=%d/prev=%d/fwd=%d", coll_ctx->seq, coll_ctx->contrib_local, coll_ctx->contrib_prev, coll_ctx->forward_cnt); PMIXP_ERROR("\t neighbor contribs [%d]:", coll->peers_cnt); for (id = 0; id < coll->peers_cnt; id++) { char *nodename = pmixp_info_job_host(id); if(coll_ctx->contrib_map[id]) { hostlist_delete_host(hl_wait_contrib, nodename); } else { hostlist_delete_host(hl_done_contrib, nodename); } xfree(nodename); } done_contrib = slurm_hostlist_ranged_string_xmalloc( hl_done_contrib); wait_contrib = slurm_hostlist_ranged_string_xmalloc( hl_wait_contrib); PMIXP_ERROR("\t done contrib: %s", strlen(done_contrib) ? done_contrib : "-"); PMIXP_ERROR("\t wait contrib: %s", strlen(wait_contrib) ? wait_contrib : "-"); PMIXP_ERROR("\t status=%s", pmixp_coll_ring_state2str(coll_ctx->state)); PMIXP_ERROR("\t buf size=%u, remain=%u", size_buf(coll_ctx->ring_buf), remaining_buf(coll_ctx->ring_buf)); xfree(done_contrib); xfree(wait_contrib); hostlist_destroy(hl_done_contrib); hostlist_destroy(hl_wait_contrib); } } }