/* Accept the local (libpmix) contribution to the current collective.
 * The payload is appended to the aggregation buffer; once every expected
 * contribution is in, _progress_fan_in() will advance the state machine.
 * Returns SLURM_SUCCESS unconditionally.
 */
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution", pmixp_info_namespace(),
		    pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	slurm_mutex_lock(&coll->lock);

	/* A fresh collective sits in SYNC; the first contribution
	 * kicks it into the fan-in phase. */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* mark that the local part arrived and append its payload */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf),
	       data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	slurm_mutex_unlock(&coll->lock);

	/* we may have been the last missing piece - try to advance */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
		    pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
/* Deliver the fan-out payload to the local PMIx library and rearm the
 * collective for the next round. Caller is expected to hold coll->lock.
 * On callback delivery, ownership of `buf` passes to PMIx and it is
 * released later via pmixp_free_Buf.
 * NOTE(review): when coll->cbfunc is NULL, `buf` is not released here -
 * presumably the caller keeps ownership in that case; verify.
 */
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(),
		    pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);
	xassert(PMIXP_COLL_FAN_OUT == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* push the aggregated data into the PMIx library */
	if (NULL != coll->cbfunc) {
		size_t size = remaining_buf(buf);
		void *data = get_buf_data(buf) + get_buf_offset(buf);

		PMIXP_DEBUG("%s:%d: use the callback",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     pmixp_free_Buf, (void *)buf);
	}

	/* reset the structure so the next collective can start */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
		    pmixp_info_namespace(), pmixp_info_nodeid());
}
/* Entry point for broadcast (fan-out) data arriving at this node:
 * deliver it locally under the lock, then try to progress a possibly
 * already-started next collective.
 */
void pmixp_coll_bcast(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(),
		    pmixp_info_nodeid());

	slurm_mutex_lock(&coll->lock);
	_progres_fan_out(coll, buf);
	slurm_mutex_unlock(&coll->lock);

	/* The next collective may already be underway - try to progress.
	 * It's OK if we are in SYNC: that is a no-op. */
	_progress_fan_in(coll);
}
/* Dispatch one inbound server message based on its base header.
 * Takes ownership of `buf`: it is released at `exit`, except in the
 * DMDX case where ownership is handed to the PMIx callback and the
 * local handle is voided with an empty buffer.
 */
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);

		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* Unexpected event: either something went really
			 * wrong or the state machine is incorrect. This
			 * will 100% lead to an application hang, so kill
			 * the step outright. */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}
		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback, so protect the
		 * data by voiding the buffer. Use the statement below
		 * instead of (buf = NULL) to maintain encapsulation - in
		 * general `buf` is not a pointer but an opaque type. */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* In pingpong mode node 0 sends ping requests and the
		 * receiver is assumed to respond back to node 0. */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
/* * For this to work the following conditions supposed to be * satisfied: * - SLURM has to be configured with `--enable-debug` option * - jobstep needs to have at least two nodes * In this case communication exchange will be done between * the first two nodes. */ void pmixp_server_run_pp(void) { int i; size_t start, end, bound; /* ping is initiated by the nodeid == 0 * all the rest - just exit */ if (pmixp_info_nodeid()) { return; } start = 1 << _pmixp_pp_low; end = 1 << _pmixp_pp_up; bound = 1 << _pmixp_pp_bound; for (i = start; i <= end; i *= 2) { int count, iters = _pmixp_pp_siter; struct timeval tv1, tv2; double time; if (i >= bound) { iters = _pmixp_pp_liter; } if (!_pmixp_pp_same_thr) { /* warmup - 10% of iters # */ count = pmixp_server_pp_count() + iters/10; while (pmixp_server_pp_count() < count) { int cur_count = pmixp_server_pp_count(); pmixp_server_pp_send(1, i); while (cur_count == pmixp_server_pp_count()) { usleep(1); } } count = pmixp_server_pp_count() + iters; gettimeofday(&tv1, NULL); while (pmixp_server_pp_count() < count) { int cur_count = pmixp_server_pp_count(); /* Send the message to the (nodeid == 1) */ pmixp_server_pp_send(1, i); /* wait for the response */ while (cur_count == pmixp_server_pp_count()); } gettimeofday(&tv2, NULL); time = tv2.tv_sec + 1E-6 * tv2.tv_usec - (tv1.tv_sec + 1E-6 * tv1.tv_usec); /* Output measurements to the slurmd.log */ PMIXP_ERROR("latency: %d - %.9lf", i, time / iters ); } else { int count = iters + iters/10; slurm_mutex_lock(&_pmixp_pp_lock); _pmixp_pp_warmup = iters/10; _pmixp_pp_iters = iters; _pmixp_pp_count = 0; slurm_mutex_unlock(&_pmixp_pp_lock); /* initiate sends */ pmixp_server_pp_send(1, i); while (pmixp_server_pp_count() < count){ sched_yield(); } } } }
/* Try to advance the collective out of the fan-in phase.
 * Safe to call at any time: returns immediately unless we are in
 * FAN_IN with the local contribution and all children accounted for.
 * A non-root node forwards the aggregate to its parent; the root
 * broadcasts it to the children and delivers a copy to PMIx locally.
 */
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf = NULL;	/* root-only local copy of the payload */

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
		    pmixp_info_namespace(), pmixp_info_nodeid(),
		    coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of a race between the libpmix and slurm threads
		 * progress_fan_in can be called after we already moved to
		 * the next step. */
		goto unlock;
	}

	if (!coll->contrib_local ||
	    coll->contrib_cntr != coll->children_cnt) {
		/* not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective has parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
				coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		/* copy the payload excluding the reserved server header
		 * for local delivery to PMIx below */
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
		    pmixp_info_nodeid(), hostlist);

	/* hostlist == NULL is the singleton case - nothing to send */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where the parent
			 * is not ready to receive messages yet. Use a
			 * zero-size message to check the parent status
			 * first and then send the full message. */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				       get_buf_data(coll->buf),
				       get_buf_offset(coll->buf), is_p2p);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR(
				"Cannot send data (size = %lu), to hostlist:\n%s",
				(uint64_t) get_buf_offset(coll->buf),
				hostlist);
			/* Return error indication to PMIx. Nodes that
			 * haven't received data will exit by a timeout.
			 * FIXME: do we need to do something with
			 * successfully finished nodes? */
			if (NULL != root_buf) {
				/* the root's local copy would otherwise
				 * leak on this error path */
				free_buf(root_buf);
			}
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* If we are root - push the data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename
	 * was in the hostlist. However that may lead to undesired side
	 * effects: we are blocked here sending data and cannot receive
	 * (receive is triggered in this thread after we leave this
	 * callback), so we would have to rely on buffering on the SLURM
	 * side. Better not to do so. */
	if (NULL == coll->parent_host) {
		/* I am the root - pass the data (copied above without the
		 * reserved server header) to PMIx and reset the collective */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	xfree(hostlist);	/* xfree(NULL) is a no-op */
	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
/* Accept a remote child's contribution to the collective, identified by
 * `nodename`. Duplicate deliveries (possible under timeouts/retransmits)
 * are detected via per-child counters and ignored. Returns SLURM_SUCCESS
 * unconditionally.
 */
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);
	pmixp_coll_sanity_check(coll);

	/* fix the collective status if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if (PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN"
			    " (next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we can
	 * receive a contribution a second time. Avoid duplications by
	 * checking our records. */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselves if we are running with no asserts */
		goto proceed;
	}
	if (0 < coll->ch_contribs[nodeid]) {
		/* May be 0 or 1. If greater - transmission skew, ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
			    nodeid, nodename);
		/* this is a duplication, skip */
		goto proceed;
	}

	/* append the child's payload to the aggregation buffer */
	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data,
	       size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase the number of individual contributions */
	coll->ch_contribs[nodeid]++;
	/* increase the number of total contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* NOTE(review): coll->state is read below without holding the
	 * lock; another thread may change it concurrently - confirm this
	 * is intentional (progress below re-checks under its own lock). */
	if (PMIXP_COLL_FAN_IN == coll->state) {
		/* make progress if we are in the fan-in state */
		_progress_fan_in(coll);
	}

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	default:
		/* don't pass NULL to "%s" below if an unexpected state
		 * value ever shows up */
		state = "unknown";
		break;
	}
	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
		    state);

	return SLURM_SUCCESS;
}