static int _serv_write(eio_obj_t *obj, List objs)
{
    /* sanity check */
    xassert(NULL != obj);
    if (obj->shutdown) {
        /* corresponding connection will be
         * cleaned up during plugin finalize */
        return 0;
    }

    PMIXP_DEBUG("fd = %d", obj->fd);
    pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;

    /* debug stub */
    pmixp_debug_hang(0);

    /* progress sends */
    pmixp_conn_progress_snd(conn);

    /* if we are done with this connection - remove it */
    if (!pmixp_conn_is_alive(conn)) {
        obj->shutdown = true;
        PMIXP_DEBUG("Connection finalized fd = %d", obj->fd);
        /* cleanup after this connection */
        eio_remove_obj(obj, objs);
        pmixp_conn_return(conn);
    }
    return 0;
}
static int _serv_read(eio_obj_t *obj, List objs)
{
    /* sanity check */
    xassert(NULL != obj);
    if (obj->shutdown) {
        /* corresponding connection will be
         * cleaned up during plugin finalize */
        return 0;
    }

    PMIXP_DEBUG("fd = %d", obj->fd);
    pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;
    bool proceed = true;

    /* debug stub */
    pmixp_debug_hang(0);

    /* Read and process all received messages */
    while (proceed) {
        if (!pmixp_conn_progress_rcv(conn)) {
            proceed = false;
        }
        if (!pmixp_conn_is_alive(conn)) {
            obj->shutdown = true;
            PMIXP_DEBUG("Connection closed fd = %d", obj->fd);
            /* cleanup after this connection */
            eio_remove_obj(obj, objs);
            pmixp_conn_return(conn);
            proceed = false;
        }
    }
    return 0;
}
static bool _serv_writable(eio_obj_t *obj)
{
    /* sanity check */
    xassert(NULL != obj);
    if (obj->shutdown) {
        /* corresponding connection will be
         * cleaned up during plugin finalize */
        return false;
    }

    /* get I/O engine */
    pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;
    pmixp_io_engine_t *eng = conn->eng;

    /* debug stub */
    pmixp_debug_hang(0);

    /* Invoke cleanup callbacks if any */
    pmixp_io_send_cleanup(eng);

    /* check if we have something to send */
    if (pmixp_io_send_pending(eng)) {
        return true;
    }
    return false;
}
/*
 * TODO: we need to keep track of the "me"
 * structures created here, because we need to
 * free them in "pmixp_stepd_finalize"
 */
void pmixp_server_slurm_conn(int fd)
{
    eio_obj_t *obj;
    pmixp_conn_t *conn = NULL;

    PMIXP_DEBUG("Request from fd = %d", fd);
    pmixp_debug_hang(0);

    /* Set nonblocking */
    fd_set_nonblocking(fd);
    fd_set_close_on_exec(fd);

    conn = pmixp_conn_new_temp(PMIXP_PROTO_SLURM, fd, _slurm_new_msg);

    /* try to process right here */
    pmixp_conn_progress_rcv(conn);
    if (!pmixp_conn_is_alive(conn)) {
        /* success, don't need this connection anymore */
        pmixp_conn_return(conn);
        return;
    }

    /* If it is a blocking operation: create an eio object to
     * handle it */
    obj = eio_obj_create(fd, &slurm_peer_ops, (void *)conn);
    eio_new_obj(pmixp_info_io(), obj);
}
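/*
 * Sketch (not part of the original source): the callbacks above are wired
 * into the eio engine through the slurm_peer_ops table referenced by
 * pmixp_server_slurm_conn(). The exact field set below and the
 * _serv_readable() predicate are assumptions; this only illustrates how the
 * handlers plausibly plug into SLURM's struct io_operations.
 */
static struct io_operations slurm_peer_ops = {
    .readable     = _serv_readable,   /* assumed "ready to read" predicate */
    .writable     = _serv_writable,   /* do we have pending sends? */
    .handle_read  = _serv_read,       /* progress receives */
    .handle_write = _serv_write,      /* progress sends */
};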
static int _pmix_p2p_send_core(const char *nodename, const char *address,
                               const char *data, uint32_t len)
{
    int rc, timeout;
    slurm_msg_t msg;
    forward_data_msg_t req;
    List ret_list;
    ret_data_info_t *ret_data_info = NULL;

    pmixp_debug_hang(0);

    slurm_msg_t_init(&msg);

    PMIXP_DEBUG("nodelist=%s, address=%s, len=%u", nodename, address, len);
    req.address = (char *)address;
    req.len = len;
    /* there is not much we can do here - just cast away the const */
    req.data = (char *)data;

    msg.msg_type = REQUEST_FORWARD_DATA;
    msg.data = &req;

    if (slurm_conf_get_addr(nodename, &msg.address) == SLURM_ERROR) {
        PMIXP_ERROR("Can't find address for host "
                    "%s, check slurm.conf", nodename);
        return SLURM_ERROR;
    }

    timeout = slurm_get_msg_timeout() * 1000;
    msg.forward.timeout = timeout;
    msg.forward.cnt = 0;
    msg.forward.nodelist = NULL;
    ret_list = slurm_send_addr_recv_msgs(&msg, (char *)nodename, timeout);
    if (!ret_list) {
        /* This should never happen: when this was written,
         * slurm_send_addr_recv_msgs always returned a list */
        PMIXP_ERROR("No return list given from "
                    "slurm_send_addr_recv_msgs spawned for %s",
                    nodename);
        return SLURM_ERROR;
    } else if ((errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR) &&
               !list_count(ret_list)) {
        PMIXP_ERROR("failed to send to %s, errno=%d", nodename, errno);
        return SLURM_ERROR;
    }

    rc = SLURM_SUCCESS;
    while ((ret_data_info = list_pop(ret_list))) {
        int temp_rc = slurm_get_return_code(ret_data_info->type,
                                            ret_data_info->data);
        if (temp_rc != SLURM_SUCCESS)
            rc = temp_rc;
        destroy_data_info(ret_data_info);
    }

    FREE_NULL_LIST(ret_list);
    return rc;
}
int pmixp_p2p_send(const char *nodename, const char *address, const char *data,
                   uint32_t len, unsigned int start_delay,
                   unsigned int retry_cnt, int silent)
{
    int retry = 0, rc;
    unsigned int delay = start_delay; /* in milliseconds */

    pmixp_debug_hang(0);

    while (1) {
        if (!silent && retry >= 1) {
            PMIXP_DEBUG("send failed, rc=%d, try #%d", rc, retry);
        }
        rc = _pmix_p2p_send_core(nodename, address, data, len);
        if (rc == SLURM_SUCCESS)
            break;

        retry++;
        if (retry >= retry_cnt) {
            PMIXP_ERROR("send failed, rc=%d, exceeded the retry limit", rc);
            break;
        }

        /* wait with an exponentially increasing delay */
        struct timespec ts = {(delay / 1000), ((delay % 1000) * 1000000)};
        nanosleep(&ts, NULL);
        delay *= 2;
    }

    return rc;
}
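/*
 * Sketch (not part of the original source): a minimal caller of
 * pmixp_p2p_send() illustrating the retry parameters. The node name,
 * socket address and payload are made-up values; only the backoff behavior
 * (start_delay doubled after every failed attempt, up to retry_cnt attempts)
 * follows from the function above.
 */
static void _p2p_send_example(void)
{
    static const char payload[] = "ping";

    /* up to 5 attempts, waiting 100, 200, 400, 800 ms between them */
    int rc = pmixp_p2p_send("node001", "/tmp/example_usock", payload,
                            sizeof(payload), 100, 5, 0);
    if (rc != SLURM_SUCCESS)
        PMIXP_ERROR("example send failed, rc=%d", rc);
}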
static int _serv_read(eio_obj_t *obj, List objs)
{
    PMIXP_DEBUG("fd = %d", obj->fd);
    pmixp_io_engine_t *me = (pmixp_io_engine_t *)obj->arg;
    bool proceed = true;

    pmixp_debug_hang(0);

    /* Read and process all received messages */
    while (proceed) {
        switch (_process_message(me)) {
        case 2:
            /* connection is finished - shut it down */
            obj->shutdown = true;
            PMIXP_DEBUG("Connection finalized fd = %d", obj->fd);
            /* cleanup after this connection */
            eio_remove_obj(obj, objs);
            xfree(me);
            /* fall through */
        case 0:
            /* stop the read loop */
            proceed = false;
            /* fall through */
        case 1:
            /* keep processing messages */
            break;
        }
    }
    return 0;
}
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
    char *nodename = NULL;
    int rc;

    if (hdr->nodeid != _ring_prev_id(coll)) {
        nodename = pmixp_info_job_host(hdr->nodeid);
        PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
                    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
        xfree(nodename);
        return SLURM_ERROR;
    }
    rc = pmixp_coll_check(coll, hdr->seq);
    if (PMIXP_COLL_REQ_FAILURE == rc) {
        /* This is an unacceptable event: either something went
         * really wrong or the state machine is incorrect.
         * This will 100% lead to application hang.
         */
        nodename = pmixp_info_job_host(hdr->nodeid);
        PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
                    hdr->seq, nodename, hdr->nodeid, coll->seq);
        pmixp_debug_hang(0); /* enable hang to debug this! */
        slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
                            SIGKILL);
        xfree(nodename);
        return SLURM_SUCCESS;
    } else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
        nodename = pmixp_info_job_host(hdr->nodeid);
        PMIXP_ERROR("Wrong collective seq. #%d from nodeid %u, current is %d, skip this message",
                    hdr->seq, hdr->nodeid, coll->seq);
        xfree(nodename);
#endif
        return SLURM_ERROR;
    }
    return SLURM_SUCCESS;
}
pmix_status_t fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
                         const pmix_info_t info[], size_t ninfo, char *data,
                         size_t ndata, pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
    PMIXP_DEBUG("called");
    pmixp_coll_t *coll;
    pmixp_coll_type_t type = PMIXP_COLL_TYPE_FENCE;
    pmix_status_t status = PMIX_SUCCESS;

    pmixp_debug_hang(0);

    coll = pmixp_state_coll_get(type, procs, nprocs);
    pmixp_coll_set_callback(coll, cbfunc, cbdata);
    if (SLURM_SUCCESS != pmixp_coll_contrib_local(coll, data, ndata)) {
        /* report the failure instead of silently signalling success */
        status = PMIX_ERROR;
        goto error;
    }

    return PMIX_SUCCESS;
error:
    cbfunc(status, NULL, 0, cbdata, NULL, NULL);
    return status;
}
/*
 * For this to work the following conditions have to be
 * satisfied:
 * - SLURM has to be configured with the `--enable-debug` option
 * - the job step needs to span at least two nodes
 * In this case the communication exchange will be done between
 * the first two nodes.
 */
void pmixp_server_run_cperf()
{
    int size;
    size_t start, end, bound;

    pmixp_debug_hang(0);

    start = 1 << _pmixp_cperf_low;
    end = 1 << _pmixp_cperf_up;
    bound = 1 << _pmixp_cperf_bound;

    for (size = start; size <= end; size *= 2) {
        int j, iters = _pmixp_cperf_siter;
        struct timeval tv1, tv2;
        if (size >= bound) {
            iters = _pmixp_cperf_liter;
        }
        double times[iters];
        char *data = xmalloc(size);

        PMIXP_ERROR("coll perf %d", size);

        for (j = 0; j < iters; j++) {
            gettimeofday(&tv1, NULL);
            _pmixp_server_cperf_iter(data, size);
            gettimeofday(&tv2, NULL);
            times[j] = tv2.tv_sec + 1E-6 * tv2.tv_usec -
                       (tv1.tv_sec + 1E-6 * tv1.tv_usec);
        }

        for (j = 0; j < iters; j++) {
            /* Output measurements to the slurmd.log */
            PMIXP_ERROR("\t%d %d: %.9lf", j, size, times[j]);
        }
        xfree(data);
    }
}
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
    int rc;

    switch (hdr->type) {
    case PMIXP_MSG_FAN_IN:
    case PMIXP_MSG_FAN_OUT: {
        pmixp_coll_t *coll;
        pmixp_proc_t *procs = NULL;
        size_t nprocs = 0;
        pmixp_coll_type_t type = 0;
        int c_nodeid;

        rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
                                    &procs, &nprocs);
        if (SLURM_SUCCESS != rc) {
            char *nodename = pmixp_info_job_host(hdr->nodeid);
            PMIXP_ERROR("Bad message header from node %s", nodename);
            xfree(nodename);
            goto exit;
        }
        coll = pmixp_state_coll_get(type, procs, nprocs);
        xfree(procs);

        PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
                    "type = %s, seq = %d",
                    hdr->nodeid,
                    ((PMIXP_MSG_FAN_IN == hdr->type) ?
                     "fan-in" : "fan-out"),
                    hdr->seq);
        rc = pmixp_coll_check_seq(coll, hdr->seq);
        if (PMIXP_COLL_REQ_FAILURE == rc) {
            /* This is an unacceptable event: either something went
             * really wrong or the state machine is incorrect.
             * This will 100% lead to application hang.
             */
            char *nodename = pmixp_info_job_host(hdr->nodeid);
            PMIXP_ERROR("Bad collective seq. #%d from %s, current"
                        " is %d", hdr->seq, nodename, coll->seq);
            pmixp_debug_hang(0); /* enable hang to debug this! */
            slurm_kill_job_step(pmixp_info_jobid(),
                                pmixp_info_stepid(), SIGKILL);
            xfree(nodename);
            break;
        } else if (PMIXP_COLL_REQ_SKIP == rc) {
            PMIXP_DEBUG("Wrong collective seq. #%d from"
                        " nodeid %u, current is %d, skip "
                        "this message", hdr->seq, hdr->nodeid,
                        coll->seq);
            goto exit;
        }

        if (PMIXP_MSG_FAN_IN == hdr->type) {
            pmixp_coll_contrib_child(coll, hdr->nodeid,
                                     hdr->seq, buf);
        } else {
            pmixp_coll_contrib_parent(coll, hdr->nodeid,
                                      hdr->seq, buf);
        }

        break;
    }
    case PMIXP_MSG_DMDX: {
        pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
        /* buf will be free'd by the PMIx callback so
         * protect the data by voiding the buffer.
         * Use the statement below instead of (buf = NULL)
         * to maintain encapsulation - in general `buf` is
         * not a pointer, but an opaque type.
         */
        buf = create_buf(NULL, 0);
        break;
    }
    case PMIXP_MSG_INIT_DIRECT:
        PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
        break;
#ifndef NDEBUG
    case PMIXP_MSG_PINGPONG: {
        /* if the pingpong mode was activated -
         * node 0 sends ping requests
         * and the receiver is assumed to respond back to node 0 */
        int msize = remaining_buf(buf);

        if (pmixp_info_nodeid()) {
            pmixp_server_pp_send(0, msize);
        } else {
            if (pmixp_server_pp_same_thread()) {
                if (pmixp_server_pp_count() ==
                    pmixp_server_pp_warmups()) {
                    pmixp_server_pp_start();
                }
                if (!pmixp_server_pp_check_fini(msize)) {
                    pmixp_server_pp_send(1, msize);
                }
            }
        }
        pmixp_server_pp_inc();
        break;
    }
#endif
    default:
        PMIXP_ERROR("Unknown message type %d", hdr->type);
        break;
    }

exit:
    free_buf(buf);
}
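/*
 * Sketch (not part of the original source): the buffer-voiding idiom used in
 * the PMIXP_MSG_DMDX branch above. Once the real Buf has been handed to a
 * consumer that frees it asynchronously, the local handle is re-pointed at an
 * empty Buf so the unconditional free_buf() on the common exit path releases
 * only the placeholder. _consume_async() is a hypothetical stand-in for
 * pmixp_dmdx_process().
 */
static void _handoff_and_exit(Buf buf)
{
    _consume_async(buf);            /* takes ownership, frees buf later */
    buf = create_buf(NULL, 0);      /* void the local handle */
    /* ... further processing that eventually reaches the exit path ... */
    free_buf(buf);                  /* safe: frees only the empty Buf */
}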
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size,
                             pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
    int ret = SLURM_SUCCESS;

    pmixp_debug_hang(0);

    /* sanity check */
    pmixp_coll_sanity_check(coll);

    /* lock the structure */
    slurm_mutex_lock(&coll->lock);

#ifdef PMIXP_COLL_DEBUG
    PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%s, size=%zd",
                coll, coll->seq, pmixp_coll_state2str(coll->state), size);
#endif

    switch (coll->state) {
    case PMIXP_COLL_SYNC:
        /* change the state */
        coll->ts = time(NULL);
        /* fall-thru */
    case PMIXP_COLL_COLLECT:
        /* sanity check */
        break;
    case PMIXP_COLL_DOWNFWD:
        /* We are waiting for some send requests
         * to be finished, but the local node has started
         * the next contribution.
         * This is an OK situation, go ahead and store
         * it, the buffer with the contribution is not used
         * now.
         */
#ifdef PMIXP_COLL_DEBUG
        PMIXP_DEBUG("%p: contrib/loc: next coll!", coll);
#endif
        break;
    case PMIXP_COLL_UPFWD:
    case PMIXP_COLL_UPFWD_WSC:
    case PMIXP_COLL_UPFWD_WPC:
        /* this is not correct behavior, respond with an error. */
#ifdef PMIXP_COLL_DEBUG
        PMIXP_DEBUG("%p: contrib/loc: before prev coll is finished!", coll);
#endif
        ret = SLURM_ERROR;
        goto exit;
    default:
        /* FATAL: should not happen in normal workflow */
        PMIXP_ERROR("%p: local contrib while active collective, "
                    "state = %s",
                    coll, pmixp_coll_state2str(coll->state));
        xassert(0);
        abort();
    }

    if (coll->contrib_local) {
        /* Double contribution - reject */
        ret = SLURM_ERROR;
        goto exit;
    }

    /* save & mark local contribution */
    coll->contrib_local = true;
    pmixp_server_buf_reserve(coll->ufwd_buf, size);
    memcpy(get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf),
           data, size);
    set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

    /* setup callback info */
    coll->cbfunc = cbfunc;
    coll->cbdata = cbdata;

    /* check if the collective is ready to progress */
    _progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
    PMIXP_DEBUG("%p: finish, state=%s",
                coll, pmixp_coll_state2str(coll->state));
#endif

exit:
    /* unlock the structure */
    slurm_mutex_unlock(&coll->lock);
    return ret;
}
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
    send_header_t *hdr = &_hdr->send_hdr;
    char *nodename = pmixp_info_job_host(hdr->nodeid);
    Buf buf;
    int rc;

    buf = create_buf(payload, hdr->msgsize);

    switch (hdr->type) {
    case PMIXP_MSG_FAN_IN:
    case PMIXP_MSG_FAN_OUT: {
        pmixp_coll_t *coll;
        pmix_proc_t *procs = NULL;
        size_t nprocs = 0;
        pmixp_coll_type_t type = 0;

        rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
        if (SLURM_SUCCESS != rc) {
            PMIXP_ERROR("Bad message header from node %s", nodename);
            free_buf(buf);
            xfree(nodename);
            return;
        }
        coll = pmixp_state_coll_get(type, procs, nprocs);
        xfree(procs);

        PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
                    nodename,
                    (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
                    hdr->seq);
        rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
        if (PMIXP_COLL_REQ_FAILURE == rc) {
            /* This is an unacceptable event: either something went
             * really wrong or the state machine is incorrect.
             * This will 100% lead to application hang.
             */
            PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
                        hdr->seq, nodename, coll->seq);
            pmixp_debug_hang(0); /* enable hang to debug this! */
            slurm_kill_job_step(pmixp_info_jobid(),
                                pmixp_info_stepid(), SIGKILL);
            break;
        } else if (PMIXP_COLL_REQ_SKIP == rc) {
            PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
                        hdr->seq, nodename, coll->seq);
            free_buf(buf);
            break;
        }

        if (PMIXP_MSG_FAN_IN == hdr->type) {
            pmixp_coll_contrib_node(coll, nodename, buf);
            /* we don't need this buffer anymore */
            free_buf(buf);
        } else {
            pmixp_coll_bcast(coll, buf);
            /* buf will be free'd by the PMIx callback */
        }
        break;
    }
    case PMIXP_MSG_DMDX: {
        pmixp_dmdx_process(buf, nodename, hdr->seq);
        break;
    }
    case PMIXP_MSG_HEALTH_CHK: {
        /* this is just a health ping.
         * TODO: can we do something more sophisticated? */
        free_buf(buf);
        break;
    }
    default:
        PMIXP_ERROR("Unknown message type %d", hdr->type);
        break;
    }

    xfree(nodename);
}
int pmixp_libpmix_job_set(void)
{
    List lresp;
    pmix_info_t *info;
    int ninfo;
    ListIterator it;
    pmix_info_t *kvp;
    int i, rc;
    uid_t uid = pmixp_info_jobuid();
    gid_t gid = pmixp_info_jobgid();
    _register_caddy_t *register_caddy;

    register_caddy = xmalloc(sizeof(_register_caddy_t) *
                             (pmixp_info_tasks_loc() + 1));
    pmixp_debug_hang(0);

    /* Use a list to safely expand/reduce key-value pairs. */
    lresp = list_create(pmixp_xfree_xmalloced);

    _general_proc_info(lresp);

    _set_tmpdirs(lresp);

    _set_procdatas(lresp);

    _set_sizeinfo(lresp);

    if (SLURM_SUCCESS != _set_mapsinfo(lresp)) {
        list_destroy(lresp);
        PMIXP_ERROR("Can't build nodemap");
        return SLURM_ERROR;
    }

    _set_localinfo(lresp);

    ninfo = list_count(lresp);
    PMIX_INFO_CREATE(info, ninfo);

    it = list_iterator_create(lresp);
    i = 0;
    while (NULL != (kvp = list_next(it))) {
        info[i] = *kvp;
        i++;
    }
    list_iterator_destroy(it);
    list_destroy(lresp);

    register_caddy[0].active = 1;
    rc = PMIx_server_register_nspace(pmixp_info_namespace(),
                                     pmixp_info_tasks_loc(), info, ninfo,
                                     _release_cb, &register_caddy[0]);

    if (PMIX_SUCCESS != rc) {
        PMIXP_ERROR("Cannot register namespace %s, nlocalproc=%d, "
                    "ninfo = %d", pmixp_info_namespace(),
                    pmixp_info_tasks_loc(), ninfo);
        return SLURM_ERROR;
    }

    PMIXP_DEBUG("task initialization");
    for (i = 0; i < pmixp_info_tasks_loc(); i++) {
        pmix_proc_t proc;
        register_caddy[i + 1].active = 1;
        strncpy(proc.nspace, pmixp_info_namespace(), PMIX_MAX_NSLEN);
        proc.rank = pmixp_info_taskid(i);
        rc = PMIx_server_register_client(&proc, uid, gid, NULL,
                                         _release_cb,
                                         &register_caddy[i + 1]);
        if (PMIX_SUCCESS != rc) {
            PMIXP_ERROR("Cannot register client %d(%d) in namespace %s",
                        pmixp_info_taskid(i), i,
                        pmixp_info_namespace());
            return SLURM_ERROR;
        }
    }

    /* wait for all registration actions to finish */
    while (1) {
        int exit_flag = 1;
        struct timespec ts;
        ts.tv_sec = 0;
        ts.tv_nsec = 100;

        for (i = 0; i < pmixp_info_tasks_loc() + 1; i++) {
            if (register_caddy[i].active) {
                exit_flag = 0;
            }
        }
        if (exit_flag) {
            break;
        }
        nanosleep(&ts, NULL);
    }
    PMIX_INFO_FREE(info, ninfo);
    xfree(register_caddy);
    return SLURM_SUCCESS;
}
static void _progress_fan_in(pmixp_coll_t *coll)
{
    pmixp_srv_cmd_t type;
    const char *addr = pmixp_info_srv_addr();
    char *hostlist = NULL;
    int rc, is_p2p = 0;
    Buf root_buf;

    PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
                pmixp_info_namespace(), pmixp_info_nodeid(),
                coll->contrib_local, coll->contrib_cntr);

    /* lock the collective */
    slurm_mutex_lock(&coll->lock);

    pmixp_coll_sanity_check(coll);

    if (PMIXP_COLL_FAN_IN != coll->state) {
        /* In case of a race condition between the libpmix and
         * slurm threads, progress_fan_in can be called
         * after we moved to the next step. */
        goto unlock;
    }

    if (!coll->contrib_local ||
        coll->contrib_cntr != coll->children_cnt) {
        /* Not yet ready to go to the next step */
        goto unlock;
    }

    /* The root of the collective will have parent_host == NULL */
    if (NULL != coll->parent_host) {
        hostlist = xstrdup(coll->parent_host);
        type = PMIXP_MSG_FAN_IN;
        PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
                    pmixp_info_namespace(), pmixp_info_nodeid());
        is_p2p = 1;
    } else {
        if (0 < hostlist_count(coll->all_children)) {
            hostlist = hostlist_ranged_string_xmalloc(
                coll->all_children);
            type = PMIXP_MSG_FAN_OUT;
            pmixp_debug_hang(0);
        }
        rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
        xassert(0 == rc);
        PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
                    pmixp_info_namespace(), pmixp_info_nodeid());
    }

    PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
                pmixp_info_nodeid(), hostlist);

    /* Check for the singleton case */
    if (NULL != hostlist) {
        if (0 == coll->seq && NULL != coll->parent_host) {
            /* This is the first message sent to the parent.
             * There might be a race condition where the parent
             * is not yet ready to receive messages.
             * Use a zero-size message to check the parent status
             * first and then send the full message. */
            pmixp_server_health_chk(hostlist, addr);
        }
        rc = pmixp_server_send(hostlist, type, coll->seq, addr,
                               get_buf_data(coll->buf),
                               get_buf_offset(coll->buf), is_p2p);

        if (SLURM_SUCCESS != rc) {
            PMIXP_ERROR("Cannot send data (size = %lu), to hostlist:\n%s",
                        (uint64_t)get_buf_offset(coll->buf), hostlist);
            /* return an error indication to PMIx. Nodes that haven't
             * received the data will exit by a timeout.
             * FIXME: do we need to do something with successfully
             * finished nodes? */
            goto unlock;
        }
    }

    /* transit to the next state */
    _fan_in_finished(coll);

    /* If we are the root - push the data to PMIx here.
     * Originally there was a homogeneous solution: the root nodename
     * was in the hostlist. However this may lead to undesired side
     * effects: we are blocked here sending data and cannot receive
     * (it will be triggered in this thread after we leave this
     * callback), so we would have to rely on buffering on the SLURM
     * side. Better not to do so. */
    if (NULL == coll->parent_host) {
        /* if I am the root - pass the data to PMIx and reset
         * the collective here */
        /* copy payload excluding the reserved server header */
        _progres_fan_out(coll, root_buf);
    }

unlock:
    if (NULL != hostlist) {
        xfree(hostlist);
    }

    /* unlock the collective */
    slurm_mutex_unlock(&coll->lock);
}