/*
 * Send slurm message with timeout
 *
 * The descriptor is switched to non-blocking mode for the duration of the
 * call and its original file status flags are restored before returning
 * (when they could be read in the first place).
 *
 * IN fd      - socket to write to
 * IN buf     - data to send
 * IN size    - number of bytes of buf to send
 * IN flags   - flags handed to the underlying send (and to the one-byte
 *              liveness recv, see below)
 * IN timeout - overall budget for the whole transfer; the remaining budget
 *              is passed to poll(), so it is expressed in poll()'s
 *              timeout unit (milliseconds)
 * RET message size (as specified in argument) or SLURM_ERROR on error,
 *     with the slurm errno set to describe the failure
 *
 * NOTE(review): the byte count is accumulated in an int, so sizes above
 * INT_MAX cannot be reported correctly — confirm callers never pass such
 * sizes.
 */
int _slurm_send_timeout(slurm_fd_t fd, char *buf, size_t size,
			uint32_t flags, int timeout)
{
	int rc;
	int sent = 0;
	int fd_flags;
	struct pollfd ufds;
	struct timeval tstart;
	int timeleft = timeout;
	char temp[2];

	ufds.fd     = fd;
	ufds.events = POLLOUT;

	/* Remember the current flags so they can be restored at "done" */
	fd_flags = _slurm_fcntl(fd, F_GETFL);
	fd_set_nonblocking(fd);

	gettimeofday(&tstart, NULL);

	/* sent is never negative inside the loop: every error path leaves
	 * through "done" right after assigning SLURM_ERROR */
	while ((size_t)sent < size) {
		/* Recompute the remaining budget on every pass so retries
		 * (EINTR, EAGAIN, spurious wakeups) cannot extend it */
		timeleft = timeout - _tot_wait(&tstart);
		if (timeleft <= 0) {
			debug("_slurm_send_timeout at %d of %zu, timeout",
			      sent, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT);
			sent = SLURM_ERROR;
			goto done;
		}

		if ((rc = poll(&ufds, 1, timeleft)) <= 0) {
			/* Timeout of this poll or a transient error: go back
			 * to the budget check above */
			if ((rc == 0) || (errno == EINTR) || (errno == EAGAIN))
				continue;
			else {
				debug("_slurm_send_timeout at %d of %zu, "
				      "poll error: %s",
				      sent, size, strerror(errno));
				slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
				sent = SLURM_ERROR;
				goto done;
			}
		}

		/*
		 * Check here to make sure the socket really is there.
		 * If not then exit out and notify the sender.  This
		 * is here since a write doesn't always tell you the
		 * socket is gone, but getting 0 back from a
		 * nonblocking read means just that.
		 *
		 * NOTE(review): the byte read into temp is discarded; unless
		 * the caller's flags make this a peek, a pending inbound
		 * byte would be consumed here — confirm against callers.
		 */
		if (ufds.revents & POLLERR) {
			debug("_slurm_send_timeout: Socket POLLERR");
			slurm_seterrno(ENOTCONN);
			sent = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLHUP) || (ufds.revents & POLLNVAL) ||
		    (_slurm_recv(fd, &temp, 1, flags) == 0)) {
			debug2("_slurm_send_timeout: Socket no longer there");
			slurm_seterrno(ENOTCONN);
			sent = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLOUT) != POLLOUT) {
			/* Logged only; the send below is still attempted */
			error("_slurm_send_timeout: Poll failure, revents:%d",
			      ufds.revents);
		}

		rc = _slurm_send(fd, &buf[sent], (size - sent), flags);
		if (rc < 0) {
			/* Capture errno before any other call (logging
			 * included) gets a chance to overwrite it */
			int send_errno = errno;

			if (send_errno == EINTR)
				continue;
			debug("_slurm_send_timeout at %d of %zu, "
			      "send error: %s",
			      sent, size, strerror(send_errno));
			if (send_errno == EAGAIN) {	/* poll() lied to us */
				usleep(10000);
				continue;
			}
			slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
			sent = SLURM_ERROR;
			goto done;
		}
		if (rc == 0) {
			debug("_slurm_send_timeout at %d of %zu, "
			      "sent zero bytes", sent, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT);
			sent = SLURM_ERROR;
			goto done;
		}

		sent += rc;
	}

done:
	/* Reset fd flags to prior state, preserve errno */
	if (fd_flags != SLURM_PROTOCOL_ERROR) {
		int slurm_err = slurm_get_errno();
		_slurm_fcntl(fd, F_SETFL, fd_flags);
		slurm_seterrno(slurm_err);
	}

	return sent;
}
/*
 * Send a server message without blocking the caller on completion.
 *
 * The message goes over a direct connection only when direct connections
 * are enabled, the endpoint is addressed by node id, and that node's
 * connection is in the EP_SENT or CONNECTED state. In every other valid
 * case it is sent through the SLURM protocol.
 *
 * On the SLURM path complete_cb is invoked inline with the send status;
 * on the direct path complete_cb and cb_data are handed to _direct_send().
 *
 * An unknown endpoint type or an unexpected direct-connection state is
 * treated as a programming error: it is logged and the process aborts.
 *
 * RET SLURM_SUCCESS in all non-aborting cases; the SLURM-path send status
 *     is reported through complete_cb, not through the return value.
 */
int pmixp_server_send_nb(pmixp_ep_t *ep, pmixp_srv_cmd_t type,
			 uint32_t seq, Buf buf,
			 pmixp_server_sent_cb_t complete_cb,
			 void *cb_data)
{
	pmixp_base_hdr_t base_hdr;
	pmixp_dconn_t *dconn = NULL;
	int use_direct = 0;

	PMIXP_BASE_HDR_SETUP(base_hdr, type, seq, buf);

	/* Direct sends are only considered when the feature is enabled and
	 * the endpoint is not a hostlist */
	if (pmixp_info_srv_direct_conn() && (PMIXP_EP_HLIST != ep->type)) {
		int hostid;

		if (PMIXP_EP_NOIDEID != ep->type) {
			PMIXP_ERROR("Bad value of the endpoint type: %d",
				    (int)ep->type);
			xassert( PMIXP_EP_HLIST == ep->type ||
				 PMIXP_EP_NOIDEID == ep->type);
			abort();
		}

		hostid = ep->ep.nodeid;
		xassert(0 <= hostid);
		dconn = pmixp_dconn_lock(hostid);

		switch (pmixp_dconn_state(dconn)) {
		case PMIXP_DIRECT_EP_SENT:
		case PMIXP_DIRECT_CONNECTED:
			/* the lock stays held until the direct send below
			 * has been issued */
			use_direct = 1;
			break;
		case PMIXP_DIRECT_INIT:
			/* mark the request as sent, drop the lock and let
			 * this message travel over SLURM */
			pmixp_dconn_req_sent(dconn);
			pmixp_dconn_unlock(dconn);
			break;
		default: {
			/* this is a bug! */
			pmixp_dconn_state_t state = pmixp_dconn_state(dconn);

			pmixp_dconn_unlock(dconn);
			PMIXP_ERROR("Bad direct connection state: %d",
				    (int)state);
			xassert( (state == PMIXP_DIRECT_INIT) ||
				 (state == PMIXP_DIRECT_EP_SENT) ||
				 (state == PMIXP_DIRECT_CONNECTED) );
			abort();
		}
		}
	}

	if (use_direct) {
		xassert( NULL != dconn );
		_direct_send(dconn, ep, base_hdr, buf, complete_cb, cb_data);
		pmixp_dconn_unlock(dconn);
	} else {
		int status = _slurm_send(ep, base_hdr, buf);

		complete_cb(status, PMIXP_P2P_INLINE, cb_data);
	}

	return SLURM_SUCCESS;
}