static ucs_status_t ucp_wireup_conn_ack_handler(void *arg, void *data, size_t length, void *desc) { ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; ucp_ep_h ep; UCS_ASYNC_BLOCK(&worker->async); ucp_wireup_log(worker, UCP_AM_ID_CONN_ACK, msg, 0); ep = ucp_worker_find_ep(worker, msg->src_uuid); if (ep == NULL) { ucs_debug("ignoring connection request - not exists"); goto out; } if (ep->state & UCP_EP_STATE_REMOTE_CONNECTED) { ucs_debug("ignoring conn_ack - remote already connected"); goto out; } /* * If we got CONN_ACK, it means remote side got our reply, and also * connected to us. */ ucp_ep_remote_connected(ep); out: UCS_ASYNC_UNBLOCK(&worker->async); return UCS_OK; }
static UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_ep_t) { uct_rdmacm_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_rdmacm_iface_t); ucs_debug("rdmacm_ep %p: destroying", self); UCS_ASYNC_BLOCK(iface->super.worker->async); if (self->is_on_pending) { ucs_list_del(&self->list_elem); self->is_on_pending = 0; } /* remove the slow progress function in case it was placed on the slow progress * chain but wasn't invoked yet */ uct_worker_progress_unregister_safe(&iface->super.worker->super, &self->slow_prog_id); /* if the destroyed ep is the active one on the iface, mark it as destroyed * so that arriving events on the iface won't try to access this ep */ if (iface->ep == self) { iface->ep = UCT_RDMACM_IFACE_BLOCKED_NO_EP; } UCS_ASYNC_UNBLOCK(iface->super.worker->async); ucs_free(self->priv_data); }
/**
 * Create an endpoint to the peer described by a packed address, or return
 * the endpoint the worker already has for that peer.
 *
 * The address is unpacked into a UUID, a peer name and a list of address
 * entries. If the worker already holds an endpoint for the UUID it is
 * returned as-is. Otherwise a new endpoint is allocated, its lanes are
 * initialized, and - unless the endpoint is already flagged
 * UCP_EP_FLAG_LOCAL_CONNECTED - an initial wireup request is sent.
 * The whole operation runs with the worker's async context blocked.
 *
 * @param worker   Worker to create the endpoint on.
 * @param address  Packed remote address.
 * @param ep_p     Filled with the endpoint handle on success only.
 *
 * @return UCS_OK on success, otherwise the status of the step that failed.
 */
ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_address_t *address,
                           ucp_ep_h *ep_p)
{
    char peer_name[UCP_WORKER_NAME_MAX];
    uint8_t addr_indices[UCP_MAX_LANES];
    ucp_address_entry_t *entries;
    unsigned num_entries;
    uint64_t peer_uuid;
    ucs_status_t status;
    ucp_ep_h ep;

    UCS_ASYNC_BLOCK(&worker->async);

    status = ucp_address_unpack(address, &peer_uuid, peer_name,
                                sizeof(peer_name), &num_entries, &entries);
    if (status != UCS_OK) {
        ucs_error("failed to unpack remote address: %s",
                  ucs_status_string(status));
        goto out;
    }

    ep = ucp_worker_ep_find(worker, peer_uuid);
    if (ep != NULL) {
        /* TODO handle a case where the existing endpoint is incomplete */
        *ep_p  = ep;
        status = UCS_OK;
        goto out_free_address;
    }

    /* No endpoint for this peer yet - allocate one */
    status = ucp_ep_new(worker, peer_uuid, peer_name, "from api call", &ep);
    if (status != UCS_OK) {
        goto out_free_address;
    }

    /* Set up the transport endpoints */
    status = ucp_wireup_init_lanes(ep, num_entries, entries, addr_indices);
    if (status != UCS_OK) {
        ucp_ep_destroy(ep);
        goto out_free_address;
    }

    /* Kick off the wireup protocol when not locally connected yet */
    if ((ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED) == 0) {
        status = ucp_wireup_send_request(ep);
        if (status != UCS_OK) {
            ucp_ep_destroy(ep);
            goto out_free_address;
        }
    }

    *ep_p = ep;

out_free_address:
    ucs_free(entries);
out:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}
/**
 * Constructor of a TCP endpoint.
 *
 * Allocates the endpoint buffer, obtains a socket (a new one connected to
 * @a dest_addr when @a fd is -1, otherwise the descriptor passed in), applies
 * O_NONBLOCK and the interface socket options to it, registers it with epoll
 * and appends the endpoint to the interface's endpoint list.
 *
 * @param iface      Interface the endpoint belongs to.
 * @param fd         Existing socket to use, or -1 to create and connect one.
 * @param dest_addr  Address to connect to; only used when @a fd is -1.
 *
 * @return UCS_OK on success, UCS_ERR_NO_MEMORY if the buffer cannot be
 *         allocated, or the status of the socket operation that failed.
 *         On failure every resource acquired here is released again.
 */
static UCS_CLASS_INIT_FUNC(uct_tcp_ep_t, uct_tcp_iface_t *iface,
                           int fd, const struct sockaddr_in *dest_addr)
{
    ucs_status_t status;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super)

    self->buf = ucs_malloc(iface->config.buf_size, "tcp_buf");
    if (self->buf == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    self->events = 0;
    self->offset = 0;
    self->length = 0;
    ucs_queue_head_init(&self->pending_q);

    if (fd == -1) {
        status = ucs_tcpip_socket_create(&self->fd);
        if (status != UCS_OK) {
            goto err_free_buf;
        }

        /* TODO use non-blocking connect */
        status = uct_tcp_socket_connect(self->fd, dest_addr);
        if (status != UCS_OK) {
            goto err_close;
        }
    } else {
        self->fd = fd;
    }

    status = ucs_sys_fcntl_modfl(self->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_close;
    }

    status = uct_tcp_iface_set_sockopt(iface, self->fd);
    if (status != UCS_OK) {
        goto err_close;
    }

    uct_tcp_ep_epoll_ctl(self, EPOLL_CTL_ADD);

    /* The endpoint list is modified with the async context blocked, same as
     * in the endpoint destructor */
    UCS_ASYNC_BLOCK(iface->super.worker->async);
    ucs_list_add_tail(&iface->ep_list, &self->list);
    UCS_ASYNC_UNBLOCK(iface->super.worker->async);

    ucs_debug("tcp_ep %p: created on iface %p, fd %d", self, iface, self->fd);
    return UCS_OK;

err_close:
    /* NOTE(review): this also closes a descriptor that was handed in by the
     * caller (fd != -1) - confirm the caller does not close it again */
    close(self->fd);
err_free_buf:
    /* The buffer was allocated above and nothing else owns it yet, so it has
     * to be released here or it is leaked */
    ucs_free(self->buf);
    return status;
}
/**
 * Start a non-blocking disconnect of an endpoint.
 *
 * Thin wrapper which runs ucp_disconnect_nb_internal() with the async context
 * of the endpoint's worker blocked.
 *
 * @param ep  Endpoint to disconnect.
 *
 * @return Whatever ucp_disconnect_nb_internal() returned for @a ep.
 */
ucs_status_ptr_t ucp_disconnect_nb(ucp_ep_h ep)
{
    /* Fetch the worker up front; ep is not dereferenced again after the
     * internal call */
    ucp_worker_h worker = ep->worker;
    void *req;

    UCS_ASYNC_BLOCK(&worker->async);
    req = ucp_disconnect_nb_internal(ep);
    UCS_ASYNC_UNBLOCK(&worker->async);

    return req;
}
/**
 * Add a pending request on a CM endpoint.
 *
 * Records the owning endpoint in the request's private area and pushes the
 * request to the interface's notify queue, both with the worker's async
 * context blocked.
 *
 * @param tl_ep  Endpoint the request is posted on.
 * @param req    Pending request to queue.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_cm_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req)
{
    uct_cm_ep_t *cm_ep       = ucs_derived_of(tl_ep, uct_cm_ep_t);
    uct_cm_iface_t *cm_iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t);
    uct_cm_pending_req_priv_t *req_priv;

    /* Only locates the private area of the request; the store to it is done
     * below, inside the blocked section */
    req_priv = ucs_derived_of(uct_pending_req_priv(req),
                              uct_cm_pending_req_priv_t);

    UCS_ASYNC_BLOCK(cm_iface->super.super.worker->async);
    req_priv->ep = cm_ep;
    uct_pending_req_push(&cm_iface->notify_q, req);
    UCS_ASYNC_UNBLOCK(cm_iface->super.super.worker->async);

    return UCS_OK;
}
/**
 * Create an endpoint to the peer described by a packed address, or return
 * the endpoint the worker already has for that peer.
 *
 * The address is unpacked into a UUID, a peer name and a list of address
 * entries. An existing endpoint for the UUID is returned unchanged. Otherwise
 * the endpoint is created through ucp_ep_create_connected() and - unless it
 * is already flagged UCP_EP_FLAG_LOCAL_CONNECTED - an initial wireup request
 * is sent. Runs with the worker's async context blocked.
 *
 * @param worker   Worker to create the endpoint on.
 * @param address  Packed remote address.
 * @param ep_p     Filled with the endpoint handle on success only.
 *
 * @return UCS_OK on success, otherwise the status of the step that failed.
 */
ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_address_t *address,
                           ucp_ep_h *ep_p)
{
    char peer_name[UCP_WORKER_NAME_MAX];
    ucp_address_entry_t *entries;
    unsigned num_entries;
    uint64_t peer_uuid;
    ucs_status_t status;
    ucp_ep_h ep;

    UCS_ASYNC_BLOCK(&worker->async);

    status = ucp_address_unpack(address, &peer_uuid, peer_name,
                                sizeof(peer_name), &num_entries, &entries);
    if (status != UCS_OK) {
        ucs_error("failed to unpack remote address: %s",
                  ucs_status_string(status));
        goto out;
    }

    ep = ucp_worker_ep_find(worker, peer_uuid);
    if (ep != NULL) {
        /* TODO handle a case where the existing endpoint is incomplete */
        ucs_debug("returning existing ep %p which is already connected to %"PRIx64,
                  ep, ep->dest_uuid);
        *ep_p  = ep;
        status = UCS_OK;
        goto out_free_address;
    }

    status = ucp_ep_create_connected(worker, peer_uuid, peer_name, num_entries,
                                     entries, " from api call", &ep);
    if (status != UCS_OK) {
        goto out_free_address;
    }

    /* Kick off the wireup protocol when not locally connected yet */
    if ((ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED) == 0) {
        status = ucp_wireup_send_request(ep);
        if (status != UCS_OK) {
            ucp_ep_destroy(ep);
            goto out_free_address;
        }
    }

    *ep_p = ep;

out_free_address:
    ucs_free(entries);
out:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}
static ucs_status_t ucp_wireup_conn_req_handler(void *arg, void *data, size_t length, void *desc) { ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; ucs_status_t status; ucp_ep_h ep; UCS_ASYNC_BLOCK(&worker->async); ucp_wireup_log(worker, UCP_AM_ID_CONN_REQ, msg, 0); ep = ucp_worker_find_ep(worker, msg->src_uuid); if (ep == NULL) { ucs_debug("ignoring connection request - not exists"); goto out; } if (ep->state & UCP_EP_STATE_LOCAL_CONNECTED) { ucs_debug("ignoring connection request - already connected"); /* TODO allow active-passive connection establishment */ goto out; } if (ep->uct.rsc_index != msg->dst_rsc_index) { ucs_error("got connection request on a different resource (got: %d, expected: %d)", msg->dst_rsc_index, ep->uct.rsc_index); /* TODO send reject, and use different transport */ goto out; } status = uct_ep_connect_to_ep(ep->uct.next_ep, (struct sockaddr *)(msg + 1)); if (status != UCS_OK) { ucs_debug("failed to connect"); /* TODO send reject */ goto out; } ep->state |= UCP_EP_STATE_LOCAL_CONNECTED; status = ucp_ep_wireup_send(ep, ep->wireup_ep, UCP_AM_ID_CONN_REP, msg->src_rsc_index); if (status != UCS_OK) { goto out; } ep->state |= UCP_EP_STATE_CONN_REP_SENT; out: UCS_ASYNC_UNBLOCK(&worker->async); return UCS_OK; }
/**
 * Destroy an endpoint.
 *
 * With the worker's async context blocked, removes the endpoint from the
 * worker's endpoint hash and destroys its UCT endpoints. The endpoint memory
 * itself is freed after the async context is unblocked.
 *
 * @param ep  Endpoint to destroy; must not be used afterwards.
 */
void ucp_ep_destroy(ucp_ep_h ep)
{
    ucp_worker_h owner = ep->worker;

    ucs_debug("destroy ep %p", ep);

    UCS_ASYNC_BLOCK(&owner->async);
    sglib_hashed_ucp_ep_t_delete(owner->ep_hash, ep);
    ucp_ep_destory_uct_eps(ep);
    UCS_ASYNC_UNBLOCK(&owner->async);

    ucs_free(ep);
}
/**
 * Destructor of a TCP endpoint.
 *
 * Unlinks the endpoint from the list it is a member of (with the worker's
 * async context blocked), then frees its buffer and closes its socket.
 */
static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t)
{
    uct_tcp_iface_t *tcp_iface = ucs_derived_of(self->super.super.iface,
                                                uct_tcp_iface_t);

    ucs_debug("tcp_ep %p: destroying", self);

    /* List membership is changed only while async events are blocked */
    UCS_ASYNC_BLOCK(tcp_iface->super.worker->async);
    ucs_list_del(&self->list);
    UCS_ASYNC_UNBLOCK(tcp_iface->super.worker->async);

    ucs_free(self->buf);
    close(self->fd);
}
/**
 * Create an endpoint to the peer identified by an address, or return the
 * endpoint the worker already has for that peer.
 *
 * The peer UUID is read from the address. When the worker has no endpoint for
 * it, a new endpoint is allocated and wireup is started on it. Runs with the
 * worker's async context blocked.
 *
 * @param worker   Worker to create the endpoint on.
 * @param address  Remote address.
 * @param ep_p     Filled with the endpoint handle on success only.
 *
 * @return UCS_OK on success (including when an existing endpoint is
 *         returned), otherwise the status of ucp_ep_new() or
 *         ucp_wireup_start().
 */
ucs_status_t ucp_ep_create(ucp_worker_h worker, ucp_address_t *address,
                           ucp_ep_h *ep_p)
{
    uint64_t dest_uuid = ucp_address_uuid(address);
    char peer_name[UCP_PEER_NAME_MAX];
    ucs_status_t status;
    ucp_ep_h ep;

    UCS_ASYNC_BLOCK(&worker->async);

    ep = ucp_worker_ep_find(worker, dest_uuid);
    if (ep == NULL) {
        /* First contact with this peer: build the endpoint from scratch */
        ucp_address_peer_name(address, peer_name);

        status = ucp_ep_new(worker, dest_uuid, peer_name, " from api call", &ep);
        if (status != UCS_OK) {
            goto err;
        }

        status = ucp_wireup_start(ep, address);
        if (status != UCS_OK) {
            goto err_free;
        }
    } else {
        ucs_debug("returning existing ep %p which is already connected to %"PRIx64,
                  ep, ep->dest_uuid);
    }

    UCS_ASYNC_UNBLOCK(&worker->async);
    *ep_p = ep;
    return UCS_OK;

err_free:
    ucs_free(ep);
err:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}
static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, size_t length, void *desc) { ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; char peer_name[UCP_WORKER_NAME_MAX]; ucp_address_entry_t *address_list; unsigned address_count; ucs_status_t status; uint64_t uuid; UCS_ASYNC_BLOCK(&worker->async); status = ucp_address_unpack(msg + 1, &uuid, peer_name, UCP_WORKER_NAME_MAX, &address_count, &address_list); if (status != UCS_OK) { ucs_error("failed to unpack address: %s", ucs_status_string(status)); goto out; } if (msg->type == UCP_WIREUP_MSG_ACK) { ucs_assert(address_count == 0); ucp_wireup_process_ack(worker, uuid); } else if (msg->type == UCP_WIREUP_MSG_REQUEST) { ucp_wireup_process_request(worker, msg, uuid, peer_name, address_count, address_list); } else if (msg->type == UCP_WIREUP_MSG_REPLY) { ucp_wireup_process_reply(worker, msg, uuid, address_count, address_list); } else { ucs_bug("invalid wireup message"); } ucs_free(address_list); out: UCS_ASYNC_UNBLOCK(&worker->async); return UCS_OK; }
/**
 * Tear down the transport endpoints of an endpoint and remove it from the
 * worker's endpoint hash.
 *
 * Each of next_ep and wireup_ep that is set is flushed - progressing the
 * worker until the flush returns UCS_OK - and then destroyed. The hash
 * removal is done with the worker's async context blocked.
 *
 * @param ep  Endpoint whose wireup state is torn down.
 */
void ucp_ep_wireup_stop(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;

    ucs_trace_func("ep=%p", ep);

    if (ep->uct.next_ep != NULL) {
        /* Keep progressing the worker until the flush goes through */
        for (;;) {
            if (uct_ep_flush(ep->uct.next_ep) == UCS_OK) {
                break;
            }
            ucp_worker_progress(worker);
        }
        uct_ep_destroy(ep->uct.next_ep);
    }

    if (ep->wireup_ep != NULL) {
        for (;;) {
            if (uct_ep_flush(ep->wireup_ep) == UCS_OK) {
                break;
            }
            ucp_worker_progress(worker);
        }
        uct_ep_destroy(ep->wireup_ep);
    }

    UCS_ASYNC_BLOCK(&worker->async);
    sglib_hashed_ucp_ep_t_delete(worker->ep_hash, ep);
    UCS_ASYNC_UNBLOCK(&worker->async);
}
/**
 * Start connection establishment ("wireup") of an endpoint.
 *
 * Registers the endpoint in the worker's hash under the remote UUID and
 * selects a transport for short active messages. If that transport supports
 * UCT_IFACE_FLAG_CONNECT_TO_IFACE the endpoint is connected right away.
 * Otherwise a second transport is selected to carry the wireup messages, a
 * dummy endpoint is installed as ep->uct.ep, the real transport endpoint is
 * created as ep->uct.next_ep, and a CONN_REQ is sent over the wireup
 * endpoint. Runs with the worker's async context blocked.
 *
 * @param ep       Endpoint to wire up.
 * @param address  Remote address.
 *
 * @return UCS_OK on success. On failure the endpoints created so far are
 *         destroyed, the endpoint is removed from the hash and the failing
 *         status is returned (UCS_ERR_UNREACHABLE when the selected transport
 *         supports neither connection mode).
 */
ucs_status_t ucp_ep_wireup_start(ucp_ep_h ep, ucp_address_t *address)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t am_dst_rsc, aux_rsc, aux_dst_rsc;
    ucp_rsc_index_t aux_dst_pd;
    struct sockaddr *am_addr;
    struct sockaddr *aux_addr;
    uct_iface_attr_t *am_attr;
    uct_iface_h am_iface;
    ucs_status_t status;

    UCS_ASYNC_BLOCK(&worker->async);

    ep->dest_uuid = ucp_address_uuid(address);
    sglib_hashed_ucp_ep_t_add(worker->ep_hash, ep);

    ucs_debug("connecting 0x%"PRIx64"->0x%"PRIx64, worker->uuid, ep->dest_uuid);

    /* Pick the transport which will carry active messages */
    status = ucp_pick_best_wireup(worker, address, ucp_am_short_score_func,
                                  &ep->uct.rsc_index, &am_dst_rsc,
                                  &ep->uct.dst_pd_index, &am_addr,
                                  &ep->uct.reachable_pds, "short_am");
    if (status != UCS_OK) {
        ucs_error("No transport for short active message");
        goto err;
    }

    am_iface = worker->ifaces[ep->uct.rsc_index];
    am_attr  = &worker->iface_attrs[ep->uct.rsc_index];

    /* A transport that connects to a remote interface needs no handshake */
    if (am_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
        status = uct_ep_create_connected(am_iface, am_addr, &ep->uct.next_ep);
        if (status != UCS_OK) {
            ucs_debug("failed to create ep");
            goto err;
        }

        ep->state |= UCP_EP_STATE_LOCAL_CONNECTED;
        ucp_ep_remote_connected(ep);
        goto out;
    }

    /* The chosen transport cannot be connected directly, so a second
     * transport is selected to exchange the wireup messages */
    status = ucp_pick_best_wireup(worker, address, ucp_wireup_score_func,
                                  &aux_rsc, &aux_dst_rsc, &aux_dst_pd,
                                  &aux_addr, &ep->uct.reachable_pds, "wireup");
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ep_create_connected(worker->ifaces[aux_rsc], aux_addr,
                                     &ep->wireup_ep);
    if (status != UCS_OK) {
        goto err;
    }

    if ((am_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) == 0) {
        status = UCS_ERR_UNREACHABLE;
        goto err_destroy_wireup_ep;
    }

    /* While the real transport is not connected yet, sends must return
     * NO_RESOURCE; a dummy endpoint is installed to do that */
    status = UCS_CLASS_NEW(ucp_dummy_ep_t, &ep->uct.ep, ep);
    if (status != UCS_OK) {
        goto err_destroy_wireup_ep;
    }

    /* Endpoint of the transport which is being wired up */
    status = uct_ep_create(am_iface, &ep->uct.next_ep);
    if (status != UCS_OK) {
        goto err_destroy_uct_ep;
    }

    /* Ask the remote side to connect the transport */
    status = ucp_ep_wireup_send(ep, ep->wireup_ep, UCP_AM_ID_CONN_REQ,
                                am_dst_rsc);
    if (status != UCS_OK) {
        goto err_destroy_next_ep;
    }

out:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return UCS_OK;

    /* Unwind in reverse order of creation */
err_destroy_next_ep:
    uct_ep_destroy(ep->uct.next_ep);
err_destroy_uct_ep:
    uct_ep_destroy(ep->uct.ep);
err_destroy_wireup_ep:
    uct_ep_destroy(ep->wireup_ep);
err:
    sglib_hashed_ucp_ep_t_delete(worker->ep_hash, ep);
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}
static ucs_status_t ucp_wireup_conn_rep_handler(void *arg, void *data, size_t length, void *desc) { ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; ucs_status_t status; ucp_ep_h ep; ucp_wireup_log(worker, UCP_AM_ID_CONN_REP, msg, 0); UCS_ASYNC_BLOCK(&worker->async); ep = ucp_worker_find_ep(worker, msg->src_uuid); if (ep == NULL) { ucs_debug("ignoring connection request - not exists"); goto out; } if (msg->dst_rsc_index != ep->uct.rsc_index) { ucs_error("got connection reply on a different resource"); goto out; } if (ep->state & UCP_EP_STATE_REMOTE_CONNECTED) { ucs_debug("ignoring conn_rep - remote already connected"); goto out; } /* If we have not connected yet, do it now */ if (!(ep->state & UCP_EP_STATE_LOCAL_CONNECTED)) { status = uct_ep_connect_to_ep(ep->uct.next_ep, (struct sockaddr *)(msg + 1)); if (status != UCS_OK) { ucs_debug("failed to connect"); goto out; } ep->state |= UCP_EP_STATE_LOCAL_CONNECTED; } /* * Send ACK to let remote side know it can start sending. * If we already sent a reply to the remote side, no need to send an ACK. * * We can use the new ep even from async thread, because main thread has not * started using it yet. */ if (!(ep->state & (UCP_EP_STATE_CONN_REP_SENT|UCP_EP_STATE_CONN_ACK_SENT))) { status = ucp_ep_wireup_send(ep, ep->uct.next_ep, UCP_AM_ID_CONN_ACK, msg->src_rsc_index); if (status != UCS_OK) { goto out; } ep->state |= UCP_EP_STATE_CONN_ACK_SENT; } /* * If we got CONN_REP, it means the remote side got our address and connected * to us. */ ucp_ep_remote_connected(ep); out: UCS_ASYNC_UNBLOCK(&worker->async); return UCS_OK; }
/**
 * Constructor of a client-side RDMACM endpoint.
 *
 * Builds the private-data buffer (a header holding the user data length,
 * followed by a copy of the user data), stores the remote address, and then
 * either starts address resolution right away - when the interface has no
 * active endpoint - or appends the endpoint to the interface's list of
 * pending endpoints.
 *
 * @param tl_iface   Interface to create the endpoint on; must not be a server
 *                   interface.
 * @param sockaddr   Remote address; AF_INET and AF_INET6 are accepted.
 * @param priv_data  User private data to be copied after the header.
 * @param length     Length of @a priv_data in bytes.
 *
 * @return UCS_OK on success, UCS_ERR_UNSUPPORTED on a server interface,
 *         UCS_ERR_NO_MEMORY if the private-data buffer cannot be allocated,
 *         UCS_ERR_IO_ERROR for an unknown address family, or the status of
 *         uct_rdmacm_ep_resolve_addr(). On failure the interface does not
 *         keep a reference to this endpoint.
 */
static UCS_CLASS_INIT_FUNC(uct_rdmacm_ep_t, uct_iface_t *tl_iface,
                           const ucs_sock_addr_t *sockaddr,
                           const void *priv_data, size_t length)
{
    uct_rdmacm_iface_t *iface = ucs_derived_of(tl_iface, uct_rdmacm_iface_t);
    char ip_port_str[UCS_SOCKADDR_STRING_LEN];
    uct_rdmacm_priv_data_hdr_t hdr;
    ucs_status_t status;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);

    if (iface->is_server) {
        /* TODO allow an interface to be used both for server and client */
        return UCS_ERR_UNSUPPORTED;
    }

    /* Initialize these fields before calling rdma_resolve_addr to avoid a race
     * where they are used before being initialized (from the async thread
     * - after an RDMA_CM_EVENT_ROUTE_RESOLVED event) */
    hdr.length      = length;
    self->priv_data = ucs_malloc(sizeof(hdr) + length, "client private data");
    if (self->priv_data == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    /* Buffer layout: header first, user data right behind it */
    memcpy(self->priv_data, &hdr, sizeof(hdr));
    memcpy(self->priv_data + sizeof(hdr), priv_data, length);

    /* Save the remote address */
    if (sockaddr->addr->sa_family == AF_INET) {
        memcpy(&self->remote_addr, sockaddr->addr, sizeof(struct sockaddr_in));
    } else if (sockaddr->addr->sa_family == AF_INET6) {
        memcpy(&self->remote_addr, sockaddr->addr, sizeof(struct sockaddr_in6));
    } else {
        ucs_error("rdmacm ep: unknown remote sa_family=%d",
                  sockaddr->addr->sa_family);
        status = UCS_ERR_IO_ERROR;
        goto err_free_mem;
    }

    self->slow_prog_id = UCS_CALLBACKQ_ID_NULL;

    /* The interface can point at one endpoint at a time and therefore, the
     * connection establishment cannot be done in parallel for several
     * endpoints */
    /* TODO support connection establishment on parallel endpoints on the same
     * iface */
    if (iface->ep == NULL) {
        iface->ep           = self;
        self->is_on_pending = 0;

        /* After rdma_resolve_addr(), the client will wait for an
         * RDMA_CM_EVENT_ADDR_RESOLVED event on the event_channel
         * to proceed with the connection establishment.
         * This event will be retrieved from the event_channel by the async
         * thread. All endpoints share the interface's event_channel but can
         * use it serially. */
        status = uct_rdmacm_ep_resolve_addr(self);
        if (status != UCS_OK) {
            goto err_detach_iface;
        }
    } else {
        /* Add the ep to the pending queue */
        UCS_ASYNC_BLOCK(iface->super.worker->async);
        ucs_list_add_tail(&iface->pending_eps_list, &self->list_elem);
        self->is_on_pending = 1;
        UCS_ASYNC_UNBLOCK(iface->super.worker->async);
    }

    ucs_debug("created an RDMACM endpoint on iface %p. event_channel: %p, "
              "iface cm_id: %p remote addr: %s",
              iface, iface->event_ch, iface->cm_id,
              ucs_sockaddr_str((struct sockaddr *)sockaddr->addr,
                               ip_port_str, UCS_SOCKADDR_STRING_LEN));
    return UCS_OK;

err_detach_iface:
    /* Construction is being aborted, so the iface must not keep pointing at
     * this ep. Put back the NULL that was observed before the ep was
     * installed, which lets a later endpoint use the iface. */
    UCS_ASYNC_BLOCK(iface->super.worker->async);
    if (iface->ep == self) {
        iface->ep = NULL;
    }
    UCS_ASYNC_UNBLOCK(iface->super.worker->async);
err_free_mem:
    ucs_free(self->priv_data);
err:
    return status;
}