/*
 * Validate application-supplied hints against this provider's limits.
 * Returns 0 when the hints are acceptable (or absent); -FI_ENODATA or the
 * per-EP-type verifier's error otherwise.
 *
 * NOTE(review): the function is truncated at the end of this chunk — the
 * fabric checks and final return lie past the visible source, so 'fabric'
 * appears unused here.
 */
int sock_verify_info(struct fi_info *hints)
{
	uint64_t caps;
	enum fi_ep_type ep_type;
	int ret;
	struct sock_domain *domain;
	struct sock_fabric *fabric;	/* used past the visible end of this chunk */

	/* No hints means "anything goes". */
	if (!hints)
		return 0;

	/* Dispatch to the EP-type specific attribute verifier and pick the
	 * capability set the caps check below is measured against. */
	ep_type = hints->ep_attr ? hints->ep_attr->type : FI_EP_UNSPEC;
	switch (ep_type) {
	case FI_EP_UNSPEC:
	case FI_EP_MSG:
		caps = SOCK_EP_MSG_CAP;
		ret = sock_msg_verify_ep_attr(hints->ep_attr,
					      hints->tx_attr,
					      hints->rx_attr);
		break;
	case FI_EP_DGRAM:
		caps = SOCK_EP_DGRAM_CAP;
		ret = sock_dgram_verify_ep_attr(hints->ep_attr,
						hints->tx_attr,
						hints->rx_attr);
		break;
	case FI_EP_RDM:
		caps = SOCK_EP_RDM_CAP;
		ret = sock_rdm_verify_ep_attr(hints->ep_attr,
					      hints->tx_attr,
					      hints->rx_attr);
		break;
	default:
		ret = -FI_ENODATA;
	}
	if (ret)
		return ret;

	/* Requested caps must be a subset of what this EP type supports. */
	if ((caps | hints->caps) != caps) {
		SOCK_LOG_DBG("Unsupported capabilities\n");
		return -FI_ENODATA;
	}

	/* Only IPv4-style address formats are handled by this provider. */
	switch (hints->addr_format) {
	case FI_FORMAT_UNSPEC:
	case FI_SOCKADDR:
	case FI_SOCKADDR_IN:
		break;
	default:
		return -FI_ENODATA;
	}

	/* If the caller pinned a specific domain, it must be one of ours. */
	if (hints->domain_attr && hints->domain_attr->domain) {
		domain = container_of(hints->domain_attr->domain,
				      struct sock_domain, dom_fid);
		if (!sock_dom_check_list(domain)) {
			SOCK_LOG_DBG("no matching domain\n");
			return -FI_ENODATA;
		}
	}
static void *_sock_conn_listen(void *arg) { int conn_fd, ret; char tmp; socklen_t addr_size; struct sockaddr_in remote; struct pollfd poll_fds[2]; struct sock_ep_attr *ep_attr = (struct sock_ep_attr *)arg; struct sock_conn_listener *listener = &ep_attr->listener; struct sock_conn_map *map = &ep_attr->cmap; poll_fds[0].fd = listener->sock; poll_fds[1].fd = listener->signal_fds[1]; poll_fds[0].events = poll_fds[1].events = POLLIN; listener->is_ready = 1; while (listener->do_listen) { if (poll(poll_fds, 2, -1) > 0) { if (poll_fds[1].revents & POLLIN) { ret = ofi_read_socket(listener->signal_fds[1], &tmp, 1); if (ret != 1) { SOCK_LOG_ERROR("Invalid signal\n"); goto err; } continue; } } else { goto err; } addr_size = sizeof(remote); conn_fd = accept(listener->sock, (struct sockaddr *) &remote, &addr_size); SOCK_LOG_DBG("CONN: accepted conn-req: %d\n", conn_fd); if (conn_fd < 0) { SOCK_LOG_ERROR("failed to accept: %s\n", strerror(errno)); goto err; } SOCK_LOG_DBG("ACCEPT: %s, %d\n", inet_ntoa(remote.sin_addr), ntohs(remote.sin_port)); fastlock_acquire(&map->lock); sock_conn_map_insert(ep_attr, &remote, conn_fd, 1); fastlock_release(&map->lock); sock_pe_signal(ep_attr->domain->pe); } err: ofi_close_socket(listener->sock); SOCK_LOG_DBG("Listener thread exited\n"); return NULL; }
ssize_t sock_conn_send_src_addr(struct sock_ep_attr *ep_attr, struct sock_tx_ctx *tx_ctx, struct sock_conn *conn) { int ret; uint64_t total_len; struct sock_op tx_op = { 0 }; tx_op.op = SOCK_OP_CONN_MSG; SOCK_LOG_DBG("New conn msg on TX: %p using conn: %p\n", tx_ctx, conn); total_len = 0; tx_op.src_iov_len = sizeof(struct sockaddr_in); total_len = tx_op.src_iov_len + sizeof(struct sock_op_send); sock_tx_ctx_start(tx_ctx); if (ofi_rbavail(&tx_ctx->rb) < total_len) { ret = -FI_EAGAIN; goto err; } sock_tx_ctx_write_op_send(tx_ctx, &tx_op, 0, (uintptr_t) NULL, 0, 0, ep_attr, conn); sock_tx_ctx_write(tx_ctx, ep_attr->src_addr, sizeof(struct sockaddr_in)); sock_tx_ctx_commit(tx_ctx); conn->address_published = 1; return 0; err: sock_tx_ctx_abort(tx_ctx); return ret; }
/*
 * Allocate an rx_entry with "len" bytes of in-line buffer space for
 * buffered (unexpected) receive data.  The payload buffer is placed
 * immediately after the entry struct itself.
 *
 * Returns the new entry (linked onto rx_ctx->rx_buffered_list), or NULL
 * on allocation failure or when accepting "len" more bytes would exceed
 * the configured total_buffered_recv limit.
 *
 * Fixes vs. original: the limit check only logged the violation and then
 * allocated anyway, letting buffered_len grow unbounded past the
 * attribute — it now returns NULL (callers already handle NULL since
 * calloc can fail).  Also removed a duplicated is_busy store and used
 * %zu for the size_t argument in the debug log.
 */
struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx,
						 size_t len)
{
	struct sock_rx_entry *rx_entry;

	if (rx_ctx->buffered_len + len >= rx_ctx->attr.total_buffered_recv) {
		SOCK_LOG_ERROR("Exceeded buffered recv limit\n");
		return NULL;
	}

	rx_entry = calloc(1, sizeof(*rx_entry) + len);
	if (!rx_entry)
		return NULL;

	SOCK_LOG_DBG("New buffered entry:%p len: %zu, ctx: %p\n",
		     rx_entry, len, rx_ctx);

	rx_entry->is_busy = 1;
	rx_entry->is_buffered = 1;
	rx_entry->rx_op.dest_iov_len = 1;
	rx_entry->iov[0].iov.len = len;
	/* Data area starts right after the entry header. */
	rx_entry->iov[0].iov.addr = (uintptr_t) (rx_entry + 1);
	rx_entry->total_len = len;

	rx_ctx->buffered_len += len;
	dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_buffered_list);
	rx_entry->is_tagged = 0;
	return rx_entry;
}
/*
 * Buffered send: stage "buf" into the per-connection ring buffer, or
 * bypass the buffer entirely for payloads larger than the cache size.
 * Returns the number of bytes accepted (possibly 0 when the buffer is
 * full and could not be flushed).
 *
 * Fix vs. original: the debug log used %lu for a ssize_t argument
 * (undefined behavior per C99 fprintf); use %zd.
 */
ssize_t sock_comm_send(struct sock_pe_entry *pe_entry,
		       const void *buf, size_t len)
{
	ssize_t ret, used;

	if (len > pe_entry->cache_sz) {
		/* Large message: only send directly once everything that
		 * was buffered has been flushed, to preserve ordering. */
		used = ofi_rbused(&pe_entry->comm_buf);
		if (used == sock_comm_flush(pe_entry)) {
			return sock_comm_send_socket(pe_entry->conn, buf, len);
		} else {
			return 0;
		}
	}

	/* Make room for the payload; give up (0 bytes) if flush stalls. */
	if (ofi_rbavail(&pe_entry->comm_buf) < len) {
		ret = sock_comm_flush(pe_entry);
		if (ret <= 0)
			return 0;
	}

	ret = MIN(ofi_rbavail(&pe_entry->comm_buf), len);
	ofi_rbwrite(&pe_entry->comm_buf, buf, ret);
	ofi_rbcommit(&pe_entry->comm_buf);
	SOCK_LOG_DBG("buffered %zd\n", ret);
	return ret;
}
/*
 * Non-destructive peek at incoming data (MSG_PEEK).  Returns the number
 * of bytes available (0 on would-block or error); a 0 return from the
 * socket itself marks the connection as disconnected.
 *
 * Fix vs. original: debug log used %lu for a ssize_t argument (UB per
 * C99 fprintf); use %zd.
 */
ssize_t sock_comm_peek(struct sock_conn *conn, void *buf, size_t len)
{
	ssize_t ret;

	ret = ofi_recv_socket(conn->sock_fd, buf, len, MSG_PEEK);
	if (ret == 0) {
		/* Orderly shutdown by the peer. */
		conn->connected = 0;
		SOCK_LOG_DBG("Disconnected\n");
		return ret;
	}

	if (ret < 0) {
		/* Treat errors (incl. EAGAIN) as "nothing available". */
		SOCK_LOG_DBG("peek %s\n", strerror(ofi_sockerr()));
		ret = 0;
	}

	if (ret > 0)
		SOCK_LOG_DBG("peek from network: %zd\n", ret);
	return ret;
}
fi_addr_t sock_av_lookup_key(struct sock_av *av, int key) { int i; struct sock_av_addr *av_addr; struct sock_conn_map *cmap; cmap = av->cmap; for (i = 0; i < av->table_hdr->stored; i++) { av_addr = &av->table[i]; if (sock_compare_addr(&cmap->table[key].addr, (struct sockaddr_in*)&av_addr->addr)) { SOCK_LOG_DBG("LOOKUP: (%d->%d)\n", key, i); return i; } } SOCK_LOG_DBG("Reverse-LOOKUP failed: %d, %s:%d\n", key, inet_ntoa(cmap->table[key].addr.sin_addr), ntohs(cmap->table[key].addr.sin_port)); return FI_ADDR_NOTAVAIL; }
/*
 * Blocking-style receive from the connection's socket.  Returns bytes
 * read (0 on would-block or error); a 0 return from the socket marks the
 * connection as disconnected.
 *
 * Fix vs. original: debug log used %lu for a ssize_t argument (UB per
 * C99 fprintf); use %zd.
 */
static ssize_t sock_comm_recv_socket(struct sock_conn *conn,
				     void *buf, size_t len)
{
	ssize_t ret;

	ret = ofi_recv_socket(conn->sock_fd, buf, len, 0);
	if (ret == 0) {
		conn->connected = 0;
		SOCK_LOG_DBG("Disconnected: %s:%d\n",
			     inet_ntoa(conn->addr.sin_addr),
			     ntohs(conn->addr.sin_port));
		return ret;
	}

	if (ret < 0) {
		/* Normalize errors (incl. EAGAIN) to "no data". */
		SOCK_LOG_DBG("read %s\n", strerror(ofi_sockerr()));
		ret = 0;
	}

	if (ret > 0)
		SOCK_LOG_DBG("read from network: %zd\n", ret);
	return ret;
}
int sock_av_get_addr_index(struct sock_av *av, struct sockaddr_in *addr) { int i; struct sock_av_addr *av_addr; for (i = 0; i < av->table_hdr->stored; i++) { av_addr = &av->table[i]; if (ofi_equals_sockaddr(addr, (struct sockaddr_in *)&av_addr->addr)) return i; } SOCK_LOG_DBG("failed to get index in AV\n"); return -1; }
/*
 * Synchronous EQ read.  Waits up to "timeout" ms for an event, copies it
 * into "buf", and (unless FI_PEEK is set) consumes it from the queue.
 * Returns the event length, -FI_EAGAIN when nothing is available within
 * the timeout, -FI_EAVAIL when an error event is pending, or
 * -FI_ETOOSMALL when the caller's buffer cannot hold the entry.
 */
static ssize_t sock_eq_sread(struct fid_eq *eq, uint32_t *event, void *buf,
			     size_t len, int timeout, uint64_t flags)
{
	int ret;
	struct sock_eq *sock_eq;
	struct dlist_entry *list;
	struct sock_eq_entry *entry;

	sock_eq = container_of(eq, struct sock_eq, eq);
	/* Drop stale err_data entries first, then report pending errors
	 * ahead of normal events. */
	sock_eq_clean_err_data_list(sock_eq, 0);
	if (!dlistfd_empty(&sock_eq->err_list)) {
		return -FI_EAVAIL;
	}

	if (dlistfd_empty(&sock_eq->list)) {
		/* Zero timeout = pure poll: do not block. */
		if (!timeout) {
			SOCK_LOG_DBG("Nothing to read from eq!\n");
			return -FI_EAGAIN;
		}

		ret = dlistfd_wait_avail(&sock_eq->list, timeout);
		/* An error event may have arrived while we waited; it
		 * takes precedence over the wait result. */
		if (!dlistfd_empty(&sock_eq->err_list)) {
			return -FI_EAVAIL;
		}
		if (ret <= 0)
			return (ret == 0 || ret == -FI_ETIMEDOUT) ?
				-FI_EAGAIN : ret;
	}

	fastlock_acquire(&sock_eq->lock);
	/* Head of the event list is the oldest entry. */
	list = sock_eq->list.list.next;
	entry = container_of(list, struct sock_eq_entry, entry);

	if (entry->len > len) {
		ret = -FI_ETOOSMALL;
		goto out;
	}

	ret = entry->len;
	*event = entry->type;
	memcpy(buf, entry->event, entry->len);

	/* FI_PEEK leaves the entry queued for a later read. */
	if (!(flags & FI_PEEK)) {
		dlistfd_remove(list, &sock_eq->list);
		free(entry);
	}

out:
	fastlock_release(&sock_eq->lock);
	/* Map "empty"/timeout style results onto -FI_EAGAIN. */
	return (ret == 0 || ret == -FI_ETIMEDOUT) ? -FI_EAGAIN : ret;
}
/*
 * Raw send on the connection's socket (MSG_NOSIGNAL so a dead peer
 * yields EPIPE instead of SIGPIPE).  Returns bytes written, 0 on
 * would-block, or the negative socket error; EPIPE additionally marks
 * the connection as disconnected.
 *
 * Fix vs. original: debug log used %lu for a ssize_t argument (UB per
 * C99 fprintf); use %zd.
 */
static ssize_t sock_comm_send_socket(struct sock_conn *conn,
				     const void *buf, size_t len)
{
	ssize_t ret;

	ret = ofi_send_socket(conn->sock_fd, buf, len, MSG_NOSIGNAL);
	if (ret < 0) {
		if (OFI_SOCK_TRY_SND_RCV_AGAIN(ofi_sockerr())) {
			/* Transient: report "nothing sent". */
			ret = 0;
		} else if (ofi_sockerr() == EPIPE) {
			conn->connected = 0;
			SOCK_LOG_DBG("Disconnected: %s:%d\n",
				     inet_ntoa(conn->addr.sin_addr),
				     ntohs(conn->addr.sin_port));
		} else {
			SOCK_LOG_DBG("write error: %s\n",
				     strerror(ofi_sockerr()));
		}
	}

	if (ret > 0)
		SOCK_LOG_DBG("wrote to network: %zd\n", ret);
	return ret;
}
/*
 * Buffered receive: serve "len" bytes from the connection's ring buffer,
 * refilling it from the socket for small reads or bypassing it entirely
 * for reads larger than the cache size.  Returns bytes delivered.
 *
 * Fix vs. original: debug log used %lu for a ssize_t argument (UB per
 * C99 fprintf); use %zd.
 */
ssize_t sock_comm_recv(struct sock_pe_entry *pe_entry, void *buf, size_t len)
{
	ssize_t read_len;

	if (ofi_rbempty(&pe_entry->comm_buf)) {
		if (len <= pe_entry->cache_sz) {
			/* Small read: refill the staging buffer first. */
			sock_comm_recv_buffer(pe_entry);
		} else {
			/* Large read: skip the buffer, go to the socket. */
			return sock_comm_recv_socket(pe_entry->conn, buf, len);
		}
	}

	read_len = MIN(len, ofi_rbused(&pe_entry->comm_buf));
	ofi_rbread(&pe_entry->comm_buf, buf, read_len);
	SOCK_LOG_DBG("read from buffer: %zd\n", read_len);
	return read_len;
}
/* FIXME: pool of rx_entry */
/*
 * Allocate a fresh, zeroed rx_entry for the given RX context and charge
 * it against the context's remaining-entries budget (num_left).
 * Returns NULL on allocation failure.
 */
struct sock_rx_entry *sock_rx_new_entry(struct sock_rx_ctx *rx_ctx)
{
	struct sock_rx_entry *entry = calloc(1, sizeof(*entry));

	if (!entry)
		return NULL;

	entry->is_tagged = 0;
	SOCK_LOG_DBG("New rx_entry: %p, ctx: %p\n", entry, rx_ctx);
	dlist_init(&entry->entry);

	/* Account for the new entry under the context lock. */
	fastlock_acquire(&rx_ctx->lock);
	rx_ctx->num_left--;
	fastlock_release(&rx_ctx->lock);

	return entry;
}
static int sock_rdm_verify_rx_attr(const struct fi_rx_attr *attr) { if (!attr) return 0; if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) { SOCK_LOG_DBG("Unsupported RDM rx caps\n"); return -FI_ENODATA; } if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) { SOCK_LOG_DBG("Unsuported rx message order\n"); return -FI_ENODATA; } if ((attr->comp_order | SOCK_EP_COMP_ORDER) != SOCK_EP_COMP_ORDER) { SOCK_LOG_DBG("Unsuported rx completion order\n"); return -FI_ENODATA; } if (attr->total_buffered_recv > sock_rdm_rx_attr.total_buffered_recv) { SOCK_LOG_DBG("Buffered receive size too large\n"); return -FI_ENODATA; } if (attr->size > sock_rdm_rx_attr.size) { SOCK_LOG_DBG("Rx size too large\n"); return -FI_ENODATA; } if (attr->iov_limit > sock_rdm_rx_attr.iov_limit) { SOCK_LOG_DBG("Rx iov limit too large\n"); return -FI_ENODATA; } return 0; }
static int sock_rdm_verify_tx_attr(const struct fi_tx_attr *attr) { if (!attr) return 0; if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) { SOCK_LOG_DBG("Unsupported RDM tx caps\n"); return -FI_ENODATA; } if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) { SOCK_LOG_DBG("Unsupported tx message order\n"); return -FI_ENODATA; } if (attr->inject_size > sock_rdm_tx_attr.inject_size) { SOCK_LOG_DBG("Inject size too large\n"); return -FI_ENODATA; } if (attr->size > sock_rdm_tx_attr.size) { SOCK_LOG_DBG("Tx size too large\n"); return -FI_ENODATA; } if (attr->iov_limit > sock_rdm_tx_attr.iov_limit) { SOCK_LOG_DBG("Tx iov limit too large\n"); return -FI_ENODATA; } if (attr->rma_iov_limit > sock_rdm_tx_attr.rma_iov_limit) { SOCK_LOG_DBG("RMA iov limit too large\n"); return -FI_ENODATA; } return 0; }
/*
 * Close a regular or scalable endpoint.  Refuses (-FI_EBUSY) while any
 * reference, RX context, or TX context is still outstanding.  For an
 * alias, only the underlying attr's refcount is dropped.  Otherwise the
 * CM listener thread is stopped, the endpoint is unlinked from its AV,
 * shared contexts, EQ, and progress engine, and all owned resources are
 * released.
 */
static int sock_ep_close(struct fid *fid)
{
	struct sock_ep *sock_ep;
	char c = 0;	/* single wakeup byte for the listener's signal pipe */

	switch (fid->fclass) {
	case FI_CLASS_EP:
		sock_ep = container_of(fid, struct sock_ep, ep.fid);
		break;
	case FI_CLASS_SEP:
		sock_ep = container_of(fid, struct sock_ep, ep.fid);
		break;
	default:
		return -FI_EINVAL;
	}

	/* Aliases share the attr; just drop the shared refcount. */
	if (sock_ep->is_alias) {
		ofi_atomic_dec32(&sock_ep->attr->ref);
		return 0;
	}
	if (ofi_atomic_get32(&sock_ep->attr->ref) ||
	    ofi_atomic_get32(&sock_ep->attr->num_rx_ctx) ||
	    ofi_atomic_get32(&sock_ep->attr->num_tx_ctx))
		return -FI_EBUSY;

	if (sock_ep->attr->ep_type == FI_EP_MSG) {
		/* Stop and join the CM listener thread: clear the flag,
		 * poke its signal pipe, then join. */
		sock_ep->attr->cm.do_listen = 0;
		if (ofi_write_socket(sock_ep->attr->cm.signal_fds[0],
				     &c, 1) != 1)
			SOCK_LOG_DBG("Failed to signal\n");
		if (sock_ep->attr->cm.listener_thread &&
		    pthread_join(sock_ep->attr->cm.listener_thread, NULL)) {
			SOCK_LOG_ERROR("pthread join failed (%d)\n",
				       ofi_syserr());
		}
		ofi_close_socket(sock_ep->attr->cm.signal_fds[0]);
		ofi_close_socket(sock_ep->attr->cm.signal_fds[1]);
	} else {
		if (sock_ep->attr->av)
			ofi_atomic_dec32(&sock_ep->attr->av->ref);
	}

	/* Unlink this EP from the AV's endpoint list. */
	if (sock_ep->attr->av) {
		fastlock_acquire(&sock_ep->attr->av->list_lock);
		fid_list_remove(&sock_ep->attr->av->ep_list,
				&sock_ep->attr->lock, &sock_ep->ep.fid);
		fastlock_release(&sock_ep->attr->av->list_lock);
	}

	/* Detach from shared TX/RX contexts under the PE list lock. */
	pthread_mutex_lock(&sock_ep->attr->domain->pe->list_lock);
	if (sock_ep->attr->tx_shared) {
		fastlock_acquire(&sock_ep->attr->tx_ctx->lock);
		dlist_remove(&sock_ep->attr->tx_ctx_entry);
		fastlock_release(&sock_ep->attr->tx_ctx->lock);
	}
	if (sock_ep->attr->rx_shared) {
		fastlock_acquire(&sock_ep->attr->rx_ctx->lock);
		dlist_remove(&sock_ep->attr->rx_ctx_entry);
		fastlock_release(&sock_ep->attr->rx_ctx->lock);
	}
	pthread_mutex_unlock(&sock_ep->attr->domain->pe->list_lock);

	/* Tear down the connection-listen socket, removing it from the
	 * domain's epoll set first. */
	if (sock_ep->attr->conn_handle.do_listen) {
		fastlock_acquire(&sock_ep->attr->domain->conn_listener.signal_lock);
		fi_epoll_del(sock_ep->attr->domain->conn_listener.emap,
			     sock_ep->attr->conn_handle.sock);
		fastlock_release(&sock_ep->attr->domain->conn_listener.signal_lock);
		ofi_close_socket(sock_ep->attr->conn_handle.sock);
		sock_ep->attr->conn_handle.do_listen = 0;
	}

	fastlock_destroy(&sock_ep->attr->cm.lock);

	/* Drop any EQ events still queued for this endpoint. */
	if (sock_ep->attr->eq) {
		fastlock_acquire(&sock_ep->attr->eq->lock);
		sock_ep_clear_eq_list(&sock_ep->attr->eq->list, &sock_ep->ep);
		/* Any err_data if present would be freed by
		 * sock_eq_clean_err_data_list when EQ is closed */
		sock_ep_clear_eq_list(&sock_ep->attr->eq->err_list,
				      &sock_ep->ep);
		fastlock_release(&sock_ep->attr->eq->lock);
	}

	/* Non-scalable EPs own their implicit TX/RX context [0]. */
	if (sock_ep->attr->fclass != FI_CLASS_SEP) {
		if (!sock_ep->attr->tx_shared)
			sock_pe_remove_tx_ctx(sock_ep->attr->tx_array[0]);
		sock_tx_ctx_close(sock_ep->attr->tx_array[0]);
		sock_tx_ctx_free(sock_ep->attr->tx_array[0]);
	}
	if (sock_ep->attr->fclass != FI_CLASS_SEP) {
		if (!sock_ep->attr->rx_shared)
			sock_pe_remove_rx_ctx(sock_ep->attr->rx_array[0]);
		sock_rx_ctx_close(sock_ep->attr->rx_array[0]);
		sock_rx_ctx_free(sock_ep->attr->rx_array[0]);
	}

	free(sock_ep->attr->tx_array);
	free(sock_ep->attr->rx_array);

	if (sock_ep->attr->src_addr)
		free(sock_ep->attr->src_addr);
	if (sock_ep->attr->dest_addr)
		free(sock_ep->attr->dest_addr);

	/* Destroy the connection map while the PE is quiesced. */
	fastlock_acquire(&sock_ep->attr->domain->pe->lock);
	ofi_idm_reset(&sock_ep->attr->av_idm);
	sock_conn_map_destroy(sock_ep->attr);
	fastlock_release(&sock_ep->attr->domain->pe->lock);

	ofi_atomic_dec32(&sock_ep->attr->domain->ref);
	fastlock_destroy(&sock_ep->attr->lock);
	free(sock_ep->attr);
	free(sock_ep);
	return 0;
}
/*
 * Insert (or, with FI_READ, look up) "count" IPv4 addresses in the AV
 * table.  For read-only shared AVs the addresses are matched against the
 * existing table; otherwise they are appended, growing the table (shared
 * mremap or realloc) when full, unless a fixed size was requested
 * (req_sz).  Per-address failures are reported via sock_av_report_error
 * and that address is skipped.  Returns the success count, or 0 when the
 * AV reports completions asynchronously via FI_EVENT.
 */
static int sock_check_table_in(struct sock_av *_av, struct sockaddr_in *addr,
			       fi_addr_t *fi_addr, int count, uint64_t flags,
			       void *context, int index)
{
	void *new_addr;
	int i, j, ret = 0;
	char sa_ip[INET_ADDRSTRLEN];
	struct sock_av_addr *av_addr;
	size_t new_count, table_sz, old_sz;

	/* FI_EVENT completions require a bound EQ. */
	if ((_av->attr.flags & FI_EVENT) && !_av->eq)
		return -FI_ENOEQ;

	if (_av->attr.flags & FI_READ) {
		/* Read-only shared AV: only match, never insert. */
		for (i = 0; i < count; i++) {
			for (j = 0; j < _av->table_hdr->stored; j++) {
				/* NOTE(review): this validity check sits inside
				 * the inner loop, so an invalid addr[i] is
				 * reported once per stored entry rather than
				 * once per input — looks unintended; confirm
				 * against upstream before changing. */
				if (!sock_av_is_valid_address(&addr[i])) {
					if (fi_addr)
						fi_addr[i] = FI_ADDR_NOTAVAIL;
					sock_av_report_error(_av, context, i,
							     FI_EINVAL);
					continue;
				}
				av_addr = &_av->table[j];
				if (memcmp(&av_addr->addr, &addr[i],
					   sizeof(struct sockaddr_in)) == 0) {
					SOCK_LOG_DBG("Found addr in shared av\n");
					if (fi_addr)
						fi_addr[i] = (fi_addr_t)j;
					ret++;
				}
			}
		}
		sock_av_report_success(_av, context, ret, flags);
		return (_av->attr.flags & FI_EVENT) ? 0 : ret;
	}

	for (i = 0, ret = 0; i < count; i++) {
		if (_av->table_hdr->stored == _av->table_hdr->size) {
			if (_av->table_hdr->req_sz) {
				/* Fixed-size AV: cannot grow. */
				if (fi_addr)
					fi_addr[i] = FI_ADDR_NOTAVAIL;
				sock_av_report_error(_av, context, i, FI_ENOSPC);
				SOCK_LOG_ERROR("Cannot insert to AV table\n");
				continue;
			} else {
				/* Double the table: shared maps are grown with
				 * sock_mremap, private ones with realloc. */
				new_count = _av->table_hdr->size * 2;
				table_sz = SOCK_AV_TABLE_SZ(new_count,
							    _av->attr.name);
				old_sz = SOCK_AV_TABLE_SZ(_av->table_hdr->size,
							  _av->attr.name);
				if (_av->attr.name) {
					new_addr = sock_mremap(_av->table_hdr,
							       old_sz, table_sz);
					if (new_addr == MAP_FAILED) {
						if (fi_addr)
							fi_addr[i] =
								FI_ADDR_NOTAVAIL;
						sock_av_report_error(_av,
								     context, i,
								     FI_ENOMEM);
						continue;
					}
					_av->idx_arr[_av->table_hdr->stored] =
						_av->table_hdr->stored;
				} else {
					new_addr = realloc(_av->table_hdr,
							   table_sz);
					if (!new_addr) {
						if (fi_addr)
							fi_addr[i] =
								FI_ADDR_NOTAVAIL;
						sock_av_report_error(_av,
								     context, i,
								     FI_ENOMEM);
						continue;
					}
				}
				_av->table_hdr = new_addr;
				_av->table_hdr->size = new_count;
				/* Recompute table pointers after the move. */
				sock_update_av_table(_av, new_count);
			}
		}

		if (!sock_av_is_valid_address(&addr[i])) {
			if (fi_addr)
				fi_addr[i] = FI_ADDR_NOTAVAIL;
			sock_av_report_error(_av, context, i, FI_EINVAL);
			continue;
		}

		av_addr = &_av->table[_av->table_hdr->stored];
		memcpy(sa_ip, inet_ntoa((&addr[i])->sin_addr), INET_ADDRSTRLEN);
		SOCK_LOG_DBG("AV-INSERT:dst_addr: family: %d, IP is %s, port: %d\n",
			     ((struct sockaddr_in *)&addr[i])->sin_family, sa_ip,
			     ntohs(((struct sockaddr_in *)&addr[i])->sin_port));

		memcpy(&av_addr->addr, &addr[i], sizeof(struct sockaddr_in));
		if (fi_addr)
			fi_addr[i] = (fi_addr_t)_av->table_hdr->stored;

		av_addr->valid = 1;
		_av->table_hdr->stored++;
		ret++;
	}
	sock_av_report_success(_av, context, ret, flags);
	return (_av->attr.flags & FI_EVENT) ? 0 : ret;
}
/*
 * Create an address vector for a domain.  With attr->name the table is
 * backed by a named shared-memory segment (ofi_shm_map) so it can be
 * shared between processes; FI_READ attaches read-only to an existing
 * segment.  Without a name, an anonymous heap table is used.  Returns 0
 * and sets *av on success, a negative fabric errno otherwise.
 */
int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
		 struct fid_av **av, void *context)
{
	int ret = 0;
	struct sock_domain *dom;
	struct sock_av *_av;
	size_t table_sz;

	if (!attr || sock_verify_av_attr(attr))
		return -FI_EINVAL;

	if (attr->type == FI_AV_UNSPEC)
		attr->type = FI_AV_TABLE;

	dom = container_of(domain, struct sock_domain, dom_fid);
	/* AV type must agree with what the domain was opened with. */
	if (dom->attr.av_type != FI_AV_UNSPEC &&
	    dom->attr.av_type != attr->type)
		return -FI_EINVAL;

	_av = calloc(1, sizeof(*_av));
	if (!_av)
		return -FI_ENOMEM;

	_av->attr = *attr;
	_av->attr.count = (attr->count) ? attr->count : sock_av_def_sz;
	table_sz = SOCK_AV_TABLE_SZ(_av->attr.count, attr->name);

	if (attr->name) {
		/* Shared AV: map (or attach to) the named segment. */
		ret = ofi_shm_map(&_av->shm, attr->name, table_sz,
				  attr->flags & FI_READ,
				  (void **)&_av->table_hdr);
		if (ret || _av->table_hdr == MAP_FAILED) {
			SOCK_LOG_ERROR("map failed\n");
			ret = -FI_EINVAL;
			goto err;
		}

		/* Index array lives directly after the header; it is also
		 * exported to the application via map_addr. */
		_av->idx_arr = (uint64_t *)(_av->table_hdr + 1);
		_av->attr.map_addr = _av->idx_arr;
		attr->map_addr = _av->attr.map_addr;
		SOCK_LOG_DBG("Updating map_addr: %p\n", _av->attr.map_addr);

		if (attr->flags & FI_READ) {
			/* Reader must request the same size as the creator. */
			if (_av->table_hdr->size != _av->attr.count) {
				ret = -FI_EINVAL;
				goto err2;
			}
		} else {
			_av->table_hdr->size = _av->attr.count;
			_av->table_hdr->stored = 0;
		}
		_av->shared = 1;
	} else {
		_av->table_hdr = calloc(1, table_sz);
		if (!_av->table_hdr) {
			ret = -FI_ENOMEM;
			goto err;
		}
		_av->table_hdr->size = _av->attr.count;
		/* Non-zero req_sz marks the table as fixed-size. */
		_av->table_hdr->req_sz = attr->count;
	}
	sock_update_av_table(_av, _av->attr.count);

	_av->av_fid.fid.fclass = FI_CLASS_AV;
	_av->av_fid.fid.context = context;
	_av->av_fid.fid.ops = &sock_av_fi_ops;

	switch (attr->type) {
	case FI_AV_MAP:
		_av->av_fid.ops = &sock_am_ops;
		break;
	case FI_AV_TABLE:
		_av->av_fid.ops = &sock_at_ops;
		break;
	default:
		ret = -FI_EINVAL;
		goto err2;
	}

	atomic_initialize(&_av->ref, 0);
	atomic_inc(&dom->ref);
	_av->domain = dom;

	switch (dom->info.addr_format) {
	case FI_SOCKADDR_IN:
		_av->addrlen = sizeof(struct sockaddr_in);
		break;
	default:
		SOCK_LOG_ERROR("Invalid address format: only IPv4 supported\n");
		ret = -FI_EINVAL;
		goto err2;
	}

	_av->rx_ctx_bits = attr->rx_ctx_bits;
	/* High rx_ctx_bits of an fi_addr select the RX context; the mask
	 * extracts the AV index portion. */
	_av->mask = attr->rx_ctx_bits ?
		((uint64_t)1 << (64 - attr->rx_ctx_bits)) - 1 : ~0;
	*av = &_av->av_fid;
	return 0;

err2:
	if (attr->name) {
		ofi_shm_unmap(&_av->shm);
	} else {
		if (_av->table_hdr && _av->table_hdr != MAP_FAILED)
			free(_av->table_hdr);
	}
err:
	free(_av);
	return ret;
}
/*
 * Free an rx_entry previously obtained from sock_rx_new_entry /
 * sock_rx_new_buffered_entry.  The caller must have already unlinked it
 * from any RX-context list — the entry memory (including any in-line
 * buffered payload) is released here.
 */
void sock_rx_release_entry(struct sock_rx_entry *rx_entry)
{
	SOCK_LOG_DBG("Releasing rx_entry: %p\n", rx_entry);
	free(rx_entry);
}
/*
 * Close a regular or scalable endpoint (older provider variant using the
 * atomic_*/idm_* helpers and a per-endpoint conn listener).  Refuses
 * (-FI_EBUSY) while references or TX/RX contexts remain; for aliases
 * only the shared attr refcount is dropped.  Stops the CM and conn
 * listener threads, detaches from shared contexts, and frees all owned
 * resources.
 */
static int sock_ep_close(struct fid *fid)
{
	struct sock_ep *sock_ep;
	char c = 0;	/* single wakeup byte for the signal pipes */

	switch (fid->fclass) {
	case FI_CLASS_EP:
		sock_ep = container_of(fid, struct sock_ep, ep.fid);
		break;
	case FI_CLASS_SEP:
		sock_ep = container_of(fid, struct sock_ep, ep.fid);
		break;
	default:
		return -FI_EINVAL;
	}

	/* Aliases share the attr; just drop the shared refcount. */
	if (sock_ep->is_alias) {
		atomic_dec(&sock_ep->attr->ref);
		return 0;
	}
	if (atomic_get(&sock_ep->attr->ref) ||
	    atomic_get(&sock_ep->attr->num_rx_ctx) ||
	    atomic_get(&sock_ep->attr->num_tx_ctx))
		return -FI_EBUSY;

	if (sock_ep->attr->ep_type == FI_EP_MSG) {
		/* Stop and join the CM thread via its signal pipe. */
		sock_ep->attr->cm.do_listen = 0;
		if (ofi_write_socket(sock_ep->attr->cm.signal_fds[0],
				     &c, 1) != 1)
			SOCK_LOG_DBG("Failed to signal\n");
		if (sock_ep->attr->cm.listener_thread &&
		    pthread_join(sock_ep->attr->cm.listener_thread, NULL)) {
			SOCK_LOG_ERROR("pthread join failed (%d)\n", errno);
		}
		ofi_close_socket(sock_ep->attr->cm.signal_fds[0]);
		ofi_close_socket(sock_ep->attr->cm.signal_fds[1]);
	} else {
		if (sock_ep->attr->av)
			atomic_dec(&sock_ep->attr->av->ref);
	}

	/* Detach from shared TX/RX contexts under the PE list lock. */
	pthread_mutex_lock(&sock_ep->attr->domain->pe->list_lock);
	if (sock_ep->attr->tx_shared) {
		fastlock_acquire(&sock_ep->attr->tx_ctx->lock);
		dlist_remove(&sock_ep->attr->tx_ctx_entry);
		fastlock_release(&sock_ep->attr->tx_ctx->lock);
	}
	if (sock_ep->attr->rx_shared) {
		fastlock_acquire(&sock_ep->attr->rx_ctx->lock);
		dlist_remove(&sock_ep->attr->rx_ctx_entry);
		fastlock_release(&sock_ep->attr->rx_ctx->lock);
	}
	pthread_mutex_unlock(&sock_ep->attr->domain->pe->list_lock);

	/* Stop and join the per-endpoint connection listener thread. */
	if (sock_ep->attr->listener.do_listen) {
		sock_ep->attr->listener.do_listen = 0;
		if (ofi_write_socket(sock_ep->attr->listener.signal_fds[0],
				     &c, 1) != 1)
			SOCK_LOG_DBG("Failed to signal\n");
		if (sock_ep->attr->listener.listener_thread &&
		    pthread_join(sock_ep->attr->listener.listener_thread,
				 NULL)) {
			SOCK_LOG_ERROR("pthread join failed (%d)\n", errno);
		}
		ofi_close_socket(sock_ep->attr->listener.signal_fds[0]);
		ofi_close_socket(sock_ep->attr->listener.signal_fds[1]);
	}

	fastlock_destroy(&sock_ep->attr->cm.lock);

	/* Non-scalable EPs own their implicit TX/RX context [0]. */
	if (sock_ep->attr->fclass != FI_CLASS_SEP) {
		if (!sock_ep->attr->tx_shared)
			sock_pe_remove_tx_ctx(sock_ep->attr->tx_array[0]);
		sock_tx_ctx_close(sock_ep->attr->tx_array[0]);
		sock_tx_ctx_free(sock_ep->attr->tx_array[0]);
	}
	if (sock_ep->attr->fclass != FI_CLASS_SEP) {
		if (!sock_ep->attr->rx_shared)
			sock_pe_remove_rx_ctx(sock_ep->attr->rx_array[0]);
		sock_rx_ctx_close(sock_ep->attr->rx_array[0]);
		sock_rx_ctx_free(sock_ep->attr->rx_array[0]);
	}

	idm_reset(&sock_ep->attr->conn_idm);
	idm_reset(&sock_ep->attr->av_idm);

	free(sock_ep->attr->tx_array);
	free(sock_ep->attr->rx_array);

	if (sock_ep->attr->src_addr)
		free(sock_ep->attr->src_addr);
	if (sock_ep->attr->dest_addr)
		free(sock_ep->attr->dest_addr);

	sock_conn_map_destroy(&sock_ep->attr->cmap);
	atomic_dec(&sock_ep->attr->domain->ref);
	fastlock_destroy(&sock_ep->attr->lock);
	free(sock_ep->attr);
	free(sock_ep);
	return 0;
}
int sock_verify_domain_attr(struct fi_domain_attr *attr) { if (!attr) return 0; if (attr->name) { if (strcmp(attr->name, sock_dom_name)) return -FI_ENODATA; } switch (attr->threading) { case FI_THREAD_UNSPEC: case FI_THREAD_SAFE: case FI_THREAD_FID: case FI_THREAD_DOMAIN: case FI_THREAD_COMPLETION: case FI_THREAD_ENDPOINT: break; default: SOCK_LOG_DBG("Invalid threading model!\n"); return -FI_ENODATA; } switch (attr->control_progress) { case FI_PROGRESS_UNSPEC: case FI_PROGRESS_AUTO: case FI_PROGRESS_MANUAL: break; default: SOCK_LOG_DBG("Control progress mode not supported!\n"); return -FI_ENODATA; } switch (attr->data_progress) { case FI_PROGRESS_UNSPEC: case FI_PROGRESS_AUTO: case FI_PROGRESS_MANUAL: break; default: SOCK_LOG_DBG("Data progress mode not supported!\n"); return -FI_ENODATA; } switch (attr->resource_mgmt) { case FI_RM_UNSPEC: case FI_RM_DISABLED: case FI_RM_ENABLED: break; default: SOCK_LOG_DBG("Resource mgmt not supported!\n"); return -FI_ENODATA; } switch (attr->av_type) { case FI_AV_UNSPEC: case FI_AV_MAP: case FI_AV_TABLE: break; default: SOCK_LOG_DBG("AV type not supported!\n"); return -FI_ENODATA; } switch (attr->mr_mode) { case FI_MR_UNSPEC: case FI_MR_BASIC: case FI_MR_SCALABLE: break; default: SOCK_LOG_DBG("MR mode not supported\n"); return -FI_ENODATA; } if (attr->mr_key_size > sock_domain_attr.mr_key_size) return -FI_ENODATA; if (attr->cq_data_size > sock_domain_attr.cq_data_size) return -FI_ENODATA; if (attr->cq_cnt > sock_domain_attr.cq_cnt) return -FI_ENODATA; if (attr->ep_cnt > sock_domain_attr.ep_cnt) return -FI_ENODATA; if (attr->max_ep_tx_ctx > sock_domain_attr.max_ep_tx_ctx) return -FI_ENODATA; if (attr->max_ep_rx_ctx > sock_domain_attr.max_ep_rx_ctx) return -FI_ENODATA; return 0; }
/*
 * Core send path (older provider variant): resolve the connection for
 * msg->addr, then stage a SOCK_OP_SEND operation plus payload descriptors
 * (or inline data for FI_INJECT) into the TX ring for the progress engine.
 * Returns 0 on success, -FI_EAGAIN when the CQ or TX ring is full, or
 * another negative fabric errno.
 */
ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg,
			uint64_t flags)
{
	int ret, i;
	uint64_t total_len;
	struct sock_op tx_op;
	union sock_iov tx_iov;
	struct sock_conn *conn;
	struct sock_tx_ctx *tx_ctx;
	struct sock_ep *sock_ep;

	/* The fid may be an endpoint or a standalone TX context. */
	switch (ep->fid.fclass) {
	case FI_CLASS_EP:
		sock_ep = container_of(ep, struct sock_ep, ep);
		tx_ctx = sock_ep->tx_ctx;
		break;
	case FI_CLASS_TX_CTX:
		tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx);
		sock_ep = tx_ctx->ep;
		break;
	default:
		SOCK_LOG_ERROR("Invalid EP type\n");
		return -FI_EINVAL;
	}

#if ENABLE_DEBUG
	if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT)
		return -FI_EINVAL;
#endif

	if (!tx_ctx->enabled)
		return -FI_EOPBADSTATE;

	/* Testing hook: silently drop the packet when configured. */
	if (sock_drop_packet(sock_ep))
		return 0;

	ret = sock_ep_get_conn(sock_ep, tx_ctx, msg->addr, &conn);
	if (ret)
		return ret;

	SOCK_LOG_DBG("New sendmsg on TX: %p using conn: %p\n", tx_ctx, conn);

	SOCK_EP_SET_TX_OP_FLAGS(flags);
	if (flags & SOCK_USE_OP_FLAGS)
		flags |= tx_ctx->attr.op_flags;

	/* Refuse new work if a completion could not be reported. */
	if (sock_ep_is_send_cq_low(&tx_ctx->comp, flags)) {
		SOCK_LOG_ERROR("CQ size low\n");
		return -FI_EAGAIN;
	}

	/* Triggered op: queue for deferred execution.  A return of 1 means
	 * the trigger already fired, so fall through and send now. */
	if (flags & FI_TRIGGER) {
		ret = sock_queue_msg_op(ep, msg, flags, SOCK_OP_SEND);
		if (ret != 1)
			return ret;
	}

	memset(&tx_op, 0, sizeof(struct sock_op));
	tx_op.op = SOCK_OP_SEND;

	/* FI_INJECT copies the payload inline; otherwise only the iov
	 * descriptors go into the ring. */
	total_len = 0;
	if (flags & FI_INJECT) {
		for (i = 0; i < msg->iov_count; i++)
			total_len += msg->msg_iov[i].iov_len;
		if (total_len > SOCK_EP_MAX_INJECT_SZ) {
			ret = -FI_EINVAL;
			goto err;
		}
		tx_op.src_iov_len = total_len;
	} else {
		tx_op.src_iov_len = msg->iov_count;
		total_len = msg->iov_count * sizeof(union sock_iov);
	}

	total_len += sizeof(struct sock_op_send);
	if (flags & FI_REMOTE_CQ_DATA)
		total_len += sizeof(uint64_t);

	sock_tx_ctx_start(tx_ctx);
	if (rbavail(&tx_ctx->rb) < total_len) {
		ret = -FI_EAGAIN;
		goto err;
	}

	sock_tx_ctx_write_op_send(tx_ctx, &tx_op, flags,
				  (uintptr_t) msg->context, msg->addr,
				  (uintptr_t) msg->msg_iov[0].iov_base,
				  sock_ep, conn);

	if (flags & FI_REMOTE_CQ_DATA)
		sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(msg->data));

	if (flags & FI_INJECT) {
		for (i = 0; i < msg->iov_count; i++) {
			sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base,
					  msg->msg_iov[i].iov_len);
		}
	} else {
		for (i = 0; i < msg->iov_count; i++) {
			tx_iov.iov.addr = (uintptr_t) msg->msg_iov[i].iov_base;
			tx_iov.iov.len = msg->msg_iov[i].iov_len;
			sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov));
		}
	}

	sock_tx_ctx_commit(tx_ctx);
	return 0;

err:
	sock_tx_ctx_abort(tx_ctx);
	return ret;
}
int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context) { int ret = 0; struct sock_domain *dom; struct sock_av *_av; size_t table_sz, i; uint64_t flags = O_RDWR; if (!attr || sock_verify_av_attr(attr)) return -FI_EINVAL; dom = container_of(domain, struct sock_domain, dom_fid); if (dom->attr.av_type != FI_AV_UNSPEC && attr && dom->attr.av_type != attr->type) return -FI_EINVAL; _av = calloc(1, sizeof(*_av)); if (!_av) return -FI_ENOMEM; _av->attr = *attr; _av->attr.count = (attr->count) ? attr->count : sock_av_def_sz; table_sz = sizeof(struct sock_av_table_hdr) + _av->attr.count * sizeof(struct sock_av_addr); if (attr->name) { _av->name = strdup(attr->name); if (!_av->name) { ret = -FI_ENOMEM; goto err1; } if (!(attr->flags & FI_READ)) flags |= O_CREAT; for (i = 0; i < strlen(_av->name); i++) if (_av->name[i] == ' ') _av->name[i] = '_'; SOCK_LOG_DBG("Creating shm segment :%s (size: %lu)\n", _av->name, table_sz); _av->shared_fd = shm_open(_av->name, flags, S_IRUSR | S_IWUSR); if (_av->shared_fd < 0) { SOCK_LOG_ERROR("shm_open failed\n"); ret = -FI_EINVAL; goto err2; } if (ftruncate(_av->shared_fd, table_sz) == -1) { SOCK_LOG_ERROR("ftruncate failed\n"); shm_unlink(_av->name); ret = -FI_EINVAL; goto err2; } _av->table_hdr = mmap(NULL, table_sz, PROT_READ | PROT_WRITE, MAP_SHARED, _av->shared_fd, 0); if (attr->flags & FI_READ) { if (_av->table_hdr->size != _av->attr.count) { ret = -FI_EINVAL; goto err2; } } else { _av->table_hdr->size = _av->attr.count; _av->table_hdr->stored = 0; } if (_av->table_hdr == MAP_FAILED) { SOCK_LOG_ERROR("mmap failed\n"); shm_unlink(_av->name); ret = -FI_EINVAL; goto err2; } } else { _av->table_hdr = calloc(1, table_sz); if (!_av->table_hdr) { ret = -FI_ENOMEM; goto err3; } _av->table_hdr->size = _av->attr.count; _av->table_hdr->req_sz = attr->count; } _av->table = (struct sock_av_addr *)((char *)_av->table_hdr + sizeof(struct sock_av_table_hdr)); _av->av_fid.fid.fclass = FI_CLASS_AV; 
_av->av_fid.fid.context = context; _av->av_fid.fid.ops = &sock_av_fi_ops; switch (attr->type) { case FI_AV_MAP: _av->av_fid.ops = &sock_am_ops; break; case FI_AV_TABLE: _av->av_fid.ops = &sock_at_ops; break; default: ret = -FI_EINVAL; goto err3; } atomic_initialize(&_av->ref, 0); atomic_inc(&dom->ref); _av->domain = dom; switch (dom->info.addr_format) { case FI_SOCKADDR_IN: _av->addrlen = sizeof(struct sockaddr_in); break; default: SOCK_LOG_ERROR("Invalid address format: only IPv4 supported\n"); ret = -FI_EINVAL; goto err3; } _av->rx_ctx_bits = attr->rx_ctx_bits; _av->mask = attr->rx_ctx_bits ? ((uint64_t)1 << (64 - attr->rx_ctx_bits)) - 1 : ~0; *av = &_av->av_fid; return 0; err3: free(_av->table_hdr); err2: free(_av->name); err1: free(_av); return ret; }
/*
 * Establish (or retrieve) the TCP connection for an fi_addr.  If the
 * connection map already resolves the address the existing conn is
 * returned; otherwise a non-blocking connect is attempted with up to
 * sock_conn_retry retries (goto-based retry loop), and the new conn is
 * inserted into the map and the AV index map.  Returns the connection,
 * or NULL on failure.
 */
struct sock_conn *sock_ep_connect(struct sock_ep *ep, fi_addr_t index)
{
	int conn_fd = -1, ret;
	int do_retry = sock_conn_retry;
	struct sock_conn *conn, *new_conn;
	uint16_t idx;
	struct sockaddr_in *addr;
	socklen_t lon;
	int valopt = 0;
	struct pollfd poll_fd;

	/* MSG endpoints have a single fixed peer; RDM/DGRAM resolve the
	 * destination through the AV (masking off rx_ctx bits). */
	if (ep->ep_type == FI_EP_MSG) {
		idx = 0;
		addr = ep->dest_addr;
	} else {
		idx = index & ep->av->mask;
		addr = (struct sockaddr_in *)&ep->av->table[idx].addr;
	}

do_connect:
	fastlock_acquire(&ep->cmap.lock);
	conn = sock_ep_lookup_conn(ep, index, addr);
	fastlock_release(&ep->cmap.lock);

	/* Anything other than the in-progress sentinel is a usable conn
	 * (or NULL). */
	if (conn != SOCK_CM_CONN_IN_PROGRESS)
		return conn;

	conn_fd = socket(AF_INET, SOCK_STREAM, 0);
	if (conn_fd == -1) {
		SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno);
		errno = FI_EOTHER;
		return NULL;
	}

	ret = fd_set_nonblock(conn_fd);
	if (ret) {
		SOCK_LOG_ERROR("failed to set conn_fd nonblocking, errno: %d\n",
			       errno);
		errno = FI_EOTHER;
		close(conn_fd);
		return NULL;
	}

	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
		     ntohs(addr->sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep->src_addr->sin_addr));

	ret = connect(conn_fd, (struct sockaddr *) addr, sizeof *addr);
	if (ret < 0) {
		if (errno == EINPROGRESS) {
			/* Non-blocking connect in flight: wait (15s) for
			 * writability, then read SO_ERROR for the result. */
			poll_fd.fd = conn_fd;
			poll_fd.events = POLLOUT;
			ret = poll(&poll_fd, 1, 15 * 1000);
			if (ret < 0) {
				SOCK_LOG_DBG("poll failed\n");
				goto retry;
			}

			lon = sizeof(int);
			ret = getsockopt(conn_fd, SOL_SOCKET, SO_ERROR,
					 (void *)(&valopt), &lon);
			if (ret < 0) {
				SOCK_LOG_DBG("getsockopt failed: %d, %d\n",
					     ret, conn_fd);
				goto retry;
			}

			if (valopt) {
				SOCK_LOG_DBG("Error in connection() %d - %s - %d\n",
					     valopt, strerror(valopt), conn_fd);
				SOCK_LOG_DBG("Connecting to: %s:%d\n",
					     inet_ntoa(addr->sin_addr),
					     ntohs(addr->sin_port));
				SOCK_LOG_DBG("Connecting using address:%s\n",
					     inet_ntoa(ep->src_addr->sin_addr));
				goto retry;
			}
			goto out;
		} else {
			SOCK_LOG_DBG("Timeout or error() - %s: %d\n",
				     strerror(errno), conn_fd);
			SOCK_LOG_DBG("Connecting to: %s:%d\n",
				     inet_ntoa(addr->sin_addr),
				     ntohs(addr->sin_port));
			SOCK_LOG_DBG("Connecting using address:%s\n",
				     inet_ntoa(ep->src_addr->sin_addr));
			goto retry;
		}
	} else {
		goto out;
	}

retry:
	/* NOTE(review): sleeps before checking do_retry, so the final
	 * failed attempt still pays the 10s delay — pre-existing. */
	do_retry--;
	sleep(10);
	if (!do_retry)
		goto err;

	if (conn_fd != -1) {
		close(conn_fd);
		conn_fd = -1;
	}

	SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n",
		       strerror(errno), conn_fd);
	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
		     ntohs(addr->sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep->src_addr->sin_addr));
	goto do_connect;

out:
	/* Publish the connection; another thread may have won the race,
	 * in which case the idm entry is left alone. */
	fastlock_acquire(&ep->cmap.lock);
	new_conn = sock_conn_map_insert(ep, addr, conn_fd, 0);
	new_conn->av_index = (ep->ep_type == FI_EP_MSG) ?
		FI_ADDR_NOTAVAIL : (fi_addr_t) idx;
	conn = idm_lookup(&ep->av_idm, index);
	if (conn == SOCK_CM_CONN_IN_PROGRESS) {
		idm_set(&ep->av_idm, index, new_conn);
		conn = new_conn;
	}
	fastlock_release(&ep->cmap.lock);
	return conn;

err:
	close(conn_fd);
	return NULL;
}
static ssize_t sock_ep_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { struct iovec msg_iov = { .iov_base = buf, .iov_len = len, }; struct fi_msg msg = { .msg_iov = &msg_iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .context = context, .data = 0, }; return sock_ep_recvmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { struct fi_msg msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .context = context, .data = 0, }; return sock_ep_recvmsg(ep, &msg, SOCK_USE_OP_FLAGS); } ssize_t sock_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { int ret; size_t i; uint64_t total_len, op_flags; struct sock_op tx_op; union sock_iov tx_iov; struct sock_conn *conn; struct sock_tx_ctx *tx_ctx; struct sock_ep *sock_ep; struct sock_ep_attr *ep_attr; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); ep_attr = sock_ep->attr; tx_ctx = sock_ep->attr->tx_ctx->use_shared ? 
sock_ep->attr->tx_ctx->stx_ctx : sock_ep->attr->tx_ctx; op_flags = sock_ep->tx_attr.op_flags; break; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx); ep_attr = tx_ctx->ep_attr; op_flags = tx_ctx->attr.op_flags; break; default: SOCK_LOG_ERROR("Invalid EP type\n"); return -FI_EINVAL; } #if ENABLE_DEBUG if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; #endif if (!tx_ctx->enabled) return -FI_EOPBADSTATE; if (sock_drop_packet(ep_attr)) return 0; ret = sock_ep_get_conn(ep_attr, tx_ctx, msg->addr, &conn); if (ret) return ret; SOCK_LOG_DBG("New sendmsg on TX: %p using conn: %p\n", tx_ctx, conn); SOCK_EP_SET_TX_OP_FLAGS(flags); if (flags & SOCK_USE_OP_FLAGS) flags |= op_flags; if (flags & FI_TRIGGER) { ret = sock_queue_msg_op(ep, msg, flags, FI_OP_SEND); if (ret != 1) return ret; } memset(&tx_op, 0, sizeof(struct sock_op)); tx_op.op = SOCK_OP_SEND; total_len = 0; if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) total_len += msg->msg_iov[i].iov_len; if (total_len > SOCK_EP_MAX_INJECT_SZ) return -FI_EINVAL; tx_op.src_iov_len = total_len; } else { tx_op.src_iov_len = msg->iov_count; total_len = msg->iov_count * sizeof(union sock_iov); } total_len += sizeof(struct sock_op_send); if (flags & FI_REMOTE_CQ_DATA) total_len += sizeof(uint64_t); sock_tx_ctx_start(tx_ctx); if (ofi_rbavail(&tx_ctx->rb) < total_len) { ret = -FI_EAGAIN; goto err; } sock_tx_ctx_write_op_send(tx_ctx, &tx_op, flags, (uintptr_t) msg->context, msg->addr, (uintptr_t) ((msg->iov_count > 0) ? 
msg->msg_iov[0].iov_base : NULL), ep_attr, conn); if (flags & FI_REMOTE_CQ_DATA) sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(msg->data)); if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len); } } else { for (i = 0; i < msg->iov_count; i++) { tx_iov.iov.addr = (uintptr_t) msg->msg_iov[i].iov_base; tx_iov.iov.len = msg->msg_iov[i].iov_len; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); } } sock_tx_ctx_commit(tx_ctx); return 0; err: sock_tx_ctx_abort(tx_ctx); return ret; } static ssize_t sock_ep_send(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg msg = { .msg_iov = &msg_iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .context = context, .data = 0, }; return sock_ep_sendmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { struct fi_msg msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .context = context, .data = 0, }; return sock_ep_sendmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg msg = { .msg_iov = &msg_iov, .desc = desc, .iov_count = 1, .addr = dest_addr, .context = context, .data = data, }; return sock_ep_sendmsg(ep, &msg, FI_REMOTE_CQ_DATA | SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_inject(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .context = NULL, .data = 0, }; return 
sock_ep_sendmsg(ep, &msg, FI_INJECT | SOCK_NO_COMPLETION | SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_injectdata(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .context = NULL, .data = data, }; return sock_ep_sendmsg(ep, &msg, FI_REMOTE_CQ_DATA | FI_INJECT | SOCK_NO_COMPLETION | SOCK_USE_OP_FLAGS); } struct fi_ops_msg sock_ep_msg_ops = { .size = sizeof(struct fi_ops_msg), .recv = sock_ep_recv, .recvv = sock_ep_recvv, .recvmsg = sock_ep_recvmsg, .send = sock_ep_send, .sendv = sock_ep_sendv, .sendmsg = sock_ep_sendmsg, .inject = sock_ep_inject, .senddata = sock_ep_senddata, .injectdata = sock_ep_injectdata }; ssize_t sock_ep_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { int ret; size_t i; struct sock_rx_ctx *rx_ctx; struct sock_rx_entry *rx_entry; struct sock_ep *sock_ep; uint64_t op_flags; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); rx_ctx = sock_ep->attr->rx_ctx; op_flags = sock_ep->rx_attr.op_flags; break; case FI_CLASS_RX_CTX: case FI_CLASS_SRX_CTX: rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); op_flags = rx_ctx->attr.op_flags; break; default: SOCK_LOG_ERROR("Invalid ep type\n"); return -FI_EINVAL; } #if ENABLE_DEBUG if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; #endif if (!rx_ctx->enabled) return -FI_EOPBADSTATE; if (flags & SOCK_USE_OP_FLAGS) flags |= op_flags; flags &= ~FI_MULTI_RECV; if (flags & FI_TRIGGER) { ret = sock_queue_tmsg_op(ep, msg, flags, FI_OP_TRECV); if (ret != 1) return ret; } if (flags & FI_PEEK) { return sock_rx_peek_recv(rx_ctx, msg->addr, msg->tag, msg->ignore, msg->context, flags, 1); } else if (flags & FI_CLAIM) { return sock_rx_claim_recv(rx_ctx, msg->context, flags, msg->tag, msg->ignore, 1, msg->msg_iov, msg->iov_count); } 
fastlock_acquire(&rx_ctx->lock); rx_entry = sock_rx_new_entry(rx_ctx); fastlock_release(&rx_ctx->lock); if (!rx_entry) return -FI_ENOMEM; rx_entry->rx_op.op = SOCK_OP_TRECV; rx_entry->rx_op.dest_iov_len = msg->iov_count; rx_entry->flags = flags; rx_entry->context = (uintptr_t) msg->context; rx_entry->addr = (rx_ctx->attr.caps & FI_DIRECTED_RECV) ? msg->addr : FI_ADDR_UNSPEC; rx_entry->data = msg->data; rx_entry->tag = msg->tag; rx_entry->ignore = msg->ignore; rx_entry->is_tagged = 1; for (i = 0; i < msg->iov_count; i++) { rx_entry->iov[i].iov.addr = (uintptr_t) msg->msg_iov[i].iov_base; rx_entry->iov[i].iov.len = msg->msg_iov[i].iov_len; rx_entry->total_len += rx_entry->iov[i].iov.len; } fastlock_acquire(&rx_ctx->lock); SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx); dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); fastlock_release(&rx_ctx->lock); return 0; } static ssize_t sock_ep_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { struct iovec msg_iov = { .iov_base = buf, .iov_len = len, }; struct fi_msg_tagged msg = { .msg_iov = &msg_iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .context = context, .tag = tag, .ignore = ignore, .data = 0, }; return sock_ep_trecvmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { struct fi_msg_tagged msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .context = context, .tag = tag, .ignore = ignore, .data = 0, }; return sock_ep_trecvmsg(ep, &msg, SOCK_USE_OP_FLAGS); } ssize_t sock_ep_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { int ret; size_t i; uint64_t total_len, op_flags; struct sock_op tx_op; union sock_iov tx_iov; struct sock_conn *conn; struct sock_tx_ctx *tx_ctx; struct sock_ep *sock_ep; struct 
sock_ep_attr *ep_attr; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->attr->tx_ctx->use_shared ? sock_ep->attr->tx_ctx->stx_ctx : sock_ep->attr->tx_ctx; ep_attr = sock_ep->attr; op_flags = sock_ep->tx_attr.op_flags; break; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx); ep_attr = tx_ctx->ep_attr; op_flags = tx_ctx->attr.op_flags; break; default: SOCK_LOG_ERROR("Invalid EP type\n"); return -FI_EINVAL; } #if ENABLE_DEBUG if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; #endif if (!tx_ctx->enabled) return -FI_EOPBADSTATE; if (sock_drop_packet(ep_attr)) return 0; ret = sock_ep_get_conn(ep_attr, tx_ctx, msg->addr, &conn); if (ret) return ret; SOCK_EP_SET_TX_OP_FLAGS(flags); if (flags & SOCK_USE_OP_FLAGS) flags |= op_flags; if (flags & FI_TRIGGER) { ret = sock_queue_tmsg_op(ep, msg, flags, FI_OP_TSEND); if (ret != 1) return ret; } memset(&tx_op, 0, sizeof(tx_op)); tx_op.op = SOCK_OP_TSEND; total_len = 0; if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) total_len += msg->msg_iov[i].iov_len; tx_op.src_iov_len = total_len; if (total_len > SOCK_EP_MAX_INJECT_SZ) return -FI_EINVAL; } else { total_len = msg->iov_count * sizeof(union sock_iov); tx_op.src_iov_len = msg->iov_count; } total_len += sizeof(struct sock_op_tsend); if (flags & FI_REMOTE_CQ_DATA) total_len += sizeof(uint64_t); sock_tx_ctx_start(tx_ctx); if (ofi_rbavail(&tx_ctx->rb) < total_len) { ret = -FI_EAGAIN; goto err; } sock_tx_ctx_write_op_tsend(tx_ctx, &tx_op, flags, (uintptr_t) msg->context, msg->addr, (uintptr_t) ((msg->iov_count > 0) ? 
msg->msg_iov[0].iov_base : NULL), ep_attr, conn, msg->tag); if (flags & FI_REMOTE_CQ_DATA) sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(msg->data)); if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len); } } else { for (i = 0; i < msg->iov_count; i++) { tx_iov.iov.addr = (uintptr_t) msg->msg_iov[i].iov_base; tx_iov.iov.len = msg->msg_iov[i].iov_len; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); } } sock_tx_ctx_commit(tx_ctx); return 0; err: sock_tx_ctx_abort(tx_ctx); return ret; } static ssize_t sock_ep_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg_tagged msg = { .msg_iov = &msg_iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .tag = tag, .ignore = 0, .context = context, .data = 0, }; return sock_ep_tsendmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t tag, void *context) { struct fi_msg_tagged msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .tag = tag, .ignore = 0, .context = context, .data = 0, }; return sock_ep_tsendmsg(ep, &msg, SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg_tagged msg = { .msg_iov = &msg_iov, .desc = desc, .iov_count = 1, .addr = dest_addr, .tag = tag, .ignore = 0, .context = context, .data = data, }; return sock_ep_tsendmsg(ep, &msg, FI_REMOTE_CQ_DATA | SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_tinject(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { struct iovec msg_iov = { .iov_base = (void 
*)buf, .iov_len = len, }; struct fi_msg_tagged msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .tag = tag, .ignore = 0, .context = NULL, .data = 0, }; return sock_ep_tsendmsg(ep, &msg, FI_INJECT | SOCK_NO_COMPLETION | SOCK_USE_OP_FLAGS); } static ssize_t sock_ep_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { struct iovec msg_iov = { .iov_base = (void *)buf, .iov_len = len, }; struct fi_msg_tagged msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .tag = tag, .ignore = 0, .context = NULL, .data = data, }; return sock_ep_tsendmsg(ep, &msg, FI_REMOTE_CQ_DATA | FI_INJECT | SOCK_NO_COMPLETION | SOCK_USE_OP_FLAGS); } struct fi_ops_tagged sock_ep_tagged = { .size = sizeof(struct fi_ops_tagged), .recv = sock_ep_trecv, .recvv = sock_ep_trecvv, .recvmsg = sock_ep_trecvmsg, .send = sock_ep_tsend, .sendv = sock_ep_tsendv, .sendmsg = sock_ep_tsendmsg, .inject = sock_ep_tinject, .senddata = sock_ep_tsenddata, .injectdata = sock_ep_tinjectdata, };
/*
 * Establish (or look up) the TCP connection from 'ep_attr' to the peer
 * identified by 'index' (AV index for connectionless EPs; for MSG endpoints
 * the pre-configured dest_addr/msg_dest_port pair is used instead).
 *
 * Returns the connection object, an already-established conn, or NULL on
 * failure.  Retries the connect up to sock_conn_retry times, sleeping 10s
 * between attempts.
 */
struct sock_conn *sock_ep_connect(struct sock_ep_attr *ep_attr, fi_addr_t index)
{
	int conn_fd = -1, ret;
	int do_retry = sock_conn_retry;
	struct sock_conn *conn, *new_conn;
	struct sockaddr_in addr;
	socklen_t lon;
	int valopt = 0;
	struct pollfd poll_fd;

	if (ep_attr->ep_type == FI_EP_MSG) {
		/* Need to check that destination address has been passed to endpoint */
		assert(ep_attr->dest_addr);
		addr = *ep_attr->dest_addr;
		addr.sin_port = htons(ep_attr->msg_dest_port);
	} else {
		addr = *((struct sockaddr_in *)&ep_attr->av->table[index].addr);
	}

do_connect:
	fastlock_acquire(&ep_attr->cmap.lock);
	conn = sock_ep_lookup_conn(ep_attr, index, &addr);
	fastlock_release(&ep_attr->cmap.lock);

	/* Only the IN_PROGRESS sentinel means this thread must do the work;
	 * anything else (established conn or NULL) is returned as-is. */
	if (conn != SOCK_CM_CONN_IN_PROGRESS)
		return conn;

	conn_fd = ofi_socket(AF_INET, SOCK_STREAM, 0);
	if (conn_fd == -1) {
		SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno);
		errno = FI_EOTHER;
		return NULL;
	}

	ret = fd_set_nonblock(conn_fd);
	if (ret) {
		SOCK_LOG_ERROR("failed to set conn_fd nonblocking, errno: %d\n", errno);
		errno = FI_EOTHER;
		ofi_close_socket(conn_fd);
		return NULL;
	}

	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
		     ntohs(addr.sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep_attr->src_addr->sin_addr));

	ret = connect(conn_fd, (struct sockaddr *) &addr, sizeof addr);
	if (ret < 0) {
		if (ofi_sockerr() == EINPROGRESS) {
			/* Non-blocking connect in flight: wait up to 15s for
			 * writability, then read SO_ERROR for the outcome. */
			poll_fd.fd = conn_fd;
			poll_fd.events = POLLOUT;
			ret = poll(&poll_fd, 1, 15 * 1000);
			if (ret < 0) {
				SOCK_LOG_DBG("poll failed\n");
				goto retry;
			}

			lon = sizeof(int);
			ret = getsockopt(conn_fd, SOL_SOCKET, SO_ERROR,
					 (void*)(&valopt), &lon);
			if (ret < 0) {
				SOCK_LOG_DBG("getsockopt failed: %d, %d\n",
					     ret, conn_fd);
				goto retry;
			}

			if (valopt) {
				SOCK_LOG_DBG("Error in connection() %d - %s - %d\n",
					     valopt, strerror(valopt), conn_fd);
				SOCK_LOG_DBG("Connecting to: %s:%d\n",
					     inet_ntoa(addr.sin_addr),
					     ntohs(addr.sin_port));
				SOCK_LOG_DBG("Connecting using address:%s\n",
					     inet_ntoa(ep_attr->src_addr->sin_addr));
				goto retry;
			}
			goto out;
		} else {
			SOCK_LOG_DBG("Timeout or error() - %s: %d\n",
				     strerror(errno), conn_fd);
			SOCK_LOG_DBG("Connecting to: %s:%d\n",
				     inet_ntoa(addr.sin_addr),
				     ntohs(addr.sin_port));
			SOCK_LOG_DBG("Connecting using address:%s\n",
				     inet_ntoa(ep_attr->src_addr->sin_addr));
			goto retry;
		}
	} else {
		goto out;
	}

retry:
	/* NOTE(review): the 10s sleep happens even on the final, abandoned
	 * attempt, and the error log below prints conn_fd after it may have
	 * been reset to -1 — cosmetic, but worth confirming intent. */
	do_retry--;
	sleep(10);
	if (!do_retry)
		goto err;

	if (conn_fd != -1) {
		ofi_close_socket(conn_fd);
		conn_fd = -1;
	}

	SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n",
		       strerror(errno), conn_fd);
	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
		     ntohs(addr.sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep_attr->src_addr->sin_addr));
	goto do_connect;

out:
	fastlock_acquire(&ep_attr->cmap.lock);
	new_conn = sock_conn_map_insert(ep_attr, &addr, conn_fd, 0);
	if (!new_conn) {
		fastlock_release(&ep_attr->cmap.lock);
		goto err;
	}
	new_conn->av_index = (ep_attr->ep_type == FI_EP_MSG) ?
			     FI_ADDR_NOTAVAIL : index;
	/* Publish only if we are still the in-progress owner for this index. */
	conn = ofi_idm_lookup(&ep_attr->av_idm, index);
	if (conn == SOCK_CM_CONN_IN_PROGRESS) {
		if (ofi_idm_set(&ep_attr->av_idm, index, new_conn) < 0)
			SOCK_LOG_ERROR("ofi_idm_set failed\n");
		conn = new_conn;
	}
	fastlock_release(&ep_attr->cmap.lock);
	return conn;

err:
	ofi_close_socket(conn_fd);
	return NULL;
}
/*
 * Create the endpoint's listening socket, resolve/record its service (port),
 * and spawn the _sock_conn_listen accept thread.
 *
 * Returns 0 on success, -FI_EINVAL on any failure (socket/bind/listen/
 * thread-creation).
 */
int sock_conn_listen(struct sock_ep_attr *ep_attr)
{
	struct addrinfo *s_res = NULL, *p;
	struct addrinfo hints = { 0 };
	int listen_fd = -1, ret;	/* -1 matches the "no socket" convention */
	socklen_t addr_size;
	struct sockaddr_in addr;
	struct sock_conn_listener *listener = &ep_attr->listener;
	char service[NI_MAXSERV] = {0};
	char *port;
	char ipaddr[24];

	hints.ai_family = AF_INET;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_flags = AI_PASSIVE;

	memcpy(&addr, ep_attr->src_addr, sizeof(addr));
	if (getnameinfo((void *)ep_attr->src_addr, sizeof(*ep_attr->src_addr),
			NULL, 0, listener->service,
			sizeof(listener->service), NI_NUMERICSERV)) {
		SOCK_LOG_ERROR("could not resolve src_addr\n");
		return -FI_EINVAL;
	}

	if (ep_attr->ep_type == FI_EP_MSG) {
		/* MSG endpoints listen on an ephemeral port. */
		memset(listener->service, 0, NI_MAXSERV);
		port = NULL;
		addr.sin_port = 0;
	} else
		port = listener->service;

	inet_ntop(addr.sin_family, &addr.sin_addr, ipaddr, sizeof(ipaddr));
	ret = getaddrinfo(ipaddr, port, &hints, &s_res);
	if (ret) {
		SOCK_LOG_ERROR("no available AF_INET address, service %s, %s\n",
			       listener->service, gai_strerror(ret));
		return -FI_EINVAL;
	}

	SOCK_LOG_DBG("Binding listener thread to port: %s\n", listener->service);
	for (p = s_res; p; p = p->ai_next) {
		listen_fd = ofi_socket(p->ai_family, p->ai_socktype,
				       p->ai_protocol);
		if (listen_fd >= 0) {
			sock_set_sockopts(listen_fd);
			/* BUGFIX: bind to the current candidate 'p', not
			 * always the first entry 's_res' — otherwise the
			 * iteration over the addrinfo list is pointless. */
			if (!bind(listen_fd, p->ai_addr, p->ai_addrlen))
				break;
			ofi_close_socket(listen_fd);
			listen_fd = -1;
		}
	}
	freeaddrinfo(s_res);
	if (listen_fd < 0) {
		SOCK_LOG_ERROR("failed to listen to port: %s\n",
			       listener->service);
		goto err;
	}

	if (atoi(listener->service) == 0) {
		/* Kernel picked the port: read it back and record it. */
		addr_size = sizeof(addr);
		if (getsockname(listen_fd, (struct sockaddr *) &addr,
				&addr_size))
			goto err;
		snprintf(listener->service, sizeof listener->service,
			 "%d", ntohs(addr.sin_port));
		SOCK_LOG_DBG("Bound to port: %s - %d\n",
			     listener->service, getpid());
		ep_attr->msg_src_port = ntohs(addr.sin_port);
	}

	if (ep_attr->src_addr->sin_addr.s_addr == 0) {
		/* Wildcard source: substitute a concrete local address. */
		snprintf(service, sizeof service, "%s", listener->service);
		ret = sock_get_src_addr_from_hostname(ep_attr->src_addr,
						      service);
		if (ret)
			goto err;
	}

	if (listen(listen_fd, sock_cm_def_map_sz)) {
		SOCK_LOG_ERROR("failed to listen socket: %s\n",
			       strerror(errno));
		goto err;
	}

	if (((struct sockaddr_in *) (ep_attr->src_addr))->sin_port == 0) {
		((struct sockaddr_in *) (ep_attr->src_addr))->sin_port =
			htons(atoi(listener->service));
	}

	listener->sock = listen_fd;
	/* signal_fds[0] is written by closers; [1] is polled by the thread. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, listener->signal_fds) < 0)
		goto err;

	listener->do_listen = 1;
	fd_set_nonblock(listener->signal_fds[1]);
	if (pthread_create(&listener->listener_thread, 0,
			   _sock_conn_listen, ep_attr)) {
		SOCK_LOG_ERROR("failed to create conn listener thread\n");
		goto err;
	}
	/* Spin until the thread has entered its poll loop. */
	while (!*((volatile int*)&listener->is_ready))
		;
	return 0;

err:
	if (listen_fd >= 0)
		ofi_close_socket(listen_fd);
	return -FI_EINVAL;
}
ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { int ret; size_t i; struct sock_rx_ctx *rx_ctx; struct sock_rx_entry *rx_entry; struct sock_ep *sock_ep; uint64_t op_flags; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); rx_ctx = sock_ep->attr->rx_ctx; op_flags = sock_ep->rx_attr.op_flags; break; case FI_CLASS_RX_CTX: case FI_CLASS_SRX_CTX: rx_ctx = container_of(ep, struct sock_rx_ctx, ctx); op_flags = rx_ctx->attr.op_flags; break; default: SOCK_LOG_ERROR("Invalid ep type\n"); return -FI_EINVAL; } #if ENABLE_DEBUG if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; #endif if (!rx_ctx->enabled) return -FI_EOPBADSTATE; if (flags & SOCK_USE_OP_FLAGS) flags |= op_flags; if (flags & FI_TRIGGER) { ret = sock_queue_msg_op(ep, msg, flags, FI_OP_RECV); if (ret != 1) return ret; } if (flags & FI_PEEK) { return sock_rx_peek_recv(rx_ctx, msg->addr, 0L, ~0ULL, msg->context, flags, 0); } else if (flags & FI_CLAIM) { return sock_rx_claim_recv(rx_ctx, msg->context, flags, 0L, ~0ULL, 0, msg->msg_iov, msg->iov_count); } fastlock_acquire(&rx_ctx->lock); rx_entry = sock_rx_new_entry(rx_ctx); fastlock_release(&rx_ctx->lock); if (!rx_entry) return -FI_ENOMEM; rx_entry->rx_op.op = SOCK_OP_RECV; rx_entry->rx_op.dest_iov_len = msg->iov_count; rx_entry->flags = flags; rx_entry->context = (uintptr_t) msg->context; rx_entry->addr = (rx_ctx->attr.caps & FI_DIRECTED_RECV) ? msg->addr : FI_ADDR_UNSPEC; rx_entry->data = msg->data; rx_entry->ignore = ~0ULL; rx_entry->is_tagged = 0; for (i = 0; i < msg->iov_count; i++) { rx_entry->iov[i].iov.addr = (uintptr_t) msg->msg_iov[i].iov_base; rx_entry->iov[i].iov.len = msg->msg_iov[i].iov_len; rx_entry->total_len += rx_entry->iov[i].iov.len; } SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx); fastlock_acquire(&rx_ctx->lock); dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); fastlock_release(&rx_ctx->lock); return 0; }
int sock_rdm_verify_ep_attr(struct fi_ep_attr *ep_attr, struct fi_tx_attr *tx_attr, struct fi_rx_attr *rx_attr) { int ret; if (ep_attr) { switch (ep_attr->protocol) { case FI_PROTO_UNSPEC: case FI_PROTO_SOCK_TCP: break; default: SOCK_LOG_DBG("Unsupported protocol\n"); return -FI_ENODATA; } if (ep_attr->protocol_version != sock_rdm_ep_attr.protocol_version) { SOCK_LOG_DBG("Invalid protocol version\n"); return -FI_ENODATA; } if (ep_attr->max_msg_size > sock_rdm_ep_attr.max_msg_size) { SOCK_LOG_DBG("Message size too large\n"); return -FI_ENODATA; } if (ep_attr->msg_prefix_size > sock_rdm_ep_attr.msg_prefix_size) { SOCK_LOG_DBG("Msg prefix size not supported\n"); return -FI_ENODATA; } if (ep_attr->max_order_raw_size > sock_rdm_ep_attr.max_order_raw_size) { SOCK_LOG_DBG("RAW order size too large\n"); return -FI_ENODATA; } if (ep_attr->max_order_war_size > sock_rdm_ep_attr.max_order_war_size) { SOCK_LOG_DBG("WAR order size too large\n"); return -FI_ENODATA; } if (ep_attr->max_order_waw_size > sock_rdm_ep_attr.max_order_waw_size) { SOCK_LOG_DBG("WAW order size too large\n"); return -FI_ENODATA; } if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) return -FI_ENODATA; if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) return -FI_ENODATA; } ret = sock_rdm_verify_tx_attr(tx_attr); if (ret) return ret; ret = sock_rdm_verify_rx_attr(rx_attr); if (ret) return ret; return 0; }