ssize_t sock_ep_tx_atomic(struct fid_ep *ep, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int i, ret; size_t datatype_sz; struct sock_op tx_op; union sock_iov tx_iov; struct sock_conn *conn; struct sock_tx_ctx *tx_ctx; uint64_t total_len, src_len, dst_len; struct sock_ep *sock_ep; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx); sock_ep = tx_ctx->ep; break; default: SOCK_LOG_ERROR("Invalid EP type\n"); return -FI_EINVAL; } if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT || msg->rma_iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; if (!tx_ctx->enabled) return -FI_EOPBADSTATE; if (sock_ep->connected) { conn = sock_ep_lookup_conn(sock_ep); } else { conn = sock_av_lookup_addr(sock_ep, tx_ctx->av, msg->addr); if (!conn) { SOCK_LOG_ERROR("Address lookup failed\n"); return -errno; } } if (!conn) return -FI_EAGAIN; SOCK_EP_SET_TX_OP_FLAGS(flags); if (flags & SOCK_USE_OP_FLAGS) flags |= tx_ctx->attr.op_flags; if (msg->op == FI_ATOMIC_READ) { flags &= ~FI_INJECT; } if (sock_ep_is_send_cq_low(&tx_ctx->comp, flags)) { SOCK_LOG_ERROR("CQ size low\n"); return -FI_EAGAIN; } if (flags & FI_TRIGGER) { ret = sock_queue_atomic_op(ep, msg, comparev, compare_count, resultv, result_count, flags, SOCK_OP_ATOMIC); if (ret != 1) return ret; } src_len = 0; datatype_sz = fi_datatype_size(msg->datatype); if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) src_len += (msg->msg_iov[i].count * datatype_sz); if (src_len > SOCK_EP_MAX_INJECT_SZ) return -FI_EINVAL; total_len = src_len; } else { total_len = msg->iov_count * sizeof(union sock_iov); } total_len += (sizeof(struct sock_op_send) + (msg->rma_iov_count * sizeof(union sock_iov)) + (result_count * sizeof(union sock_iov))); 
sock_tx_ctx_start(tx_ctx); if (rbfdavail(&tx_ctx->rbfd) < total_len) { ret = -FI_EAGAIN; goto err; } memset(&tx_op, 0, sizeof(tx_op)); tx_op.op = SOCK_OP_ATOMIC; tx_op.dest_iov_len = msg->rma_iov_count; tx_op.atomic.op = msg->op; tx_op.atomic.datatype = msg->datatype; tx_op.atomic.res_iov_len = result_count; tx_op.atomic.cmp_iov_len = compare_count; if (flags & FI_INJECT) tx_op.src_iov_len = src_len; else tx_op.src_iov_len = msg->iov_count; sock_tx_ctx_write_op_send(tx_ctx, &tx_op, flags, (uintptr_t) msg->context, msg->addr, (uintptr_t) msg->msg_iov[0].addr, sock_ep, conn); if (flags & FI_REMOTE_CQ_DATA) sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); src_len = 0; if (flags & FI_INJECT) { for (i = 0; i < msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].addr, msg->msg_iov[i].count * datatype_sz); src_len += (msg->msg_iov[i].count * datatype_sz); } } else { for (i = 0; i < msg->iov_count; i++) { tx_iov.ioc.addr = (uintptr_t) msg->msg_iov[i].addr; tx_iov.ioc.count = msg->msg_iov[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); src_len += (tx_iov.ioc.count * datatype_sz); } } #ifdef ENABLE_DEBUG if (src_len > SOCK_EP_MAX_ATOMIC_SZ) { ret = -FI_EINVAL; goto err; } #endif dst_len = 0; for (i = 0; i < msg->rma_iov_count; i++) { tx_iov.ioc.addr = msg->rma_iov[i].addr; tx_iov.ioc.key = msg->rma_iov[i].key; tx_iov.ioc.count = msg->rma_iov[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } if (msg->iov_count && dst_len != src_len) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } else { src_len = dst_len; } dst_len = 0; for (i = 0; i < result_count; i++) { tx_iov.ioc.addr = (uintptr_t) resultv[i].addr; tx_iov.ioc.count = resultv[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } #ifdef ENABLE_DEBUG if (result_count && (dst_len != src_len)) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = 
-FI_EINVAL; goto err; } #endif dst_len = 0; for (i = 0; i < compare_count; i++) { tx_iov.ioc.addr = (uintptr_t) comparev[i].addr; tx_iov.ioc.count = comparev[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } #ifdef ENABLE_DEBUG if (compare_count && (dst_len != src_len)) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } #endif sock_tx_ctx_commit(tx_ctx); return 0; err: sock_tx_ctx_abort(tx_ctx); return ret; }
/*
 * Return the connection for `index`, establishing a TCP connection to the
 * peer if the connection map reports one as still in progress.
 *
 * For FI_EP_MSG endpoints the peer is ep_attr->dest_addr with the port
 * replaced by ep_attr->msg_dest_port; otherwise it is the address stored in
 * the address vector table at `index`.
 *
 * A non-blocking connect() is issued and, on EINPROGRESS, the socket is
 * polled for writability for up to 15 seconds and SO_ERROR is checked.
 * A failed or timed-out attempt is retried after a 10 second pause until
 * the sock_conn_retry budget is used up.
 *
 * Returns the connection found by the lookup or created here, or NULL on
 * failure.  errno is set to FI_EOTHER when the socket cannot be created or
 * switched to non-blocking mode.
 */
struct sock_conn *sock_ep_connect(struct sock_ep_attr *ep_attr, fi_addr_t index)
{
	int conn_fd = -1, ret;
	int do_retry = sock_conn_retry;
	struct sock_conn *conn, *new_conn;
	struct sockaddr_in addr;
	socklen_t lon;
	int valopt = 0;
	struct pollfd poll_fd;

	if (ep_attr->ep_type == FI_EP_MSG) {
		/* Need to check that destination address has been passed to endpoint */
		assert(ep_attr->dest_addr);
		addr = *ep_attr->dest_addr;
		addr.sin_port = htons(ep_attr->msg_dest_port);
	} else {
		addr = *((struct sockaddr_in *)&ep_attr->av->table[index].addr);
	}

do_connect:
	fastlock_acquire(&ep_attr->cmap.lock);
	conn = sock_ep_lookup_conn(ep_attr, index, &addr);
	fastlock_release(&ep_attr->cmap.lock);

	/* Anything but the in-progress marker is returned to the caller as is. */
	if (conn != SOCK_CM_CONN_IN_PROGRESS)
		return conn;

	conn_fd = ofi_socket(AF_INET, SOCK_STREAM, 0);
	if (conn_fd == -1) {
		SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno);
		errno = FI_EOTHER;
		return NULL;
	}

	ret = fd_set_nonblock(conn_fd);
	if (ret) {
		SOCK_LOG_ERROR("failed to set conn_fd nonblocking, errno: %d\n", errno);
		errno = FI_EOTHER;
		ofi_close_socket(conn_fd);
		return NULL;
	}

	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
		     ntohs(addr.sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep_attr->src_addr->sin_addr));

	ret = connect(conn_fd, (struct sockaddr *) &addr, sizeof addr);
	if (ret >= 0)
		goto out;

	if (ofi_sockerr() != EINPROGRESS) {
		SOCK_LOG_DBG("Timeout or error() - %s: %d\n", strerror(errno), conn_fd);
		SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
			     ntohs(addr.sin_port));
		SOCK_LOG_DBG("Connecting using address:%s\n",
			     inet_ntoa(ep_attr->src_addr->sin_addr));
		goto retry;
	}

	poll_fd.fd = conn_fd;
	poll_fd.events = POLLOUT;

	ret = poll(&poll_fd, 1, 15 * 1000);
	if (ret < 0) {
		SOCK_LOG_DBG("poll failed\n");
		goto retry;
	}
	if (ret == 0) {
		/*
		 * Timeout: the socket never became writable, so SO_ERROR
		 * would still read 0.  Treat this as a failed attempt
		 * instead of registering an unconnected socket.
		 */
		SOCK_LOG_DBG("poll timed out\n");
		goto retry;
	}

	lon = sizeof(int);
	ret = getsockopt(conn_fd, SOL_SOCKET, SO_ERROR, (void*)(&valopt), &lon);
	if (ret < 0) {
		SOCK_LOG_DBG("getsockopt failed: %d, %d\n", ret, conn_fd);
		goto retry;
	}

	if (valopt) {
		SOCK_LOG_DBG("Error in connection() %d - %s - %d\n",
			     valopt, strerror(valopt), conn_fd);
		SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
			     ntohs(addr.sin_port));
		SOCK_LOG_DBG("Connecting using address:%s\n",
			     inet_ntoa(ep_attr->src_addr->sin_addr));
		goto retry;
	}
	goto out;

retry:
	do_retry--;
	/* Give up right away once the budget is spent; no point sleeping first. */
	if (!do_retry)
		goto err;

	/* Log while conn_fd still identifies the socket that failed. */
	SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n", strerror(errno), conn_fd);
	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr.sin_addr),
		     ntohs(addr.sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep_attr->src_addr->sin_addr));

	if (conn_fd != -1) {
		ofi_close_socket(conn_fd);
		conn_fd = -1;
	}

	sleep(10);
	goto do_connect;

out:
	fastlock_acquire(&ep_attr->cmap.lock);
	new_conn = sock_conn_map_insert(ep_attr, &addr, conn_fd, 0);
	if (!new_conn) {
		fastlock_release(&ep_attr->cmap.lock);
		goto err;
	}
	new_conn->av_index = (ep_attr->ep_type == FI_EP_MSG) ?
			     FI_ADDR_NOTAVAIL : index;

	/* Publish the new connection only if the slot is still marked in progress. */
	conn = ofi_idm_lookup(&ep_attr->av_idm, index);
	if (conn == SOCK_CM_CONN_IN_PROGRESS) {
		if (ofi_idm_set(&ep_attr->av_idm, index, new_conn) < 0)
			SOCK_LOG_ERROR("ofi_idm_set failed\n");
		conn = new_conn;
	}
	fastlock_release(&ep_attr->cmap.lock);
	return conn;

err:
	if (conn_fd != -1)
		ofi_close_socket(conn_fd);
	return NULL;
}
ssize_t sock_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { int ret, i; struct sock_op tx_op; union sock_iov tx_iov; struct sock_conn *conn; struct sock_tx_ctx *tx_ctx; uint64_t total_len, src_len, dst_len; struct sock_ep *sock_ep; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx); sock_ep = tx_ctx->ep; break; default: SOCK_LOG_ERROR("Invalid EP type\n"); return -FI_EINVAL; } #if ENABLE_DEBUG if (msg->iov_count > SOCK_EP_MAX_IOV_LIMIT || msg->rma_iov_count > SOCK_EP_MAX_IOV_LIMIT) return -FI_EINVAL; #endif if (!tx_ctx->enabled) return -FI_EOPBADSTATE; if (sock_ep->connected) { conn = sock_ep_lookup_conn(sock_ep); } else { conn = sock_av_lookup_addr(sock_ep, tx_ctx->av, msg->addr); if (!conn) { SOCK_LOG_ERROR("Address lookup failed\n"); return -errno; } } if (!conn) return -FI_EAGAIN; SOCK_EP_SET_TX_OP_FLAGS(flags); if (flags & SOCK_USE_OP_FLAGS) flags |= tx_ctx->attr.op_flags; if (sock_ep_is_send_cq_low(&tx_ctx->comp, flags)) { SOCK_LOG_ERROR("CQ size low\n"); return -FI_EAGAIN; } if (flags & FI_TRIGGER) { ret = sock_queue_rma_op(ep, msg, flags, SOCK_OP_READ); if (ret != 1) return ret; } total_len = sizeof(struct sock_op_send) + (msg->iov_count * sizeof(union sock_iov)) + (msg->rma_iov_count * sizeof(union sock_iov)); sock_tx_ctx_start(tx_ctx); if (rbfdavail(&tx_ctx->rbfd) < total_len) { ret = -FI_EAGAIN; goto err; } memset(&tx_op, 0, sizeof(struct sock_op)); tx_op.op = SOCK_OP_READ; tx_op.src_iov_len = msg->rma_iov_count; tx_op.dest_iov_len = msg->iov_count; sock_tx_ctx_write_op_send(tx_ctx, &tx_op, flags, (uintptr_t) msg->context, msg->addr, (uintptr_t) msg->msg_iov[0].iov_base, sock_ep, conn); src_len = 0; for (i = 0; i < msg->rma_iov_count; i++) { tx_iov.iov.addr = msg->rma_iov[i].addr; tx_iov.iov.key = msg->rma_iov[i].key; tx_iov.iov.len = msg->rma_iov[i].len; 
sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); src_len += tx_iov.iov.len; } dst_len = 0; for (i = 0; i < msg->iov_count; i++) { tx_iov.iov.addr = (uintptr_t) msg->msg_iov[i].iov_base; tx_iov.iov.len = msg->msg_iov[i].iov_len; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += tx_iov.iov.len; } #if ENABLE_DEBUG if (dst_len != src_len) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } #endif sock_tx_ctx_commit(tx_ctx); return 0; err: sock_tx_ctx_abort(tx_ctx); return ret; }
static ssize_t sock_ep_tx_atomic(struct fid_ep *ep, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int i, ret; size_t datatype_sz; struct sock_op tx_op; union sock_iov tx_iov; struct sock_conn *conn; struct sock_tx_ctx *tx_ctx; uint64_t total_len, src_len, dst_len; struct sock_ep *sock_ep; switch (ep->fid.fclass) { case FI_CLASS_EP: sock_ep = container_of(ep, struct sock_ep, ep); tx_ctx = sock_ep->tx_ctx; break; case FI_CLASS_TX_CTX: tx_ctx = container_of(ep, struct sock_tx_ctx, fid.ctx); sock_ep = tx_ctx->ep; break; default: SOCK_LOG_ERROR("Invalid EP type\n"); return -FI_EINVAL; } assert(tx_ctx->enabled && msg->iov_count <= SOCK_EP_MAX_IOV_LIMIT && msg->rma_iov_count <= SOCK_EP_MAX_IOV_LIMIT); if (sock_ep->connected) { conn = sock_ep_lookup_conn(sock_ep); } else { conn = sock_av_lookup_addr(sock_ep, tx_ctx->av, msg->addr); } if (!conn) return -FI_EAGAIN; src_len = 0; datatype_sz = fi_datatype_size(msg->datatype); if (flags & FI_INJECT) { for (i=0; i< msg->iov_count; i++) { src_len += (msg->msg_iov[i].count * datatype_sz); } assert(src_len <= SOCK_EP_MAX_INJECT_SZ); total_len = src_len; } else { total_len = msg->iov_count * sizeof(union sock_iov); } total_len += (sizeof(tx_op) + (msg->rma_iov_count * sizeof(union sock_iov)) + (result_count * sizeof (union sock_iov))); sock_tx_ctx_start(tx_ctx); if (rbfdavail(&tx_ctx->rbfd) < total_len) { ret = -FI_EAGAIN; goto err; } flags |= tx_ctx->attr.op_flags; memset(&tx_op, 0, sizeof(tx_op)); tx_op.op = SOCK_OP_ATOMIC; tx_op.dest_iov_len = msg->rma_iov_count; tx_op.atomic.op = msg->op; tx_op.atomic.datatype = msg->datatype; tx_op.atomic.res_iov_len = result_count; tx_op.atomic.cmp_iov_len = compare_count; if (flags & FI_INJECT) tx_op.src_iov_len = src_len; else tx_op.src_iov_len = msg->iov_count; sock_tx_ctx_write_op_send(tx_ctx, &tx_op, flags, (uintptr_t) msg->context, msg->addr, 
(uintptr_t) msg->msg_iov[0].addr, sock_ep, conn); if (flags & FI_REMOTE_CQ_DATA) { sock_tx_ctx_write(tx_ctx, &msg->data, sizeof(uint64_t)); } src_len = 0; if (flags & FI_INJECT) { for (i=0; i< msg->iov_count; i++) { sock_tx_ctx_write(tx_ctx, msg->msg_iov[i].addr, msg->msg_iov[i].count * datatype_sz); src_len += (msg->msg_iov[i].count * datatype_sz); } } else { for (i = 0; i< msg->iov_count; i++) { tx_iov.ioc.addr = (uintptr_t) msg->msg_iov[i].addr; tx_iov.ioc.count = msg->msg_iov[i].count; tx_iov.ioc.key = (uintptr_t) msg->desc[i]; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); src_len += (tx_iov.ioc.count * datatype_sz); } } assert(src_len <= SOCK_EP_MAX_ATOMIC_SZ); dst_len = 0; for (i = 0; i< msg->rma_iov_count; i++) { tx_iov.ioc.addr = msg->rma_iov[i].addr; tx_iov.ioc.key = msg->rma_iov[i].key; tx_iov.ioc.count = msg->rma_iov[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } if (dst_len != src_len) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } dst_len = 0; for (i = 0; i< result_count; i++) { tx_iov.ioc.addr = (uintptr_t) resultv[i].addr; tx_iov.ioc.count = resultv[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } if (result_count && (dst_len != src_len)) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } dst_len = 0; for (i = 0; i< compare_count; i++) { tx_iov.ioc.addr = (uintptr_t) comparev[i].addr; tx_iov.ioc.count = comparev[i].count; sock_tx_ctx_write(tx_ctx, &tx_iov, sizeof(tx_iov)); dst_len += (tx_iov.ioc.count * datatype_sz); } if (compare_count && (dst_len != src_len)) { SOCK_LOG_ERROR("Buffer length mismatch\n"); ret = -FI_EINVAL; goto err; } sock_tx_ctx_commit(tx_ctx); return 0; err: SOCK_LOG_INFO("Not enough space for TX entry, try again\n"); sock_tx_ctx_abort(tx_ctx); return ret; }
/*
 * Return the connection for `index`, establishing a TCP connection to the
 * peer if the connection map reports one as still in progress.
 *
 * For FI_EP_MSG endpoints the peer is ep->dest_addr; otherwise it is the
 * address vector table entry selected by `index & ep->av->mask`.
 *
 * A non-blocking connect() is issued and, on EINPROGRESS, the socket is
 * polled for writability for up to 15 seconds and SO_ERROR is checked.
 * A failed or timed-out attempt is retried after a 10 second pause until
 * the sock_conn_retry budget is used up.
 *
 * Returns the connection found by the lookup or created here, or NULL on
 * failure.  errno is set to FI_EOTHER when the socket cannot be created or
 * switched to non-blocking mode.
 */
struct sock_conn *sock_ep_connect(struct sock_ep *ep, fi_addr_t index)
{
	int conn_fd = -1, ret;
	int do_retry = sock_conn_retry;
	struct sock_conn *conn, *new_conn;
	uint16_t idx;
	struct sockaddr_in *addr;
	socklen_t lon;
	int valopt = 0;
	struct pollfd poll_fd;

	if (ep->ep_type == FI_EP_MSG) {
		idx = 0;
		addr = ep->dest_addr;
	} else {
		idx = index & ep->av->mask;
		addr = (struct sockaddr_in *)&ep->av->table[idx].addr;
	}

do_connect:
	fastlock_acquire(&ep->cmap.lock);
	conn = sock_ep_lookup_conn(ep, index, addr);
	fastlock_release(&ep->cmap.lock);

	/* Anything but the in-progress marker is returned to the caller as is. */
	if (conn != SOCK_CM_CONN_IN_PROGRESS)
		return conn;

	conn_fd = socket(AF_INET, SOCK_STREAM, 0);
	if (conn_fd == -1) {
		SOCK_LOG_ERROR("failed to create conn_fd, errno: %d\n", errno);
		errno = FI_EOTHER;
		return NULL;
	}

	ret = fd_set_nonblock(conn_fd);
	if (ret) {
		SOCK_LOG_ERROR("failed to set conn_fd nonblocking, errno: %d\n", errno);
		errno = FI_EOTHER;
		close(conn_fd);
		return NULL;
	}

	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
		     ntohs(addr->sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep->src_addr->sin_addr));

	ret = connect(conn_fd, (struct sockaddr *) addr, sizeof *addr);
	if (ret >= 0)
		goto out;

	if (errno != EINPROGRESS) {
		SOCK_LOG_DBG("Timeout or error() - %s: %d\n", strerror(errno), conn_fd);
		SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
			     ntohs(addr->sin_port));
		SOCK_LOG_DBG("Connecting using address:%s\n",
			     inet_ntoa(ep->src_addr->sin_addr));
		goto retry;
	}

	poll_fd.fd = conn_fd;
	poll_fd.events = POLLOUT;

	ret = poll(&poll_fd, 1, 15 * 1000);
	if (ret < 0) {
		SOCK_LOG_DBG("poll failed\n");
		goto retry;
	}
	if (ret == 0) {
		/*
		 * Timeout: the socket never became writable, so SO_ERROR
		 * would still read 0.  Treat this as a failed attempt
		 * instead of registering an unconnected socket.
		 */
		SOCK_LOG_DBG("poll timed out\n");
		goto retry;
	}

	lon = sizeof(int);
	ret = getsockopt(conn_fd, SOL_SOCKET, SO_ERROR, (void*)(&valopt), &lon);
	if (ret < 0) {
		SOCK_LOG_DBG("getsockopt failed: %d, %d\n", ret, conn_fd);
		goto retry;
	}

	if (valopt) {
		SOCK_LOG_DBG("Error in connection() %d - %s - %d\n",
			     valopt, strerror(valopt), conn_fd);
		SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
			     ntohs(addr->sin_port));
		SOCK_LOG_DBG("Connecting using address:%s\n",
			     inet_ntoa(ep->src_addr->sin_addr));
		goto retry;
	}
	goto out;

retry:
	do_retry--;
	/* Give up right away once the budget is spent; no point sleeping first. */
	if (!do_retry)
		goto err;

	/* Log while conn_fd still identifies the socket that failed. */
	SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n", strerror(errno), conn_fd);
	SOCK_LOG_DBG("Connecting to: %s:%d\n", inet_ntoa(addr->sin_addr),
		     ntohs(addr->sin_port));
	SOCK_LOG_DBG("Connecting using address:%s\n",
		     inet_ntoa(ep->src_addr->sin_addr));

	if (conn_fd != -1) {
		close(conn_fd);
		conn_fd = -1;
	}

	sleep(10);
	goto do_connect;

out:
	fastlock_acquire(&ep->cmap.lock);
	new_conn = sock_conn_map_insert(ep, addr, conn_fd, 0);
	if (!new_conn) {
		/* Insertion failed: release the lock and drop the socket. */
		fastlock_release(&ep->cmap.lock);
		goto err;
	}
	new_conn->av_index = (ep->ep_type == FI_EP_MSG) ?
			     FI_ADDR_NOTAVAIL : (fi_addr_t) idx;

	/* Publish the new connection only if the slot is still marked in progress. */
	conn = idm_lookup(&ep->av_idm, index);
	if (conn == SOCK_CM_CONN_IN_PROGRESS) {
		idm_set(&ep->av_idm, index, new_conn);
		conn = new_conn;
	}
	fastlock_release(&ep->cmap.lock);
	return conn;

err:
	if (conn_fd != -1)
		close(conn_fd);
	return NULL;
}