int mlx_ep_open( struct fid_domain *domain, struct fi_info *info, struct fid_ep **fid, void *context) { struct mlx_ep *ep; struct mlx_domain *u_domain; int ofi_status = FI_SUCCESS; ucs_status_t status = UCS_OK; ucp_worker_params_t worker_params; worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; worker_params.thread_mode = UCS_THREAD_MODE_MULTI; u_domain = container_of( domain, struct mlx_domain, u_domain.domain_fid); ep = (struct mlx_ep *) calloc(1, sizeof (struct mlx_ep)); if (!ep) { return -ENOMEM; } ofi_status = ofi_endpoint_init(domain, &mlx_util_prov, info, &ep->ep, context, mlx_ep_progress); if (ofi_status) { goto free_ep; } status = ucp_worker_create( u_domain->context, &worker_params, &(ep->worker)); if (status != UCS_OK) { ofi_status = MLX_TRANSLATE_ERRCODE(status); ofi_atomic_dec32(&(u_domain->u_domain.ref)); goto free_ep; } ep->ep.ep_fid.fid.ops = &mlx_fi_ops; ep->ep.ep_fid.ops = &mlx_ep_ops; ep->ep.ep_fid.cm = &mlx_cm_ops; ep->ep.ep_fid.tagged = &mlx_tagged_ops; ep->ep.flags = info->mode; ep->ep.caps = u_domain->u_domain.info_domain_caps; *fid = &(ep->ep.ep_fid); return FI_SUCCESS; free_ep: free(ep); return ofi_status; }
/*
 * UCP send-completion callback.
 *
 * Posts a tagged completion entry on the bound CQ.  A canceled request is
 * simply released with no completion.  On error, the entry is flagged and
 * an extended error entry is queued on cq->err_list (best effort: dropped
 * with a warning if allocation fails).
 *
 * Fix: the allocation-failure path previously returned while still
 * holding cq->cq_lock and without releasing the UCP request — a deadlock
 * and a request leak.  All exits past the cancel check now release both.
 */
void mlx_send_callback(void *request, ucs_status_t status)
{
	struct util_cq *cq;
	struct mlx_request *mlx_req = request;
	struct fi_cq_tagged_entry *t_entry;
	struct util_cq_err_entry *err;

	cq = mlx_req->cq;

	if (status == UCS_ERR_CANCELED) {
		ucp_request_release(request);
		return;
	}

	fastlock_acquire(&cq->cq_lock);

	t_entry = cirque_tail(cq->cirq);
	*t_entry = (mlx_req->completion.tagged);
	cirque_commit(cq->cirq);

	if (status != UCS_OK) {
		t_entry->flags |= UTIL_FLAG_ERROR;
		err = calloc(1, sizeof(struct util_cq_err_entry));
		if (!err) {
			FI_WARN(&mlx_prov, FI_LOG_CQ,
				"out of memory, cannot report CQ error\n");
			/* Must not return with the lock held or the
			 * request outstanding. */
			mlx_req->type = MLX_FI_REQ_UNINITIALIZED;
			fastlock_release(&cq->cq_lock);
			ucp_request_release(request);
			return;
		}
		err->err_entry = (mlx_req->completion.error);
		err->err_entry.prov_errno = (int)status;
		err->err_entry.err = MLX_TRANSLATE_ERRCODE(status);
		err->err_entry.olen = 0;
		slist_insert_tail(&err->list_entry, &cq->err_list);
	}

	/* Mark the request reusable before handing it back to UCP. */
	mlx_req->type = MLX_FI_REQ_UNINITIALIZED;
	fastlock_release(&cq->cq_lock);
	ucp_request_release(request);
}
/*
 * UCP tagged-receive completion callback.
 *
 * Canceled requests are released with no completion.  A completion for a
 * request not yet claimed by the receive path (MLX_FI_REQ_UNINITIALIZED)
 * is an unexpected message: the request is tagged UNEXPECTED /
 * UNEXPECTED_ERR and kept for later matching.  Otherwise a tagged entry
 * (plus an extended error entry on failure) is committed to the CQ and
 * the request is released back to UCP.
 *
 * Fixes: the CQ cirque was mutated and fastlock_release() called without
 * any matching fastlock_acquire() — the lock is now taken around all CQ
 * state; and the allocation-failure path no longer returns while holding
 * the lock / leaking the UCP request.
 */
void mlx_recv_callback(void *request, ucs_status_t status,
		       ucp_tag_recv_info_t *info)
{
	struct util_cq *cq;
	struct mlx_request *mlx_req;

	mlx_req = (struct mlx_request *)request;
	if (status == UCS_ERR_CANCELED) {
		ucp_request_release(request);
		return;
	}

	cq = mlx_req->cq;
	fastlock_acquire(&cq->cq_lock);

	mlx_req->completion.tagged.tag = info->sender_tag;
	mlx_req->completion.tagged.len = info->length;

	if (status != UCS_OK) {
		mlx_req->completion.error.prov_errno = (int)status;
		mlx_req->completion.error.err = MLX_TRANSLATE_ERRCODE(status);
	}

	if (mlx_req->type == MLX_FI_REQ_UNINITIALIZED) {
		/* Unexpected message: keep the request for later matching. */
		if (status != UCS_OK) {
			mlx_req->completion.error.olen = info->length;
			mlx_req->type = MLX_FI_REQ_UNEXPECTED_ERR;
		} else {
			mlx_req->type = MLX_FI_REQ_UNEXPECTED;
		}
	} else {
		if (status != UCS_OK) {
			/* Overflow length: bytes that did not fit. */
			mlx_req->completion.error.olen =
				info->length - mlx_req->completion.error.len;
		}

		struct fi_cq_tagged_entry *t_entry;
		t_entry = cirque_tail(cq->cirq);
		*t_entry = (mlx_req->completion.tagged);

		if (status != UCS_OK) {
			struct util_cq_err_entry *err;
			t_entry->flags |= UTIL_FLAG_ERROR;
			err = calloc(1, sizeof(struct util_cq_err_entry));
			if (!err) {
				FI_WARN(&mlx_prov, FI_LOG_CQ,
					"out of memory, cannot report CQ error\n");
				/* Must not return with the lock held or
				 * the request outstanding. */
				mlx_req->type = MLX_FI_REQ_UNINITIALIZED;
				fastlock_release(&cq->cq_lock);
				ucp_request_release(request);
				return;
			}
			err->err_entry = (mlx_req->completion.error);
			slist_insert_tail(&err->list_entry, &cq->err_list);
		}

		if (cq->src) {
			cq->src[cirque_windex(
				(struct mlx_comp_cirq *)(cq->cirq))] =
				FI_ADDR_NOTAVAIL;
		}

		if (cq->wait) {
			cq->wait->signal(cq->wait);
		}

		mlx_req->type = MLX_FI_REQ_UNINITIALIZED;
		cirque_commit(cq->cirq);
		ucp_request_release(request);
	}
	fastlock_release(&cq->cq_lock);
}
static int mlx_init_errcodes() { MLX_TRANSLATE_ERRCODE (UCS_OK) = -FI_SUCCESS; MLX_TRANSLATE_ERRCODE (UCS_INPROGRESS) = -FI_EINPROGRESS; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_MESSAGE) = -FI_ENOMSG; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_RESOURCE) = -FI_EINVAL; MLX_TRANSLATE_ERRCODE (UCS_ERR_IO_ERROR) = -FI_EIO; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_MEMORY) = -FI_ENOMEM; MLX_TRANSLATE_ERRCODE (UCS_ERR_INVALID_PARAM) = -FI_EINVAL; MLX_TRANSLATE_ERRCODE (UCS_ERR_UNREACHABLE) = -FI_ENETUNREACH; MLX_TRANSLATE_ERRCODE (UCS_ERR_INVALID_ADDR) = -FI_EINVAL; MLX_TRANSLATE_ERRCODE (UCS_ERR_NOT_IMPLEMENTED) = -FI_ENOSYS; MLX_TRANSLATE_ERRCODE (UCS_ERR_MESSAGE_TRUNCATED) = -FI_EMSGSIZE; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_PROGRESS) = -FI_EAGAIN; MLX_TRANSLATE_ERRCODE (UCS_ERR_BUFFER_TOO_SMALL)= -FI_ETOOSMALL; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_ELEM) = -FI_ENOENT; MLX_TRANSLATE_ERRCODE (UCS_ERR_SOME_CONNECTS_FAILED) = -FI_EIO; MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_DEVICE) = -FI_ENODEV; MLX_TRANSLATE_ERRCODE (UCS_ERR_BUSY) = -FI_EBUSY; MLX_TRANSLATE_ERRCODE (UCS_ERR_CANCELED) = -FI_ECANCELED; MLX_TRANSLATE_ERRCODE (UCS_ERR_SHMEM_SEGMENT) = -FI_EINVAL; MLX_TRANSLATE_ERRCODE (UCS_ERR_ALREADY_EXISTS) = -EEXIST; MLX_TRANSLATE_ERRCODE (UCS_ERR_OUT_OF_RANGE) = -FI_EINVAL; MLX_TRANSLATE_ERRCODE (UCS_ERR_TIMED_OUT) = -FI_ETIMEDOUT; MLX_TRANSLATE_ERRCODE (UCS_ERR_EXCEEDS_LIMIT) = -FI_E2BIG; MLX_TRANSLATE_ERRCODE (UCS_ERR_UNSUPPORTED) = -FI_ENOSYS; return 0; }