static int do_readv(MPID_Request *rreq, int pipe_fd, MPL_IOV iov[],
                    int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t nread;

    /* note: the readv itself uses the request's own IOV fields; the callers
       pass those same fields in through iov/iov_offset/iov_count, which are
       what adjust_partially_xferred_iov() updates below */
    nread = readv(pipe_fd, &rreq->dev.iov[rreq->dev.iov_offset], rreq->dev.iov_count);
    MPIR_ERR_CHKANDJUMP2(nread < 0 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read",
                         "**readv %d %s", errno, MPIU_Strerror(errno));

    if (nread < 0) {
        /* the check above already jumped for every other errno, so the only
           error that can reach this point is EAGAIN: nothing to do yet */
        goto fn_exit;
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, nread);
    if (*complete) {
        /* look for additional data to receive and reload the IOV if there is more */
        mpi_errno = check_req_complete(rreq->ch.vc, rreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            nread = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(nread < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

fn_fail:
fn_exit:
    return mpi_errno;
}
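/* For reference, a minimal sketch of the IOV bookkeeping that
 * adjust_partially_xferred_iov() is relied upon to perform above: consume
 * `nbytes` from the front of the active entries, trim a partially
 * transferred entry, and report whether the whole IOV has been drained.
 * This is an illustrative reconstruction written against plain POSIX
 * `struct iovec` (MPL_IOV is assumed to be a thin wrapper around it); the
 * real helper is defined elsewhere in this file and may differ. */
#include <sys/uio.h>

static int sketch_adjust_partially_xferred_iov(struct iovec iov[], int *iov_offset,
                                               int *iov_count, ssize_t nbytes)
{
    int i;

    for (i = *iov_offset; i < *iov_offset + *iov_count; ++i) {
        if ((size_t)nbytes < iov[i].iov_len) {
            /* this entry was only partially transferred: advance its base
             * pointer, shrink its length, and make it the new first entry */
            iov[i].iov_base = (char *)iov[i].iov_base + nbytes;
            iov[i].iov_len -= nbytes;
            *iov_count -= i - *iov_offset;
            *iov_offset = i;
            return 0;   /* more data remains */
        }
        nbytes -= iov[i].iov_len;
    }
    *iov_count = 0;
    return 1;           /* every entry fully transferred */
}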
static int do_vmsplice(MPID_Request *sreq, int pipe_fd, MPL_IOV iov[],
                       int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t err;

#if 1
    err = vmsplice(pipe_fd, &iov[*iov_offset], *iov_count, SPLICE_F_NONBLOCK);
#else
    err = writev(pipe_fd, &iov[*iov_offset], *iov_count);
#endif

    if (err < 0) {
        if (errno == EAGAIN) goto fn_exit;
        MPIR_ERR_CHKANDJUMP2(errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**vmsplice",
                             "**vmsplice %d %s", errno, MPIU_Strerror(errno));
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, err);
    if (*complete) {
        /* look for additional data to send and reload IOV if there is more */
        mpi_errno = check_req_complete(sreq->ch.vc, sreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            err = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(err < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

fn_fail:
fn_exit:
    return mpi_errno;
}
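/* The two helpers above drive a shared pipe: do_vmsplice() maps the sender's
 * user pages into it and do_readv() drains it on the receiving side.  The
 * EAGAIN handling in do_readv() is only reachable if the read end is
 * non-blocking (the SPLICE_F_NONBLOCK flag already covers the vmsplice call
 * itself).  Below is a hypothetical setup helper, not part of the original
 * file, showing that arrangement with an anonymous pipe; the real LMT code
 * establishes its own descriptor pair between the two processes. */
#include <fcntl.h>
#include <unistd.h>

static int sketch_open_nonblocking_pipe(int fds[2])
{
    /* fds[0] is the read end handed to do_readv(), fds[1] the write end
     * handed to do_vmsplice() */
    if (pipe(fds) < 0)
        return -1;
    if (fcntl(fds[0], F_SETFL, O_NONBLOCK) < 0 ||
        fcntl(fds[1], F_SETFL, O_NONBLOCK) < 0) {
        close(fds[0]);
        close(fds[1]);
        return -1;
    }
    return 0;
}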
int MPID_nem_lmt_dma_handle_cookie(MPIDI_VC_t *vc, MPID_Request *req, MPID_IOV cookie)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);

    if (cookie.MPID_IOV_LEN == 0 && cookie.MPID_IOV_BUF == NULL) {
        /* req is a send request; we need to initiate another knem request and
           send a COOKIE message back to the receiver indicating the lid
           returned from the ioctl. */
        int complete;
        knem_cookie_t s_cookie;

        /* This function will invoke the OnDataAvail function to load more data. */
        mpi_errno = check_req_complete(vc, req, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        /* If we were complete we should have received a DONE message instead
           of a COOKIE message. */
        MPIU_Assert(!complete);

        mpi_errno = do_dma_send(vc, req, req->dev.iov_count, &req->dev.iov[0], &s_cookie);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPID_nem_lmt_send_COOKIE(vc, req, &s_cookie, sizeof(knem_cookie_t));
    }
    else {
        /* req is a receive request and we need to continue receiving using
           the lid provided in the cookie iov. */
        mpi_errno = MPID_nem_lmt_dma_start_recv(vc, req, cookie);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);
    /* return the accumulated error code rather than unconditionally returning
       MPI_SUCCESS, which would silently drop any failure popped above */
    return mpi_errno;
}
int MPID_nem_lmt_dma_progress(void)
{
    int mpi_errno = MPI_SUCCESS;
    struct lmt_dma_node *prev = NULL;
    struct lmt_dma_node *free_me = NULL;
    struct lmt_dma_node *cur = outstanding_head;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);

    /* Iterate over a linked-list of (req,status_idx)-tuples looking for
       completed/failed requests.  Currently knem only provides status to the
       receiver, so all of these requests are recv requests. */
    while (cur) {
        switch (*cur->status_p) {
            case KNEM_STATUS_SUCCESS:
            {
                /* complete the request if all data has been sent, remove it
                   from the list */
                int complete = 0;
                mpi_errno = check_req_complete(cur->vc, cur->req, &complete);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);

                free_status_index(cur->status_p - knem_status);

                if (complete) {
                    /* request was completed by the OnDataAvail fn */
                    MPID_nem_lmt_send_DONE(cur->vc, cur->req); /* tell the other side to complete its request */
                    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
                }
                else {
                    /* There is more data to send.  We must inform the sender
                       that we have completely received the current batch and
                       that the next batch should be sent. */
                    MPID_nem_lmt_send_COOKIE(cur->vc, cur->req, NULL, 0);
                }

                /* Right now we always free the cur element, even if the
                   request is incomplete, because it simplifies the logic. */
                if (cur == outstanding_head) {
                    outstanding_head = cur->next;
                    prev = NULL;
                    free_me = cur;
                    cur = cur->next;
                }
                else {
                    prev->next = cur->next;
                    free_me = cur;
                    cur = cur->next;
                }
                if (free_me) MPIU_Free(free_me);
                --MPID_nem_local_lmt_pending;
                continue;
            }
            break;
            case KNEM_STATUS_FAILED:
                /* set the error status for the request, complete it, then
                   dequeue the entry */
                cur->req->status.MPI_ERROR = MPI_SUCCESS;
                MPIU_ERR_SET1(cur->req->status.MPI_ERROR, MPI_ERR_OTHER,
                              "**recv_status", "**recv_status %d", *cur->status_p);
                MPIDI_CH3U_Request_complete(cur->req);

                if (cur == outstanding_head) {
                    outstanding_head = cur->next;
                    prev = NULL;
                    free_me = cur;
                    cur = cur->next;
                }
                else {
                    prev->next = cur->next;
                    free_me = cur;
                    cur = cur->next;
                }
                if (free_me) MPIU_Free(free_me);
                --MPID_nem_local_lmt_pending;
                continue;

                break;
            case KNEM_STATUS_PENDING:
                /* nothing to do here */
                break;
            default:
                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**invalid_knem_status",
                                     "**invalid_knem_status %d", *cur->status_p);
                break;
        }

        prev = cur;
        cur = cur->next;
    }

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
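/* For readability, the list node the progress loop walks, reconstructed from
 * how its fields are used above (vc/req/status_p/next and the index
 * arithmetic against knem_status); the authoritative definition lives
 * elsewhere in this file, so this copy is kept disabled. */
#if 0
struct lmt_dma_node {
    struct lmt_dma_node *next;        /* singly linked list rooted at outstanding_head */
    MPIDI_VC_t *vc;                   /* connection this transfer belongs to */
    MPID_Request *req;                /* receive request to complete */
    volatile knem_status_t *status_p; /* slot in the shared knem_status array;
                                         its index is recovered as status_p - knem_status */
};
#endif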
int MPID_nem_lmt_dma_start_recv(MPIDI_VC_t *vc, MPID_Request *rreq, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int nodma;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype * dt_ptr;
    volatile knem_status_t *status;
    knem_status_t current_status;
    struct lmt_dma_node *node = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    /* MT: this code assumes only one thread can be at this point at a time */
    if (knem_fd < 0) {
        mpi_errno = open_knem_dev();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* find out contig/noncontig, size, and lb for the datatype */
    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, data_sz, dt_ptr, dt_true_lb);

    nodma = !knem_has_dma || data_sz < MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD;

    if (dt_contig) {
        /* handle the iov creation ourselves */
        rreq->dev.iov[0].MPID_IOV_BUF = (char *)rreq->dev.user_buf + dt_true_lb;
        rreq->dev.iov[0].MPID_IOV_LEN = data_sz;
        rreq->dev.iov_count = 1;
    }
    else {
        if (rreq->dev.segment_ptr == NULL) {
            /* segment_ptr may be non-null when this is a continuation of a
               many-part message that we couldn't fit in one single flight of
               iovs. */
            MPIU_Assert(rreq->dev.segment_ptr == NULL);
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                              rreq->dev.datatype, rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            /* see load_send_iov FIXME above */
            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }

    MPIU_Assert(s_cookie.MPID_IOV_LEN == sizeof(knem_cookie_t));
    MPIU_Assert(s_cookie.MPID_IOV_BUF != NULL);
    mpi_errno = do_dma_recv(rreq->dev.iov_count, rreq->dev.iov,
                            *((knem_cookie_t *)s_cookie.MPID_IOV_BUF), nodma,
                            &status, &current_status);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* TODO refactor this block and MPID_nem_lmt_dma_progress (and anywhere
     * else) to share a common function.  This advancement/completion code is
     * duplicated. */
    if (current_status != KNEM_STATUS_PENDING) {
        /* complete the request if all data has been sent, remove it from the
           list */
        int complete = 0;

        MPIU_ERR_CHKANDJUMP1(current_status == KNEM_STATUS_FAILED, mpi_errno, MPI_ERR_OTHER,
                             "**recv_status", "**recv_status %d", current_status);

        mpi_errno = check_req_complete(vc, rreq, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        free_status_index(status - knem_status);

        if (complete) {
            /* request was completed by the OnDataAvail fn */
            MPID_nem_lmt_send_DONE(vc, rreq); /* tell the other side to complete its request */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
        }
        else {
            /* There is more data to send.  We must inform the sender that we
               have completely received the current batch and that the next
               batch should be sent. */
            MPID_nem_lmt_send_COOKIE(vc, rreq, NULL, 0);
        }
    }

    /* XXX DJG FIXME this looks like it always pushes! */
    /* push request if not complete for progress checks later */
    node = MPIU_Malloc(sizeof(struct lmt_dma_node));
    node->vc = vc;
    node->req = rreq;
    node->status_p = status;
    node->next = outstanding_head;
    outstanding_head = node;
    ++MPID_nem_local_lmt_pending;

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
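/* One possible answer to the "always pushes" FIXME above, sketched only and
 * not what MPICH ships: enqueue the node only while the knem copy is still
 * pending.  The non-PENDING branch above has already sent DONE or COOKIE and
 * released the status slot, so tracking it further in
 * MPID_nem_lmt_dma_progress() may read a slot that has since been handed to
 * a different transfer. */
#if 0
    if (current_status == KNEM_STATUS_PENDING) {
        node = MPIU_Malloc(sizeof(struct lmt_dma_node));
        MPIU_ERR_CHKANDJUMP1(node == NULL, mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "lmt_dma_node");
        node->vc = vc;
        node->req = rreq;
        node->status_p = status;
        node->next = outstanding_head;
        outstanding_head = node;
        ++MPID_nem_local_lmt_pending;
    }
#endif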