/*
 * MPIDI_CH3U_Request_load_recv_iov()
 *
 * Set up the receive request for its next read: fill in rreq->dev.iov and
 * rreq->dev.iov_count and choose the rreq->dev.OnDataAvail handler that is
 * to run once that data has arrived.
 *
 * Three situations are distinguished:
 *   1. segment_first >= segment_size: everything that fits in the user
 *      buffer has been handled; whatever is left of the message is read into
 *      a SRBuf (allocated here if needed) and discarded.
 *   2. The request already uses a SRBuf (and is not an accumulate-type
 *      receive): keep reading into the SRBuf, after any bytes already held
 *      at its start (tmpbuf_off).
 *   3. Otherwise: ask the segment code for an IOV pointing straight into the
 *      user buffer.  If that IOV would move too few bytes per entry, switch
 *      to a SRBuf and re-enter this function to set it up.
 *
 * Returns MPI_SUCCESS, the error code created when a SRBuf cannot be
 * allocated, or whatever the recursive call returns.
 */
int MPIDI_CH3U_Request_load_recv_iov(MPID_Request * const rreq)
{
    MPI_Aint seg_end;
    int is_accum;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);

    if (rreq->dev.segment_first >= rreq->dev.segment_size) {
        /* Case 1: receive and toss any extra data that does not fit in the
           user's buffer */
        MPIDI_msg_sz_t excess_sz = rreq->dev.recv_data_sz - rreq->dev.segment_first;

        if (!MPIDI_Request_get_srbuf_flag(rreq)) {
            MPIDI_CH3U_SRBuf_alloc(rreq, excess_sz);
            /* --BEGIN ERROR HANDLING-- */
            if (rreq->dev.tmpbuf_sz == 0) {
                MPIU_DBG_MSG(CH3_CHANNEL,TYPICAL,"SRBuf allocation failure");
                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__,
                                                 MPI_ERR_OTHER, "**nomem", 0);
                rreq->status.MPI_ERROR = mpi_errno;
                goto fn_exit;
            }
            /* --END ERROR HANDLING-- */
        }

        if (excess_sz > rreq->dev.tmpbuf_sz) {
            /* The leftover is larger than the SRBuf: fill the buffer once,
               advance, and come back for the rest. */
            MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                         "updating rreq to read overflow data into the SRBuf and reload IOV");
            rreq->dev.iov[0].MPID_IOV_LEN = rreq->dev.tmpbuf_sz;
            rreq->dev.segment_first += rreq->dev.tmpbuf_sz;
            rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReloadIOV;
        }
        else {
            /* The leftover fits in one SRBuf read; the request finishes
               after it. */
            MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                         "updating rreq to read overflow data into the SRBuf and complete");
            rreq->dev.iov[0].MPID_IOV_LEN = excess_sz;
            MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
            /* Eventually, use OnFinal for this instead */
            rreq->dev.OnDataAvail = rreq->dev.OnFinal;
        }

        rreq->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rreq->dev.tmpbuf;
        rreq->dev.iov_count = 1;
        goto fn_exit;
    }

    /* From here on we are still reading data that needs to go into the user
       buffer. */
    is_accum = (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV ||
                MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);

    if (!is_accum && MPIDI_Request_get_srbuf_flag(rreq)) {
        /* Case 2.  Once a SRBuf is in use, we continue to use it since a
           small amount of data may already be present at the beginning of
           the buffer.  This data is left over from the previous unpack, most
           likely a result of alignment issues.  NOTE: we could force the use
           of the SRBuf only when (rreq->dev.tmpbuf_off > 0)... */
        MPIDI_msg_sz_t room = rreq->dev.tmpbuf_sz - rreq->dev.tmpbuf_off;
        MPIDI_msg_sz_t chunk_sz =
            rreq->dev.segment_size - rreq->dev.segment_first - rreq->dev.tmpbuf_off;
        MPIDI_msg_sz_t buffered_end;

        MPIU_Assert(chunk_sz > 0);
        if (room < chunk_sz) {
            chunk_sz = room;
        }

        rreq->dev.iov[0].MPID_IOV_BUF =
            (MPID_IOV_BUF_CAST)((char *) rreq->dev.tmpbuf + rreq->dev.tmpbuf_off);
        rreq->dev.iov[0].MPID_IOV_LEN = chunk_sz;
        rreq->dev.iov_offset = 0;
        rreq->dev.iov_count = 1;

        /* Message offset reached once this read has landed in the SRBuf. */
        buffered_end = rreq->dev.segment_first + chunk_sz + rreq->dev.tmpbuf_off;
        MPIU_Assert(buffered_end <= rreq->dev.recv_data_sz);

        if (buffered_end != rreq->dev.recv_data_sz) {
            MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                         "updating rreq to read more data into the SRBuf");
            rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV;
        }
        else {
            MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                         "updating rreq to read the remaining data into the SRBuf");
            rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackSRBufComplete;
        }
        goto fn_exit;
    }

    /* Case 3: build an IOV that points directly into the user buffer. */
    seg_end = rreq->dev.segment_size;
    rreq->dev.iov_count = MPID_IOV_LIMIT;
    rreq->dev.iov_offset = 0;
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
                     "pre-upv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
                     rreq->dev.segment_first, seg_end, rreq->dev.iov_count));
    MPIU_Assert(rreq->dev.segment_first < seg_end);
    MPIU_Assert(seg_end > 0);
    MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first,
                               &seg_end, &rreq->dev.iov[0], &rreq->dev.iov_count);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
                     "post-upv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d, iov_offset=%lld",
                     rreq->dev.segment_first, seg_end, rreq->dev.iov_count,
                     (long long)rreq->dev.iov_offset));
    MPIU_Assert(rreq->dev.iov_count >= 0 && rreq->dev.iov_count <= MPID_IOV_LIMIT);

    /* --BEGIN ERROR HANDLING-- */
    if (rreq->dev.iov_count == 0) {
        /* If the data can't be unpacked, then we have a mismatch between the
           datatype and the amount of data received.  Adjust the segment info
           so that the remaining data is received and thrown away (the
           recursive call then takes the overflow path). */
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME,
                                                      __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        MPIR_STATUS_SET_COUNT(rreq->status, rreq->dev.segment_first);
        rreq->dev.segment_size = rreq->dev.segment_first;
        mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
        goto fn_exit;
    }
    MPIU_Assert(rreq->dev.iov_offset < rreq->dev.iov_count);
    /* --END ERROR HANDLING-- */

    if (seg_end == rreq->dev.recv_data_sz) {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                     "updating rreq to read the remaining data directly into the user buffer");
        /* Eventually, use OnFinal for this instead */
        rreq->dev.OnDataAvail = rreq->dev.OnFinal;
        goto fn_exit;
    }

    if (is_accum ||
        seg_end == rreq->dev.segment_size ||
        (seg_end - rreq->dev.segment_first) / rreq->dev.iov_count >= MPIDI_IOV_DENSITY_MIN) {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
                     "updating rreq to read more data directly into the user buffer");
        rreq->dev.segment_first = seg_end;
        rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReloadIOV;
        goto fn_exit;
    }

    /* Too little data would have been received using an IOV.  We will start
       receiving data into a SRBuf and unpacking it later. */
    MPIU_Assert(MPIDI_Request_get_srbuf_flag(rreq) == FALSE);

    MPIDI_CH3U_SRBuf_alloc(rreq, rreq->dev.segment_size - rreq->dev.segment_first);
    rreq->dev.tmpbuf_off = 0;
    /* --BEGIN ERROR HANDLING-- */
    if (rreq->dev.tmpbuf_sz == 0) {
        /* FIXME - we should drain the data off the pipe here, but we don't
           have a buffer to drain it into.  should this be a fatal error? */
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"SRBuf allocation failure");
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__,
                                         MPI_ERR_OTHER, "**nomem", "**nomem %d",
                                         rreq->dev.segment_size - rreq->dev.segment_first);
        rreq->status.MPI_ERROR = mpi_errno;
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    /* fill in the IOV using a recursive call (takes the SRBuf path above) */
    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Receive_data_found()
 *
 * Deliver message data to a receive request, given that *buflen bytes of the
 * message are already available at buf.
 *
 *   rreq     - the receive request (user buffer, count, datatype and
 *              recv_data_sz are read from rreq->dev)
 *   buf      - bytes that have already been received
 *   buflen   - in:  number of bytes available at buf
 *              out: number of bytes of buf consumed here (0 when an IOV was
 *                   loaded instead of copying)
 *   complete - out: TRUE when the data was copied/unpacked here, FALSE when
 *              an IOV was set up and more reading is required
 *
 * If the message is larger than the user buffer, a truncation error is
 * recorded in rreq->status and only the part that fits is targeted.
 *
 * Returns MPI_SUCCESS or an MPI error code (segment allocation failure or a
 * failure reported by MPIDI_CH3U_Request_load_recv_iov).
 */
int MPIDI_CH3U_Receive_data_found(MPIR_Request *rreq, void *buf, intptr_t *buflen, int *complete)
{
    int is_contig;
    MPI_Aint true_lb;
    intptr_t capacity;
    MPIR_Datatype * dtp = NULL;
    intptr_t xfer_sz;
    int whole_msg;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            is_contig, capacity, dtp, true_lb);

    /* whole_msg: the user buffer can hold the entire message.  When it
       cannot, xfer_sz is clamped to the buffer size, so "whole_msg" is the
       same condition as "xfer_sz == rreq->dev.recv_data_sz". */
    whole_msg = (rreq->dev.recv_data_sz <= capacity);
    if (whole_msg) {
        xfer_sz = rreq->dev.recv_data_sz;
    }
    else {
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "receive buffer too small; message truncated, msg_sz=%" PRIdPTR ", userbuf_sz=%"
                                        PRIdPTR,
                                 rreq->dev.recv_data_sz, capacity));
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
                     "**truncate", "**truncate %d %d %d %d",
                     rreq->status.MPI_SOURCE, rreq->status.MPI_TAG,
                     rreq->dev.recv_data_sz, capacity );
        MPIR_STATUS_SET_COUNT(rreq->status, capacity);
        xfer_sz = capacity;
    }

    if (is_contig && whole_msg) {
        /* The user buffer is contiguous and large enough for the entire
           message.  Either all of it is already at buf and can be copied
           right away, or an IOV is built so the channel reads it in place. */
        if (*buflen < xfer_sz) {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");

            rreq->dev.iov[0].MPL_IOV_BUF =
                (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + true_lb);
            rreq->dev.iov[0].MPL_IOV_LEN = xfer_sz;
            rreq->dev.iov_count = 1;
            *buflen = 0;
            *complete = FALSE;
        }
        else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying contiguous data to user buffer");
            /* copy data out of the receive buffer, unless the request asks
               for the data to be dropped */
            if (!rreq->dev.drop_data) {
                MPIR_Memcpy((char*)(rreq->dev.user_buf) + true_lb, buf, xfer_sz);
            }
            *buflen = xfer_sz;
            *complete = TRUE;
        }

        /* Trigger OnFinal when receiving the last segment */
        rreq->dev.OnDataAvail = rreq->dev.OnFinal;
        goto fn_exit;
    }

    /* The user buffer is not contiguous or is too small to hold the entire
       message: describe the destination with a segment. */
    rreq->dev.segment_ptr = MPIR_Segment_alloc( );
    MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "MPIR_Segment_alloc");

    MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                      rreq->dev.datatype, rreq->dev.segment_ptr);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size  = xfer_sz;

    if (!whole_msg || *buflen < xfer_sz) {
        /* Truncated, or not everything has arrived yet: build an IOV and let
           the channel unpack. */
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");

        mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                     "**ch3|loadrecviov");
        }
        *buflen = 0;
        *complete = FALSE;
    }
    else {
        /* All of the data is already here and nothing is truncated: unpack
           it now. */
        intptr_t unpacked_end = xfer_sz;

        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying noncontiguous data to user buffer");
        MPIR_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first,
                            &unpacked_end, buf);

        if (unpacked_end == xfer_sz) {
            /* Trigger OnFinal when receiving the last segment */
            rreq->dev.OnDataAvail = rreq->dev.OnFinal;
        }
        /* --BEGIN ERROR HANDLING-- */
        else {
            /* If the data can't be unpacked, then we have a mismatch between
               the datatype and the amount of data received.  Throw away the
               received data. */
            MPIR_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
            MPIR_STATUS_SET_COUNT(rreq->status, rreq->dev.segment_first);
            /* FIXME: Set OnDataAvail to 0?  If not, why not? */
        }
        /* --END ERROR HANDLING-- */

        /* Either way the supplied bytes are consumed and nothing more needs
           to be read. */
        *buflen = xfer_sz;
        *complete = TRUE;
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3U_Post_data_receive_found()
 *
 * Describe where the data of an incoming message should be read to, for a
 * request whose data has not been read yet.  Nothing is copied here; the
 * function only fills rreq->dev.iov / iov_count (directly for a contiguous
 * buffer that holds the whole message, otherwise through a segment and
 * MPIDI_CH3U_Request_load_recv_iov).
 *
 * If the message is larger than the user buffer, a truncation error is
 * recorded in rreq->status and only the part that fits is targeted.
 *
 * Returns MPI_SUCCESS or an MPI error code (segment allocation failure or a
 * failure reported by MPIDI_CH3U_Request_load_recv_iov).
 */
int MPIDI_CH3U_Post_data_receive_found(MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    int is_contig;
    MPI_Aint true_lb;
    intptr_t capacity;
    MPIR_Datatype * dtp = NULL;
    intptr_t xfer_sz;
    int whole_msg;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            is_contig, capacity, dtp, true_lb);

    /* whole_msg: the user buffer can hold the entire message.  When it
       cannot, xfer_sz is clamped to the buffer size, so "whole_msg" is the
       same condition as "xfer_sz == rreq->dev.recv_data_sz". */
    whole_msg = (rreq->dev.recv_data_sz <= capacity);
    if (whole_msg) {
        xfer_sz = rreq->dev.recv_data_sz;
    }
    else {
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "receive buffer too small; message truncated, msg_sz=%" PRIdPTR ", userbuf_sz=%"
                                        PRIdPTR,
                                 rreq->dev.recv_data_sz, capacity));
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
                     "**truncate", "**truncate %d %d %d %d",
                     rreq->status.MPI_SOURCE, rreq->status.MPI_TAG,
                     rreq->dev.recv_data_sz, capacity );
        MPIR_STATUS_SET_COUNT(rreq->status, capacity);
        xfer_sz = capacity;
    }

    if (is_contig && whole_msg) {
        /* The user buffer is contiguous and large enough to store the entire
           message.  The data has not been *read* yet; this only describes
           where it is to be read to. */
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");

        rreq->dev.iov[0].MPL_IOV_BUF =
            (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + true_lb);
        rreq->dev.iov[0].MPL_IOV_LEN = xfer_sz;
        rreq->dev.iov_count = 1;

        /* FIXME: We want to set the OnDataAvail to the appropriate
           function, which depends on whether this is an RMA
           request or a pt-to-pt request. */
        rreq->dev.OnDataAvail = 0;
        goto fn_exit;
    }

    /* The user buffer is not contiguous or is too small to hold the entire
       message: go through a segment and let load_recv_iov build the IOV. */
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");

    rreq->dev.segment_ptr = MPIR_Segment_alloc( );
    MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "MPIR_Segment_alloc");

    MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                      rreq->dev.datatype, rreq->dev.segment_ptr);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size  = xfer_sz;

    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                 "**ch3|loadrecviov");
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/*
 * MPID_nem_lmt_dma_start_recv()
 *
 * Start receiving into rreq's user buffer through the knem device, using the
 * cookie handed over by the sender.
 *
 *   vc       - virtual connection; passed on to the completion check and to
 *              the DONE/COOKIE notifications
 *   rreq     - receive request; its user buffer, count and datatype are read
 *              from rreq->dev, and rreq->dev.iov / iov_count are (re)built
 *              here when needed
 *   s_cookie - IOV whose buffer holds exactly one knem_cookie_t (asserted)
 *
 * Flow:
 *   1. Open the knem device on first use.
 *   2. Build the receive IOV: directly for contiguous datatypes, through a
 *      segment plus MPIDI_CH3U_Request_load_recv_iov otherwise.
 *   3. Issue the copy with do_dma_recv().
 *   4. If the copy is already finished when do_dma_recv() returns, run the
 *      completion logic right away and release the status slot.  Otherwise
 *      put the request on the outstanding list so that it is checked later.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_lmt_dma_start_recv(MPIDI_VC_t *vc, MPID_Request *rreq, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int nodma;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype * dt_ptr;
    volatile knem_status_t *status;
    knem_status_t current_status;
    struct lmt_dma_node *node = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    /* MT: this code assumes only one thread can be at this point at a time */
    if (knem_fd < 0) {
        mpi_errno = open_knem_dev();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* find out contig/noncontig, size, and lb for the datatype */
    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* Skip DMA when the device lacks it or the message is below the
       configured threshold. */
    nodma = !knem_has_dma || data_sz < MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD;

    if (dt_contig) {
        /* handle the iov creation ourselves */
        rreq->dev.iov[0].MPID_IOV_BUF = (char *)rreq->dev.user_buf + dt_true_lb;
        rreq->dev.iov[0].MPID_IOV_LEN = data_sz;
        rreq->dev.iov_count = 1;
    }
    else {
        if (rreq->dev.segment_ptr == NULL) {
            /* segment_ptr may be non-null when this is a continuation of a
               many-part message that we couldn't fit in one single flight of
               iovs; in that case the segment and IOV set up earlier are left
               untouched here. */
            MPIU_Assert(rreq->dev.segment_ptr == NULL);
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                              rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            /* see load_send_iov FIXME above */
            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }

    MPIU_Assert(s_cookie.MPID_IOV_LEN == sizeof(knem_cookie_t));
    MPIU_Assert(s_cookie.MPID_IOV_BUF != NULL);

    /* On return, status points at the status slot used for this copy and
       current_status holds the state reported for it. */
    mpi_errno = do_dma_recv(rreq->dev.iov_count, rreq->dev.iov,
                            *((knem_cookie_t *)s_cookie.MPID_IOV_BUF), nodma,
                            &status, &current_status);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* TODO refactor this block and MPID_nem_lmt_dma_progress (and anywhere
     * else) to share a common function.  This advancement/completion code is
     * duplication. */
    if (current_status != KNEM_STATUS_PENDING) {
        /* The copy has already finished (or failed): complete the request if
           all data has been received. */
        int complete = 0;

        MPIU_ERR_CHKANDJUMP1(current_status == KNEM_STATUS_FAILED, mpi_errno, MPI_ERR_OTHER,
                             "**recv_status", "**recv_status %d", current_status);

        mpi_errno = check_req_complete(vc, rreq, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        free_status_index(status - knem_status);

        if (complete) {
            /* request was completed by the OnDataAvail fn */
            MPID_nem_lmt_send_DONE(vc, rreq); /* tell the other side to complete its request */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
        }
        else {
            /* There is more data to send.  We must inform the sender that we
               have completely received the current batch and that the next
               batch should be sent. */
            MPID_nem_lmt_send_COOKIE(vc, rreq, NULL, 0);
        }

        /* This batch has been fully handled and its status slot has been
           released, so the request must not be put on the outstanding list:
           the list node would keep a pointer to a released slot and the
           request would be looked at a second time. */
        goto fn_exit;
    }

    /* The copy is still pending: push the request so that its status is
       checked later. */
    node = MPIU_Malloc(sizeof(struct lmt_dma_node));
    MPIU_ERR_CHKANDJUMP1((node == NULL), mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "lmt_dma_node");
    node->vc = vc;
    node->req = rreq;
    node->status_p = status;
    node->next = outstanding_head;
    outstanding_head = node;
    ++MPID_nem_local_lmt_pending;

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}