Example #1
0
int MPIDI_CH3U_Request_load_recv_iov(MPID_Request * const rreq)
{
    MPI_Aint last;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);
    if (rreq->dev.segment_first < rreq->dev.segment_size)
    {
	/* still reading data that needs to go into the user buffer */
	
	if (MPIDI_Request_get_type(rreq) != MPIDI_REQUEST_TYPE_ACCUM_RECV &&
            MPIDI_Request_get_type(rreq) != MPIDI_REQUEST_TYPE_GET_ACCUM_RECV &&
            MPIDI_Request_get_srbuf_flag(rreq))
	{
	    MPIDI_msg_sz_t data_sz;
	    MPIDI_msg_sz_t tmpbuf_sz;

	    /* Once a SRBuf is in use, we continue to use it since a small 
	       amount of data may already be present at the beginning
	       of the buffer.  This data is left over from the previous unpack,
	       most like a result of alignment issues.  NOTE: we
	       could force the use of the SRBuf only 
	       when (rreq->dev.tmpbuf_off > 0)... */
	    
	    data_sz = rreq->dev.segment_size - rreq->dev.segment_first - 
		rreq->dev.tmpbuf_off;
	    MPIU_Assert(data_sz > 0);
	    tmpbuf_sz = rreq->dev.tmpbuf_sz - rreq->dev.tmpbuf_off;
	    if (data_sz > tmpbuf_sz)
	    {
		data_sz = tmpbuf_sz;
	    }
	    rreq->dev.iov[0].MPID_IOV_BUF = 
		(MPID_IOV_BUF_CAST)((char *) rreq->dev.tmpbuf + 
				    rreq->dev.tmpbuf_off);
	    rreq->dev.iov[0].MPID_IOV_LEN = data_sz;
            rreq->dev.iov_offset = 0;
	    rreq->dev.iov_count = 1;
	    MPIU_Assert(rreq->dev.segment_first + data_sz + 
			rreq->dev.tmpbuf_off <= rreq->dev.recv_data_sz);
	    if (rreq->dev.segment_first + data_sz + rreq->dev.tmpbuf_off == 
		rreq->dev.recv_data_sz)
	    {
		MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
		  "updating rreq to read the remaining data into the SRBuf");
		rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackSRBufComplete;
	    }
	    else
	    {
		MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
		       "updating rreq to read more data into the SRBuf");
		rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV;
	    }
	    goto fn_exit;
	}
	
	last = rreq->dev.segment_size;
	rreq->dev.iov_count = MPID_IOV_LIMIT;
	rreq->dev.iov_offset = 0;
	MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
   "pre-upv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
			  rreq->dev.segment_first, last, rreq->dev.iov_count));
	MPIU_Assert(rreq->dev.segment_first < last);
	MPIU_Assert(last > 0);
	MPID_Segment_unpack_vector(rreq->dev.segment_ptr, 
				   rreq->dev.segment_first,
				   &last, &rreq->dev.iov[0], &rreq->dev.iov_count);
	MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
   "post-upv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d, iov_offset=%lld",
			  rreq->dev.segment_first, last, rreq->dev.iov_count, (long long)rreq->dev.iov_offset));
	MPIU_Assert(rreq->dev.iov_count >= 0 && rreq->dev.iov_count <= 
		    MPID_IOV_LIMIT);

	/* --BEGIN ERROR HANDLING-- */
	if (rreq->dev.iov_count == 0)
	{
	    /* If the data can't be unpacked, the we have a mis-match between
	       the datatype and the amount of data received.  Adjust
	       the segment info so that the remaining data is received and 
	       thrown away. */
	    rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, 
		       MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE,
		       "**dtypemismatch", 0);
            MPIR_STATUS_SET_COUNT(rreq->status, rreq->dev.segment_first);
	    rreq->dev.segment_size = rreq->dev.segment_first;
	    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
	    goto fn_exit;
	}
        else
        {
            MPIU_Assert(rreq->dev.iov_offset < rreq->dev.iov_count);
        }
	/* --END ERROR HANDLING-- */

	if (last == rreq->dev.recv_data_sz)
	{
	    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
     "updating rreq to read the remaining data directly into the user buffer");
	    /* Eventually, use OnFinal for this instead */
	    rreq->dev.OnDataAvail = rreq->dev.OnFinal;
	}
	else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV ||
                 MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV ||
                 (last == rreq->dev.segment_size ||
                  (last - rreq->dev.segment_first) / rreq->dev.iov_count >= MPIDI_IOV_DENSITY_MIN))
	{
	    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
	     "updating rreq to read more data directly into the user buffer");
	    rreq->dev.segment_first = last;
	    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReloadIOV;
	}
	else
	{
	    /* Too little data would have been received using an IOV.  
	       We will start receiving data into a SRBuf and unpacking it
	       later. */
	    MPIU_Assert(MPIDI_Request_get_srbuf_flag(rreq) == FALSE);
	    
	    MPIDI_CH3U_SRBuf_alloc(rreq, 
			    rreq->dev.segment_size - rreq->dev.segment_first);
	    rreq->dev.tmpbuf_off = 0;
	    /* --BEGIN ERROR HANDLING-- */
	    if (rreq->dev.tmpbuf_sz == 0)
	    {
		/* FIXME - we should drain the data off the pipe here, but we 
		   don't have a buffer to drain it into.  should this be
		   a fatal error? */
		MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"SRBuf allocation failure");
		mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, 
			      FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 
			 "**nomem %d", 
			 rreq->dev.segment_size - rreq->dev.segment_first);
		rreq->status.MPI_ERROR = mpi_errno;
		goto fn_exit;
	    }
	    /* --END ERROR HANDLING-- */

	    /* fill in the IOV using a recursive call */
	    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
	}
    }
    else
    {
	/* receive and toss any extra data that does not fit in the user's 
	   buffer */
	MPIDI_msg_sz_t data_sz;

	data_sz = rreq->dev.recv_data_sz - rreq->dev.segment_first;
	if (!MPIDI_Request_get_srbuf_flag(rreq))
	{
	    MPIDI_CH3U_SRBuf_alloc(rreq, data_sz);
	    /* --BEGIN ERROR HANDLING-- */
	    if (rreq->dev.tmpbuf_sz == 0)
	    {
		MPIU_DBG_MSG(CH3_CHANNEL,TYPICAL,"SRBuf allocation failure");
		mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, 
			       FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
		rreq->status.MPI_ERROR = mpi_errno;
		goto fn_exit;
	    }
	    /* --END ERROR HANDLING-- */
	}

	if (data_sz <= rreq->dev.tmpbuf_sz)
	{
	    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
	    "updating rreq to read overflow data into the SRBuf and complete");
	    rreq->dev.iov[0].MPID_IOV_LEN = data_sz;
	    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
	    /* Eventually, use OnFinal for this instead */
	    rreq->dev.OnDataAvail = rreq->dev.OnFinal;
	}
	else
	{
	    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
	  "updating rreq to read overflow data into the SRBuf and reload IOV");
	    rreq->dev.iov[0].MPID_IOV_LEN = rreq->dev.tmpbuf_sz;
	    rreq->dev.segment_first += rreq->dev.tmpbuf_sz;
	    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReloadIOV;
	}
	
	rreq->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rreq->dev.tmpbuf;
	rreq->dev.iov_count = 1;
    }
    
  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_RECV_IOV);
    return mpi_errno;
}
Example #2
0
int MPID_nem_newmad_process_rdtype(MPID_Request **rreq_p, MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, struct iovec *newmad_iov[], int *num_iov)
{
    MPID_Request  *rreq      = *rreq_p;
    MPIDI_msg_sz_t last;
    MPID_IOV      *iov;
    int            n_iov     = 0;
    int            mpi_errno = MPI_SUCCESS;
    int            index;
    
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);

    if (rreq->dev.segment_ptr == NULL)
    {
	rreq->dev.segment_ptr = MPID_Segment_alloc( );
	MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
    }
    MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = data_sz;
    last = rreq->dev.segment_size;

    MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr,rreq->dev.segment_first,&last,&n_iov);
    MPIU_Assert(n_iov > 0);
    iov = MPIU_Malloc(n_iov*sizeof(MPID_IOV));

    MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,iov, &n_iov);
    MPIU_Assert(last == rreq->dev.segment_size);

#ifdef DEBUG
    for(index = 0; index < n_iov ; index++)
	{
	    fprintf(stdout,"======================\n");
	    fprintf(stdout,"RECV iov[%i]: [base %p][len %i]\n",index,
		    iov[index].MPID_IOV_BUF,iov[index].MPID_IOV_LEN);
	}
#endif

    if(n_iov <= NMAD_IOV_MAX_DEPTH) 
    {
	for(index=0; index < n_iov ; index++)
	{
	    (*newmad_iov)[index].iov_base = iov[index].MPID_IOV_BUF;
	    (*newmad_iov)[index].iov_len  = iov[index].MPID_IOV_LEN;
	}
	rreq->dev.tmpbuf = NULL;
	*num_iov = n_iov;
    }
    else
    {
	int packsize = 0;
	MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize);
	rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
	MPIU_Assert(rreq->dev.tmpbuf);
	rreq->dev.tmpbuf_sz = packsize;
	(*newmad_iov)[0].iov_base = (char *)  rreq->dev.tmpbuf;
	(*newmad_iov)[0].iov_len  = (uint32_t) packsize;
	*num_iov = 1 ;
    }
    MPIU_Free(iov);
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);
    return mpi_errno;
 fn_fail:  ATTRIBUTE((unused))
    goto fn_exit;
}
Example #3
0
static int _mxm_handle_rreq(MPID_Request * req)
{
    int complete = FALSE, found = FALSE;
    int dt_contig;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDI_msg_sz_t userbuf_sz;
    MPID_Datatype *dt_ptr;
    MPIDI_msg_sz_t data_sz;
    MPID_nem_mxm_vc_area *vc_area ATTRIBUTE((unused)) = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;
    void *tmp_buf = NULL;

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_MSGQ_MUTEX);
    found = MPIDI_CH3U_Recvq_DP(req);
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_MSGQ_MUTEX);
    /* an MPI_ANY_SOURCE request may have been previously removed from the
     * CH3 queue by an FDP (find and dequeue posted) operation */
    if (req->dev.match.parts.rank != MPI_ANY_SOURCE) {
        MPIU_Assert(found);
    }

    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, userbuf_sz, dt_ptr,
                            dt_true_lb);

    vc_area = VC_BASE(req->ch.vc);
    req_area = REQ_BASE(req);

    _dbg_mxm_out_buf(req_area->iov_buf[0].ptr,
                     (req_area->iov_buf[0].length > 16 ? 16 : req_area->iov_buf[0].length));

    if (req->dev.recv_data_sz <= userbuf_sz) {
        data_sz = req->dev.recv_data_sz;
        if (req->status.MPI_ERROR == MPI_ERR_TRUNCATE) {
            req->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                                         MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                                         MPI_ERR_TRUNCATE, "**truncate",
                                                         "**truncate %d %d %d %d",
                                                         req->status.MPI_SOURCE,
                                                         req->status.MPI_TAG, req->dev.recv_data_sz,
                                                         userbuf_sz);
        }
    }
    else {
        data_sz = userbuf_sz;
        MPIR_STATUS_SET_COUNT(req->status, userbuf_sz);
        MPIU_DBG_MSG_FMT(CH3_OTHER, VERBOSE, (MPIU_DBG_FDEST,
                                              "receive buffer too small; message truncated, msg_sz="
                                              MPIDI_MSG_SZ_FMT ", userbuf_sz="
                                              MPIDI_MSG_SZ_FMT, req->dev.recv_data_sz, userbuf_sz));
        req->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                                     MPI_ERR_TRUNCATE, "**truncate",
                                                     "**truncate %d %d %d %d",
                                                     req->status.MPI_SOURCE, req->status.MPI_TAG,
                                                     req->dev.recv_data_sz, userbuf_sz);
    }

    if (!dt_contig) {
        MPIDI_msg_sz_t last = 0;

        if (req->dev.tmpbuf != NULL) {
            last = req->dev.recv_data_sz;
            MPID_Segment_unpack(req->dev.segment_ptr, 0, &last, req->dev.tmpbuf);
            tmp_buf = req->dev.tmpbuf;
        }
        else {
            mxm_req_buffer_t *iov_buf;
            MPL_IOV *iov;
            int n_iov = 0;
            int index;

            last = req->dev.recv_data_sz;
            n_iov = req_area->iov_count;
            iov_buf = req_area->iov_buf;
            if (last && n_iov > 0) {
                iov = MPIU_Malloc(n_iov * sizeof(*iov));
                MPIU_Assert(iov);

                for (index = 0; index < n_iov; index++) {
                    iov[index].MPL_IOV_BUF = iov_buf[index].ptr;
                    iov[index].MPL_IOV_LEN = iov_buf[index].length;
                }

                MPID_Segment_unpack_vector(req->dev.segment_ptr, req->dev.segment_first, &last, iov,
                                           &n_iov);
                MPIU_Free(iov);
            }
            if (req_area->iov_count > MXM_MPICH_MAX_IOV) {
                tmp_buf = req_area->iov_buf;
                req_area->iov_buf = req_area->tmp_buf;
                req_area->iov_count = 0;
            }
        }
        if (last != data_sz) {
            MPIR_STATUS_SET_COUNT(req->status, last);
            if (req->dev.recv_data_sz <= userbuf_sz) {
                /* If the data can't be unpacked, the we have a
                 *  mismatch between the datatype and the amount of
                 *  data received.  Throw away received data.
                 */
                MPIR_ERR_SETSIMPLE(req->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
            }
        }
    }

    MPIDI_CH3U_Handle_recv_req(req->ch.vc, req, &complete);
    MPIU_Assert(complete == TRUE);

    if (tmp_buf)
        MPIU_Free(tmp_buf);

    return complete;
}
Example #4
0
static int _mxm_process_rdtype(MPID_Request ** rreq_p, MPI_Datatype datatype,
                               MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, const void *buf,
                               int count, mxm_req_buffer_t ** iov_buf, int *iov_count)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *rreq = *rreq_p;
    MPIDI_msg_sz_t last;
    MPL_IOV *iov;
    int n_iov = 0;
    int index;

    if (rreq->dev.segment_ptr == NULL) {
        rreq->dev.segment_ptr = MPID_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
                             "**nomem %s", "MPID_Segment_alloc");
    }
    MPID_Segment_init(buf, count, datatype, rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = data_sz;

    last = rreq->dev.segment_size;
    MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,
                                     (MPI_Aint *) & n_iov);
    MPIU_Assert(n_iov > 0);
    iov = MPIU_Malloc(n_iov * sizeof(*iov));
    MPIU_Assert(iov);

    last = rreq->dev.segment_size;
    MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, iov, &n_iov);
    MPIU_Assert(last == rreq->dev.segment_size);

#if defined(MXM_DEBUG) && (MXM_DEBUG > 0)
    _dbg_mxm_output(7, "Recv Noncontiguous data vector %i entries (free slots : %i)\n", n_iov,
                    MXM_REQ_DATA_MAX_IOV);
    for (index = 0; index < n_iov; index++) {
        _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n",
                        index, iov[index].MPL_IOV_BUF, iov[index].MPL_IOV_LEN);
    }
#endif

    if (n_iov <= MXM_REQ_DATA_MAX_IOV) {
        if (n_iov > MXM_MPICH_MAX_IOV) {
            *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf));
            MPIU_Assert(*iov_buf);
        }

        for (index = 0; index < n_iov; index++) {
            (*iov_buf)[index].ptr = iov[index].MPL_IOV_BUF;
            (*iov_buf)[index].length = iov[index].MPL_IOV_LEN;
        }
        rreq->dev.tmpbuf = NULL;
        rreq->dev.tmpbuf_sz = 0;
        *iov_count = n_iov;
    }
    else {
        MPI_Aint packsize = 0;
        MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize);
        rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
        MPIU_Assert(rreq->dev.tmpbuf);
        rreq->dev.tmpbuf_sz = packsize;
        (*iov_buf)[0].ptr = rreq->dev.tmpbuf;
        (*iov_buf)[0].length = (size_t) packsize;
        *iov_count = 1;
    }
    MPIU_Free(iov);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Example #5
0
int MPID_nem_tcp_module_lmt_start_recv (MPIDI_VC_t *vc, MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int ret;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPIDI_msg_sz_t last;
    int nb;
    int r_len;
    MPIDI_CH3I_VC *vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);

    free_cookie (vc_ch->net.tcp.lmt_cookie);

    if (!vc_ch->net.tcp.lmt_connected)
    {
        int len;
        struct sockaddr_in saddr;
        int connfd;

        len = sizeof (saddr);
        connfd = accept (vc_ch->net.tcp.lmt_desc, (struct sockaddr *)&saddr, &len);
        MPIU_ERR_CHKANDJUMP2 (connfd == -1, mpi_errno, MPI_ERR_OTHER, "**sock|poll|accept", "**sock|poll|accept %d %s", errno, strerror (errno));

        /* close listen fd */
        do
            ret = close (vc_ch->net.tcp.lmt_desc);
        while (ret == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**closesocket", "**closesocket %s %d", strerror (errno), errno);

        /* set lmt_desc to new connected fd */
        vc_ch->net.tcp.lmt_desc = connfd;
        vc_ch->net.tcp.lmt_connected = 1;

        //        ret = fcntl (vc_ch->net.tcp.lmt_desc, F_SETFL, O_NONBLOCK);
        //        MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
    }

    MPIDI_Datatype_get_info (req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    if (data_sz > vc_ch->net.tcp.lmt_s_len)
    {
        data_sz = vc_ch->net.tcp.lmt_s_len;
    }
    else if (data_sz < vc_ch->net.tcp.lmt_s_len)
    {
        /* message will be truncated */
        r_len = data_sz;
 	req->status.MPI_ERROR = MPIU_ERR_SET2 (mpi_errno, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", vc_ch->net.tcp.lmt_s_len, r_len);
    }

    MPID_Segment_init (req->dev.user_buf, req->dev.user_count, req->dev.datatype, &req->dev.segment, 0);
    req->dev.segment_first = 0;
    req->dev.segment_size = data_sz;
    req->dev.iov_count = MPID_IOV_LIMIT;
    req->dev.iov_offset = 0;
    last = data_sz;

    do
    {
        int iov_offset;
        int left_to_recv;

        MPID_Segment_unpack_vector (&req->dev.segment, req->dev.segment_first, &last, req->dev.iov, &req->dev.iov_count);

        left_to_recv = last - req->dev.segment_first;
        iov_offset = 0;

#ifdef TESTING_CHUNKING
        {
            char *buf = req->dev.iov[0].MPID_IOV_BUF;
            int l;
            while (left_to_recv)
            {
                if (left_to_recv > CHUNK)
                    l = CHUNK;
                else
                    l = left_to_recv;

                do
                    nb = read (vc_ch->net.tcp.lmt_desc, buf, l);
                while (nb == -1 && errno == EINTR);
                MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

                left_to_recv -= nb;
                buf += nb;
            }
            MPIDI_CH3U_Request_complete (req);
            goto fn_exit;
        }
#endif

        do
            nb = readv (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
        while (nb == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP2 (nb == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
        MPIU_ERR_CHKANDJUMP (nb == 0, mpi_errno, MPI_ERR_OTHER, "**fail");

        left_to_recv -= nb;
        while (left_to_recv)
        { /* recv rest of iov */
            while (nb >= req->dev.iov[iov_offset].MPID_IOV_LEN)
            { /* update iov to reflect sent bytes */
                nb -= req->dev.iov[iov_offset].MPID_IOV_LEN;
                ++iov_offset;
            }
            req->dev.iov[iov_offset].MPID_IOV_BUF = (char *)req->dev.iov[iov_offset].MPID_IOV_BUF + nb;
            req->dev.iov[iov_offset].MPID_IOV_LEN -= nb;

            do
                nb = readv (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
            while (nb == -1 && errno == EINTR);
            MPIU_ERR_CHKANDJUMP2 (nb == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
            MPIU_ERR_CHKANDJUMP (nb == 0, mpi_errno, MPI_ERR_OTHER, "**fail");
            left_to_recv -= nb;
        }
    }
    while (last < data_sz);

    MPIDI_CH3U_Request_complete (req);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}