コード例 #1
0
ファイル: mx_cancel.c プロジェクト: addy004/mpich2-yarn
int MPID_nem_mx_cancel_send(MPIDI_VC_t *vc, MPID_Request *sreq)
{
    mx_request_t *mx_request = NULL;
    mx_return_t ret;
    uint32_t    result;
    int mpi_errno = MPI_SUCCESS;
    int handled = FALSE;
   
     if (!VC_CH(vc)->is_local)
     {
	mx_request = &(REQ_FIELD(sreq,mx_request));
	ret = mx_cancel(MPID_nem_mx_local_endpoint,mx_request,&result);
	MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_cancel", "**mx_cancel %s", mx_strerror(ret));
	
	if (result)
	{
	   sreq->status.cancelled = TRUE;
	   sreq->cc = 0;
	   MPIU_Object_set_ref(sreq, 1);       
	   MPID_nem_mx_pending_send_req--;
	}
	else
        {	    
	   sreq->status.cancelled = FALSE;
	}
	handled = TRUE;
     }
   
 fn_exit:
    return handled;
 fn_fail:
    goto fn_exit;
}
コード例 #2
0
ファイル: mpid_nem_ckpt.c プロジェクト: addy004/mpich2-yarn
int MPIDI_nem_ckpt_start(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_START);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_START);

    if (checkpointing)
        goto fn_exit;

    checkpointing = TRUE;

    marker_count = MPIDI_Process.my_pg->size - 1; /* We won't receive a marker from ourselves. */

    ++current_wave;

    /* send markers to all other processes */
    /* FIXME: we're only handling processes in our pg, so no dynamic connections */
    for (i = 0; i <  MPIDI_Process.my_pg->size; ++i) {
        MPID_Request *req;
        MPIDI_VC_t *vc;
        MPIDI_CH3I_VC *vc_ch;
        MPID_PKT_DECL_CAST(upkt, MPID_nem_pkt_ckpt_marker_t, ckpt_pkt);

        /* Don't send a marker to ourselves. */
        if (i == MPIDI_Process.my_pg_rank)
            continue;
       
        MPIDI_PG_Get_vc_set_active(MPIDI_Process.my_pg, i, &vc);
        vc_ch = VC_CH(vc);

        MPIDI_Pkt_init(ckpt_pkt, MPIDI_NEM_PKT_CKPT_MARKER);
        ckpt_pkt->wave = current_wave;
        
        mpi_errno = MPIDI_CH3_iStartMsg(vc, ckpt_pkt, sizeof(ckpt_pkt), &req);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ckptpkt");
        if (req != NULL)
        {
            MPIU_ERR_CHKANDJUMP(req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**ckptpkt");
            MPID_Request_release(req);
        }

        if (!vc_ch->is_local) {
            mpi_errno = vc_ch->ckpt_pause_send_vc(vc);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }
    

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_START);
    return mpi_errno;
fn_fail:

    goto fn_exit;
}
コード例 #3
0
ファイル: mx_init.c プロジェクト: OngOngoing/219351_homework
int
MPID_nem_mx_vc_init (MPIDI_VC_t *vc)
{
   uint32_t threshold;
   MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
   int mpi_errno = MPI_SUCCESS;

   /* first make sure that our private fields in the vc fit into the area provided  */
   MPIU_Assert(sizeof(MPID_nem_mx_vc_area) <= MPID_NEM_VC_NETMOD_AREA_LEN);

#ifdef ONDEMAND
   VC_FIELD(vc, local_connected)  = 0;
   VC_FIELD(vc, remote_connected) = 0;
#else
   {
       char *business_card;
       int   val_max_sz;
       int   ret;
#ifdef USE_PMI2_API
       val_max_sz = PMI2_MAX_VALLEN;
#else
       mpi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
#endif 
       business_card = (char *)MPIU_Malloc(val_max_sz); 
       mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card,val_max_sz, vc->pg);
       if (mpi_errno) MPIU_ERR_POP(mpi_errno);
       
       mpi_errno = MPID_nem_mx_get_from_bc (business_card, &VC_FIELD(vc, remote_endpoint_id), &VC_FIELD(vc, remote_nic_id));
       if (mpi_errno)    MPIU_ERR_POP (mpi_errno);

       MPIU_Free(business_card);
       
       ret = mx_connect(MPID_nem_mx_local_endpoint,VC_FIELD(vc, remote_nic_id),VC_FIELD(vc, remote_endpoint_id),
			MPID_NEM_MX_FILTER,MX_INFINITE,&(VC_FIELD(vc, remote_endpoint_addr)));
       MPIU_ERR_CHKANDJUMP1 (ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_connect", "**mx_connect %s", mx_strerror (ret));
       mx_set_endpoint_addr_context(VC_FIELD(vc, remote_endpoint_addr),(void *)vc);

       MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
   }
#endif
   mx_get_info(MPID_nem_mx_local_endpoint, MX_COPY_SEND_MAX, NULL, 0, &threshold, sizeof(uint32_t));

   vc->eager_max_msg_sz = threshold;
   vc->rndvSend_fn      = NULL;
   vc->sendNoncontig_fn = MPID_nem_mx_SendNoncontig;
   vc->comm_ops         = &comm_ops;
 
   vc_ch->iStartContigMsg = MPID_nem_mx_iStartContigMsg;
   vc_ch->iSendContig     = MPID_nem_mx_iSendContig;

 fn_exit:
   return mpi_errno;
 fn_fail:
   goto fn_exit;
}
コード例 #4
0
ファイル: ib_lmt.c プロジェクト: abhinavvishnu/matex
int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);

    dprintf("lmt_done_recv,enter,rreq=%p,head=%p\n", rreq, MPID_nem_ib_lmtq.head);


    int is_contig;
    MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
    if (!is_contig) {
        dprintf("lmt_done_recv,copying noncontiguous data to user buffer\n");

        /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
        /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
        MPIDI_msg_sz_t unpack_sz = rreq->ch.lmt_data_sz;
        MPID_Segment seg;
        MPI_Aint last;

        MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, &seg, 0);
        last = unpack_sz;
        MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(rreq, lmt_pack_buf));
        if (last != unpack_sz) {
            /* --BEGIN ERROR HANDLING-- */
            /* received data was not entirely consumed by unpack()
             * because too few bytes remained to fill the next basic
             * datatype */
            MPIR_STATUS_SET_COUNT(rreq->status, last);
            rreq->status.MPI_ERROR =
                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                     MPI_ERR_TYPE, "**MPID_nem_ib_lmt_done_recv", 0);
            /* --END ERROR HANDLING-- */
        }

        //MPIU_Free(REQ_FIELD(rreq, lmt_pack_buf));
        MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz);
    }

    dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v);
    MPIDI_CH3U_Request_complete(rreq);
    dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
    return mpi_errno;
    //fn_fail:
    goto fn_exit;
}
コード例 #5
0
ファイル: mpid_nem_ckpt.c プロジェクト: addy004/mpich2-yarn
int MPIDI_nem_ckpt_finish(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    int ret;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_FINISH);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_FINISH);

    /* Since we're checkpointing the shared memory region (i.e., the
       channels between local procs), we don't have to flush those
       channels, just make sure no one is sending or receiving during
       the checkpoint */
    mpi_errno = MPID_nem_barrier();
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    ret = sem_post(&ckpt_sem);
    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_post", "**sem_post %s", MPIU_Strerror(errno));

    ret = sem_wait(&cont_sem);
    MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_wait", "**sem_wait %s", MPIU_Strerror(errno));

    mpi_errno = MPID_nem_barrier();
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    if (ckpt_result == CKPT_CONTINUE) {
        for (i = 0; i < MPIDI_Process.my_pg->size; ++i) {
            MPIDI_VC_t *vc;
            MPIDI_CH3I_VC *vc_ch;
            /* We didn't send a marker to ourselves. */
            if (i == MPIDI_Process.my_pg_rank)
                continue;

            MPIDI_PG_Get_vc(MPIDI_Process.my_pg, i, &vc);
            vc_ch = VC_CH(vc);
            if (!vc_ch->is_local) {
                mpi_errno = vc_ch->ckpt_continue_vc(vc);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            }
        }
    }
    
    checkpointing = FALSE;
    
fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_FINISH);
    return mpi_errno;
fn_fail:

    goto fn_exit;
}
コード例 #6
0
ファイル: newmad_init.c プロジェクト: addy004/mpich2-yarn
int
MPID_nem_newmad_vc_init (MPIDI_VC_t *vc)
{
   MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
   char          *business_card;
   int            mpi_errno = MPI_SUCCESS;   
   int            val_max_sz;
   int            ret;
   
#ifdef USE_PMI2_API
   val_max_sz = PMI2_MAX_VALLEN;
#else
   mpi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
#endif
   business_card = (char *)MPIU_Malloc(val_max_sz);   
   mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card,val_max_sz,vc->pg);
   if (mpi_errno) MPIU_ERR_POP(mpi_errno);
   
   /* Very important */
   memset(VC_FIELD(vc, url),0,MPID_NEM_NMAD_MAX_SIZE);
   
   mpi_errno = MPID_nem_newmad_get_from_bc (business_card, VC_FIELD(vc, url));
   if (mpi_errno) MPIU_ERR_POP (mpi_errno);

   MPIU_Free(business_card);
   
   ret = nm_session_connect(mpid_nem_newmad_session, &(VC_FIELD(vc,p_gate)), VC_FIELD(vc, url));
   if (ret != NM_ESUCCESS) fprintf(stdout,"nm_session_connect returned ret = %d\n", ret);

   nm_gate_ref_set(VC_FIELD(vc, p_gate),(void*)vc);
   MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
   
   vc->eager_max_msg_sz = 32768;
   vc->rndvSend_fn      = NULL;
   vc->sendNoncontig_fn = MPID_nem_newmad_SendNoncontig;
   vc->comm_ops         = &comm_ops;

   vc_ch->iStartContigMsg = MPID_nem_newmad_iStartContigMsg;
   vc_ch->iSendContig     = MPID_nem_newmad_iSendContig;
   
 fn_exit:
   return mpi_errno;
 fn_fail:
   goto fn_exit;
}
コード例 #7
0
ファイル: newmad_poll.c プロジェクト: addy004/mpich2-yarn
int MPID_nem_newmad_directRecv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;    
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);    
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);    
    
    if (!VC_CH(vc)->is_local)
    {
	nm_tag_t          match_info = 0; 
	nm_tag_t          match_mask = NEM_NMAD_MATCH_FULL_MASK; 	    
	MPIR_Rank_t       source     = rreq->dev.match.parts.rank;
	MPIR_Context_id_t context    = rreq->dev.match.parts.context_id;
	Nmad_Nem_tag_t    tag        = rreq->dev.match.parts.tag;
	int               ret;
	MPIDI_msg_sz_t    data_sz;
	int               dt_contig;
	MPI_Aint          dt_true_lb;
	MPID_Datatype    *dt_ptr;
	
	NEM_NMAD_DIRECT_MATCH(match_info,0,source,context);
	if (tag != MPI_ANY_TAG)
	{
	    NEM_NMAD_SET_TAG(match_info,tag);
	}
	else
	{
	    NEM_NMAD_SET_ANYTAG(match_info);
	    NEM_NMAD_SET_ANYTAG(match_mask);
	}

#ifdef DEBUG
	fprintf(stdout,"========> Posting Recv req  %p (match is %lx) \n",rreq,match_info);
#endif
	MPIDI_Datatype_get_info(rreq->dev.user_count,rreq->dev.datatype, dt_contig, data_sz, dt_ptr,dt_true_lb);
	rreq->dev.OnDataAvail = NULL;

	if (dt_contig)
	{
	    ret = nm_sr_irecv_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
					      (char*)(rreq->dev.user_buf) + dt_true_lb,data_sz,
					      &(REQ_FIELD(rreq,newmad_req)),(void*)rreq);
	    REQ_FIELD(rreq,iov) = NULL;
	}
	else
	{
	    int           num_seg        = 0;
	    struct iovec *newmad_iov     = (struct iovec *)MPIU_Malloc(NMAD_IOV_MAX_DEPTH*sizeof(struct iovec));	    
	    struct iovec *newmad_iov_ptr = &(newmad_iov[0]); 
	    MPID_nem_newmad_process_rdtype(&rreq,dt_ptr,data_sz,&newmad_iov_ptr,&num_seg);
	    MPIU_Assert(num_seg <= NMAD_IOV_MAX_DEPTH);
#ifdef DEBUG
	    {
		int index;
		for(index = 0; index < num_seg ; index++)
		    {
			fprintf(stdout,"======================\n");
			fprintf(stdout,"RECV nmad_iov[%i]: [base %p][len %i]\n",index,
				newmad_iov[index].iov_base,newmad_iov[index].iov_len);
		    }
	    }
#endif
	    ret = nm_sr_irecv_iov_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
						  newmad_iov,num_seg,&(REQ_FIELD(rreq,newmad_req)),(void*)rreq);	
	    REQ_FIELD(rreq,iov) = newmad_iov;
	}
    }
    else
    {
	/* Fixme : this might not work in the case of multiple netmods */ 
	memset((&(REQ_FIELD(rreq,newmad_req))),0,sizeof(nm_sr_request_t));
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);
    return mpi_errno;
 fn_fail:  ATTRIBUTE((unused))
    goto fn_exit;
}
コード例 #8
0
ファイル: ib_lmt.c プロジェクト: abhinavvishnu/matex
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
                                 struct MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);

    dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* obtain dt_true_lb */
    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */
    MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
        (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t));

    /* remember address to "free" when receiving DONE from receiver */
    req->ch.s_cookie = s_cookie_buf;

    /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */
    //assert(dt_true_lb == 0);
    void *write_from_buf;
    if (dt_contig) {
        write_from_buf = req->dev.user_buf;
    }
    else {
        /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");

        MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
                          req->dev.segment_ptr, 0);
        req->dev.segment_first = 0;
        req->dev.segment_size = data_sz;

        MPIDI_msg_sz_t last;
        last = req->dev.segment_size;   /* segment_size is byte offset */
        MPIU_Assert(last > 0);
        REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last,
                          (char *) (REQ_FIELD(req, lmt_pack_buf)));
        MPIU_Assert(last == req->dev.segment_size);
        write_from_buf = REQ_FIELD(req, lmt_pack_buf);
    }
    dprintf
        ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
         dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf));

#ifdef HAVE_LIBDCFA
#else
    s_cookie_buf->addr = write_from_buf;
#endif
    /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
    /* TODO remove sz field
     * /* pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
     * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */
    //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz;

    /* preserve and put tail, because tail magic is written on the tail of payload
     * because we don't want to add another SGE or RDMA command */
    MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz);
    s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
    /* prepare magic */
    //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;

#if 1   /* embed RDMA-write-to buffer occupancy information */
    dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
    /* embed RDMA-write-to buffer occupancy information */
    s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;

    /* remember the last one sent */
    vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail;
#endif

    /* put IB rkey */
    struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz);
    MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
#ifdef HAVE_LIBDCFA
    s_cookie_buf->addr = (void *) mr->host_addr;
    dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr);
#endif
    s_cookie_buf->rkey = mr->rkey;
    dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n",
            s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t),
            *((uint8_t *) (write_from_buf + data_sz - sizeof(uint8_t))), data_sz,
            s_cookie_buf->addr, s_cookie_buf->rkey);
    /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */
    MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf,
                          sizeof(MPID_nem_ib_lmt_cookie_t));

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
コード例 #9
0
ファイル: ib_lmt.c プロジェクト: abhinavvishnu/matex
int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int ibcom_errno;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);

    dprintf("lmt_start_recv,enter,%d<-%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* obtain dt_true_lb */
    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    MPID_nem_ib_lmt_cookie_t *s_cookie_buf = s_cookie.iov_base;

    /* stash vc for ib_poll */
    req->ch.vc = vc;

    void *write_to_buf;
    if (dt_contig) {
        write_to_buf = (void *) ((char *) req->dev.user_buf /*+ REQ_FIELD(req, lmt_dt_true_lb) */);
    }
    else {
        //REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t)req->ch.lmt_data_sz);
        REQ_FIELD(req, lmt_pack_buf) = MPID_nem_ib_stmalloc((size_t) req->ch.lmt_data_sz);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        write_to_buf = REQ_FIELD(req, lmt_pack_buf);
    }

#ifdef MPID_NEM_IB_LMT_GET_CQE
#else
    /* unmark magic */
    *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))) = ~s_cookie_buf->tail;        /* size in cookie was not set */
#endif
    dprintf
        ("lmt_start_recv,dt_contig=%d,write_to_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p,marked-tail=%02x,unmarked-tail=%02x\n",
         dt_contig, write_to_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf),
         s_cookie_buf->tail, *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));

    /* stash tail for poll because do_cts in mpid_nem_lmt.c free s_cookie_buf just after this function */
    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
    dprintf("lmt_start_recv,mem-tail=%p,%02x\n",
            write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
            *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));

    //dprintf("lmt_start_recv,sendq_empty=%d,ncom=%d,ncqe=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY, MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);

    /* try to issue RDMA-read command */
    int slack = 1;              /* slack for control packet bringing sequence number */
    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, write_to_buf); /* fast path not storing raddr and rkey */
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        /* enqueue command into send_queue */
        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
                MPID_nem_ib_sendq_empty(vc_ib->sendq),
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);

        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;

        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
    }

    /* extract embeded RDMA-write-to buffer occupancy information */
    dprintf("lmt_start_recv,old lsr_seq_num=%d,s_cookie_buf->seq_num_tail=%d\n",
            vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
    vc_ib->ibcom->lsr_seq_num_tail =
        MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
    //dprintf("lmt_start_recv,new lsr_seq_num=%d\n", vc_ib->ibcom->lsr_seq_num_tail);

#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
    /* change remote notification policy of RDMA-write-to buf */
    //dprintf("lmt_start_recv,reply_seq_num,old rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
    //dprintf("lmt_start_recv,reply_seq_num,new rstate=%d\n", vc_ib->ibcom->rdmabuf_occupancy_notify_rstate);
#endif
    //dprintf("lmt_start_recv,reply_seq_num,sendq_empty=%d,ncom=%d,ncqe=%d,rdmabuf_occ=%d\n", MPID_nem_ib_sendq_empty(vc_ib->sendq), vc_ib->ibcom->ncom, MPID_nem_ib_ncqe, MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num, vc_ib->ibcom->lsr_seq_num_tail));
    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
    //dprintf("lmt_start_recv,reply_seq_num,send_progress\n");
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
        dprintf("lmt_start_recv,ncom=%d,ncqe=%d,diff=%d\n",
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
                MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
                                   vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG);
    }
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
        dprintf("lmt_start_recv,send_progress\n");
        fflush(stdout);
        MPID_nem_ib_send_progress(vc_ib);
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}