예제 #1
0
/*
 * Completion handler for a newmad send request.
 *
 * If the request has no OnDataAvail handler it is completed directly;
 * otherwise the handler is called and must report completion (asserted).
 * Afterwards the netmod-private iovec attached to the request, if any, is
 * freed and the pending-send counter is decremented.
 *
 * The iovec pointer is read *before* the request is completed.  Completing
 * the request (directly or through the handler) presumably drops a reference
 * and may release the request object, so the request is not dereferenced
 * again once completion has been triggered.
 */
void
MPID_nem_newmad_handle_sreq(MPID_Request *req)
{
    int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
    void *iov = REQ_FIELD(req,iov);   /* captured while req is known to be valid */
#ifdef DEBUG
    fprintf(stdout,"========> Completing Send req  %p \n",req);
#endif
    reqFn = req->dev.OnDataAvail;
    if (!reqFn) {
        MPIDI_CH3U_Request_complete(req);
        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
    }
    else {
        MPIDI_VC_t *vc = req->ch.vc;
        int complete   = 0;

        reqFn(vc, req, &complete);
        /* a send handled by this netmod is expected to finish in one step */
        MPIU_Assert(complete == TRUE);
    }

    if (iov != NULL)
        MPIU_Free(iov);
    mpid_nem_newmad_pending_send_req--;
}
예제 #2
0
/*
 * Finish the receive side of an LMT transfer.
 *
 * When the receive datatype is not contiguous, the bytes held in the
 * request's lmt_pack_buf are unpacked into the user buffer and the pack
 * buffer is returned with MPID_nem_ib_stfree().  If the unpack does not
 * consume all rreq->ch.lmt_data_sz bytes, the status count is set to the
 * number consumed and an MPI_ERR_TYPE error code is stored in the status.
 * The request is then completed.
 *
 * vc is not used.  Always returns MPI_SUCCESS; a datatype mismatch is
 * reported through rreq->status only.
 */
int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);

    dprintf("lmt_done_recv,enter,rreq=%p,head=%p\n", rreq, MPID_nem_ib_lmtq.head);

    int contiguous;
    MPID_Datatype_is_contig(rreq->dev.datatype, &contiguous);

    if (!contiguous) {
        /* modelled on MPIDI_CH3U_Request_unpack_uebuf (ch3u_request.c) and
         * MPIDI_CH3U_Receive_data_found (ch3u_handle_recv_pkt.c) */
        MPIDI_msg_sz_t expected = rreq->ch.lmt_data_sz;
        MPI_Aint consumed = expected;
        MPID_Segment segment;

        dprintf("lmt_done_recv,copying noncontiguous data to user buffer\n");

        MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                          &segment, 0);
        MPID_Segment_unpack(&segment, 0, &consumed, REQ_FIELD(rreq, lmt_pack_buf));

        if (consumed != expected) {
            /* --BEGIN ERROR HANDLING-- */
            /* unpack() left some received bytes unused because too few
             * remained to fill the next basic datatype */
            MPIR_STATUS_SET_COUNT(rreq->status, consumed);
            rreq->status.MPI_ERROR =
                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                     MPI_ERR_TYPE, "**MPID_nem_ib_lmt_done_recv", 0);
            /* --END ERROR HANDLING-- */
        }

        /* the pack buffer is released with the size it is tracked under */
        MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz);
    }

    dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v);
    MPIDI_CH3U_Request_complete(rreq);
    dprintf("lmt_done_recv,complete,req=%p\n", rreq);
    dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);

    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
    return mpi_errno;
}
예제 #3
0
/*
 * Called when an any-source request in the posted receive queue has been
 * matched and dequeued.
 *
 * Looks up the newmad request associated with rreq.  If one exists, an
 * attempt is made to cancel it:
 *   - cancel succeeded: the request's segment and iovec are released and
 *     FALSE is returned;
 *   - cancel failed: the newmad receive is waited for, its tag and size are
 *     fetched and the request is finished through
 *     MPID_nem_newmad_handle_rreq(); TRUE is returned.
 * In both cases the newmad request object itself is freed.
 *
 * Returns FALSE when no newmad request is associated with rreq.
 */
int MPID_nem_newmad_anysource_matched(MPID_Request *rreq)
{
    nm_sr_request_t *pending = NULL;
    int was_matched = FALSE;
    int rc;

#ifdef DEBUG
    fprintf(stdout,"========> Any Source : MPID_nem_newmad_anysource_matched , req is %p\n",rreq);
#endif

    MPID_NEM_NMAD_GET_REQ_FROM_HASH(rreq,pending);
    if (pending == NULL)
        return was_matched;

#ifdef DEBUG
    fprintf(stdout,"========> Any Source nmad req found :%p \n",pending);
#endif

    rc = nm_sr_rcancel(mpid_nem_newmad_session,pending);
    if (rc == NM_ESUCCESS)
    {
        /* newmad never delivered into this request: drop its resources */
        MPID_Segment_free(rreq->dev.segment_ptr);
        if (REQ_FIELD(rreq,iov) != NULL)
            MPIU_Free(REQ_FIELD(rreq,iov));
    }
    else
    {
#ifdef DEBUG
        fprintf(stdout,"========> Any Source nmad req (%p) not cancelled \n",pending);
#endif
        size_t   msg_size;
        nm_tag_t msg_tag;

        MPIU_Assert(MPIDI_Request_get_type(rreq) != MPIDI_REQUEST_TYPE_GET_RESP);

        /* the cancel was refused, so wait for the receive and finish it here */
        rc = nm_sr_rwait(mpid_nem_newmad_session,pending);
        MPIU_Assert(rc ==  NM_ESUCCESS);
        nm_sr_request_unset_completion_queue(mpid_nem_newmad_session,pending);
        nm_sr_get_rtag(mpid_nem_newmad_session,pending,&msg_tag);
        nm_sr_get_size(mpid_nem_newmad_session,pending,&msg_size);
        MPID_nem_newmad_handle_rreq(rreq,msg_tag, msg_size);
        was_matched = TRUE;
    }

    MPIU_Free(pending);
    return was_matched;
}
/*
 * Atomically add incr to the counter that REQ_FIELD() locates inside the
 * reqstat node passed as data, using offset to select the field.  The
 * previous counter value returned by the atomic add is discarded.
 */
void
ngx_http_reqstat_count(void *data, off_t offset, ngx_int_t incr)
{
    ngx_http_reqstat_rbnode_t    *rbnode;

    rbnode = data;

    (void) ngx_atomic_fetch_add(REQ_FIELD(rbnode, offset), incr);
}
예제 #5
0
/*
 * Try to cancel an MX send request.
 *
 * Nothing is done for a local VC.  Otherwise mx_cancel() is called on the
 * request's MX handle; when MX reports the request as cancelled the request
 * is marked cancelled, its completion counter is zeroed, its reference count
 * is set to 1 and the pending-send counter is decremented.  When MX could
 * not cancel it, the request is marked as not cancelled.
 *
 * Returns TRUE when the cancel was processed by this netmod, FALSE
 * otherwise (local VC or mx_cancel() failure).
 *
 * NOTE(review): the error code built in mpi_errno on mx_cancel() failure is
 * never returned -- the function returns `handled` on every path, while the
 * sibling MPID_nem_mx_cancel_recv returns mpi_errno.  Confirm against the
 * caller which of the two contracts is intended.
 */
int MPID_nem_mx_cancel_send(MPIDI_VC_t *vc, MPID_Request *sreq)
{
    int           mpi_errno = MPI_SUCCESS;
    int           handled = FALSE;
    mx_request_t *pending;
    mx_return_t   rc;
    uint32_t      was_cancelled;

    if (VC_CH(vc)->is_local)
        goto fn_exit;

    pending = &(REQ_FIELD(sreq,mx_request));
    rc = mx_cancel(MPID_nem_mx_local_endpoint,pending,&was_cancelled);
    MPIU_ERR_CHKANDJUMP1(rc != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_cancel", "**mx_cancel %s", mx_strerror(rc));

    if (!was_cancelled)
    {
        sreq->status.cancelled = FALSE;
    }
    else
    {
        sreq->status.cancelled = TRUE;
        sreq->cc = 0;
        MPIU_Object_set_ref(sreq, 1);
        MPID_nem_mx_pending_send_req--;
    }
    handled = TRUE;

 fn_exit:
    return handled;
 fn_fail:
    goto fn_exit;
}
예제 #6
0
/*
 * Finish the send side of an LMT transfer.
 *
 * Frees the cookie stored in req->ch.s_cookie, frees the pack buffer when
 * the send datatype is non-contiguous and a pack buffer is attached, and
 * completes the send request.
 *
 * Returns MPI_SUCCESS, or an MPI_ERR_OTHER-class code (without completing
 * the request) when req->dev.OnDataAvail is set.
 */
int MPID_nem_ib_lmt_done_send(struct MPIDI_VC *vc, struct MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);

    dprintf("lmt_done_send,enter,%d<-%d,req=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
            MPID_nem_ib_myrank, vc->pg_rank, req, REQ_FIELD(req, lmt_pack_buf));

    /* release the cookie memory; a missing cookie is only logged */
    if (req->ch.s_cookie == NULL) {
        dprintf("lmt_done_send,enter,req->ch.s_cookie is zero");
    }
    MPIU_Free(req->ch.s_cookie);

    /* release the temporary buffer that held packed non-contiguous data */
    int contiguous;
    MPID_Datatype_is_contig(req->dev.datatype, &contiguous);
    if (REQ_FIELD(req, lmt_pack_buf) && !contiguous) {
        dprintf("lmt_done_send,lmt-get,non-contiguous,free lmt_pack_buf\n");
        MPIU_Free(REQ_FIELD(req, lmt_pack_buf));
    }

    /* a request that still carries an OnDataAvail handler is treated as an error */
    MPIU_ERR_CHKANDJUMP(req->dev.OnDataAvail, mpi_errno, MPI_ERR_OTHER,
                        "**MPID_nem_ib_lmt_done_send");

    /* mark completion on the send request */
    dprintf("lmt_done_send,1,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);
    MPIDI_CH3U_Request_complete(req);
    dprintf("lmt_done_send,complete,req=%p\n", req);
    dprintf("lmt_done_send,2,req=%p,pcc=%d\n", req, MPIDI_CH3I_progress_completion_count.v);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_SEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
예제 #7
0
/*
 * Post a single MPID_nem_ib_com_lrecv() that brings req->ch.lmt_data_sz
 * bytes from the remote address raddr (protected by rkey) into write_to_buf.
 *
 * MPID_nem_ib_ncqe is incremented as soon as the post has been attempted,
 * i.e. also when the post fails.  With MPID_NEM_IB_LMT_GET_CQE defined the
 * completion is accounted through MPID_nem_ib_ncqe_to_drain; otherwise an
 * extra reference is taken on the request and it is queued on
 * MPID_nem_ib_lmtq.
 *
 * Returns MPI_SUCCESS, or an MPI_ERR_OTHER-class code when the post fails.
 */
int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey,
                                    void *write_to_buf)
{
    int mpi_errno = MPI_SUCCESS;
    int post_rc;
    struct MPIDI_VC *owner_vc = req->ch.vc;
    MPID_nem_ib_vc_area *ib_area = VC_IB(owner_vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);

    post_rc = MPID_nem_ib_com_lrecv(ib_area->sc->fd, (uint64_t) req, raddr,
                                    req->ch.lmt_data_sz, rkey, write_to_buf);
    MPID_nem_ib_ncqe++;
    MPIU_ERR_CHKANDJUMP(post_rc, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");

    dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf
        ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
         req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
         raddr, rkey, write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
         *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));

#ifdef MPID_NEM_IB_LMT_GET_CQE
    /* completion is detected through a CQE rather than by polling */
    MPID_nem_ib_ncqe_to_drain++;
#else
    /* drain_scq and ib_poll are not ordered, so either may drop a reference */
    MPIR_Request_add_ref(req);

    /* put the request on the list polled by ib_poll(); req->dev.next is
     * deliberately not used for this (original author reports problems) */
    MPID_nem_ib_lmtq_enqueue(&MPID_nem_ib_lmtq, req);
    dprintf("lmt_start_recv_core,lmtq enqueue\n");
#endif

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
예제 #8
0
/*
 * Try to cancel an MX receive request.
 *
 * Calls mx_cancel() on the request's MX handle.  When MX reports the
 * request as cancelled, the request is marked cancelled, removed from the
 * posted receive queue (asserted to be found there), given a zero count,
 * flagged completed and released.  Otherwise it is marked as not cancelled.
 *
 * vc is not used.  Returns MPI_SUCCESS, or an MPI_ERR_OTHER-class code when
 * mx_cancel() fails.
 */
int MPID_nem_mx_cancel_recv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    mx_request_t *mx_request;
    mx_return_t   ret;
    uint32_t      result;
    int           mpi_errno = MPI_SUCCESS;

    mx_request = &(REQ_FIELD(rreq,mx_request));
    /* FIXME this test is probably not correct with multiple netmods        */
    /* We need to know to which netmod a recv request actually "belongs" to */
    /* NOTE(review): mx_request is the address of a field inside rreq, so    */
    /* this condition is always true as written.                             */
    if (mx_request != NULL)
    {
        ret = mx_cancel(MPID_nem_mx_local_endpoint,mx_request,&result);
        MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_cancel", "**mx_cancel %s", mx_strerror(ret));

        if (result)
        {
            int found;

            rreq->status.cancelled = TRUE;
            found = MPIDI_CH3U_Recvq_DP(rreq);
            MPIU_Assert(found);
            rreq->status.count = 0;
            MPID_REQUEST_SET_COMPLETED(rreq);
            MPID_Request_release(rreq);
        }
        else
        {
            rreq->status.cancelled = FALSE;
            MPIU_DBG_MSG_P(CH3_OTHER,VERBOSE,
                           "request 0x%08x already matched, unable to cancel", rreq->handle);
        }
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
예제 #9
0
/*
 * Ask newmad to cancel the receive attached to rreq and record the outcome
 * in rreq->status.cancelled (TRUE when nm_sr_rcancel() reports NM_ESUCCESS,
 * FALSE otherwise).
 *
 * vc is not used.  Always returns MPI_SUCCESS.
 */
int MPID_nem_newmad_cancel_recv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    nm_sr_request_t *pending = &(REQ_FIELD(rreq,newmad_req));
    int rc = nm_sr_rcancel(mpid_nem_newmad_session,pending);

    rreq->status.cancelled = (rc == NM_ESUCCESS) ? TRUE : FALSE;

    return MPI_SUCCESS;
}
예제 #10
0
/*
 * Called whenever an any-source request has been added to the posted
 * receive queue.
 *
 * Builds a newmad match value/mask from the request's context id and tag
 * (wildcarding the source, and the tag too for MPI_ANY_TAG), describes the
 * user buffer as an iovec, posts an iovec receive on NM_ANY_GATE and records
 * the newly allocated newmad request in the hash keyed by rreq.  The iovec
 * is stored in the request so it can be freed later.
 *
 * NOTE(review): neither allocation is checked for NULL, and the return code
 * of the newmad post is not examined.
 * NOTE(review): on the non-contiguous path the segment count passed to
 * MPID_nem_newmad_process_rdtype() starts at 1 here, whereas
 * MPID_nem_newmad_directRecv starts it at 0 -- confirm which one the helper
 * expects.
 */
void MPID_nem_newmad_anysource_posted(MPID_Request *rreq)
{
    MPIR_Context_id_t ctx_id;
    Nmad_Nem_tag_t    user_tag;
    nm_tag_t          match_info  = 0;
    nm_tag_t          match_mask  = NEM_NMAD_MATCH_FULL_MASK;
    nm_sr_request_t  *nmad_req;
    struct iovec     *iov;
    int               iov_count   = 1;
    int               rc;
    MPIDI_msg_sz_t    nbytes;
    int               is_contig;
    MPI_Aint          true_lb;
    MPID_Datatype    *dtype;

    nmad_req = MPIU_Malloc(sizeof(nm_sr_request_t));
    iov      = (struct iovec *)MPIU_Malloc(NMAD_IOV_MAX_DEPTH*sizeof(struct iovec));

    user_tag = rreq->dev.match.parts.tag;
    ctx_id   = rreq->dev.match.parts.context_id;

    /* exact match on the context, wildcard on the source */
    NEM_NMAD_DIRECT_MATCH(match_info,0,0,ctx_id);
    if (user_tag == MPI_ANY_TAG)
    {
        NEM_NMAD_SET_ANYTAG(match_info);
        NEM_NMAD_SET_ANYTAG(match_mask);
    }
    else
    {
        NEM_NMAD_SET_TAG(match_info,user_tag);
    }
    NEM_NMAD_SET_ANYSRC(match_info);
    NEM_NMAD_SET_ANYSRC(match_mask);

#ifdef DEBUG
    fprintf(stdout,"========> Any Source : Posting Recv req  %p (nmad req is %p) (match is %lx) (mask is %lx) \n",
            rreq,nmad_req,match_info,match_mask);
#endif

    MPIDI_Datatype_get_info(rreq->dev.user_count,rreq->dev.datatype, is_contig, nbytes, dtype,true_lb);
    rreq->dev.OnDataAvail = NULL;

    if (!is_contig)
    {
        /* let the datatype helper fill the iovec (and update iov_count) */
        struct iovec *cursor = &(iov[0]);
        MPID_nem_newmad_process_rdtype(&rreq,dtype,nbytes,&cursor,&iov_count);
    }
    else
    {
        /* a contiguous buffer is described by a single entry */
        iov[0].iov_base = (char*)(rreq->dev.user_buf) + true_lb;
        iov[0].iov_len  = nbytes;
    }

    rc = nm_sr_irecv_iov_with_ref_tagged(mpid_nem_newmad_session,NM_ANY_GATE,match_info,match_mask,
                                         iov,iov_count,nmad_req,(void*)rreq);
    REQ_FIELD(rreq,iov) = iov;
    MPID_MEM_NMAD_ADD_REQ_IN_HASH(rreq,nmad_req);
}
예제 #11
0
/*
 * Finish a receive request for which newmad has delivered `size` bytes
 * tagged with match_info.
 *
 * Fills in the status source and tag decoded from match_info, records the
 * received size, and flags truncation (MPI_ERR_TRUNCATE, count clamped to
 * the user buffer size) when more bytes arrived than the user buffer holds.
 * For a non-contiguous datatype with a temporary buffer attached, the data
 * is unpacked into the user buffer and the temporary buffer is freed; a
 * short unpack updates the count and, if the message was not truncated,
 * sets a datatype-mismatch error.  The request's iovec is freed and the
 * request is handed to MPIDI_CH3U_Handle_recv_req(), which must complete it.
 *
 * Always returns MPI_SUCCESS; errors are reported through req->status.
 */
static int
MPID_nem_newmad_handle_rreq(MPID_Request *req, nm_tag_t match_info, size_t size)
{
    int            mpi_errno = MPI_SUCCESS;
    int            done = FALSE;
    int            is_contig;
    MPI_Aint       true_lb;
    MPIDI_msg_sz_t capacity;      /* bytes the user buffer can hold */
    MPID_Datatype *dtype;
    MPIDI_msg_sz_t usable_sz;     /* bytes actually delivered to the user */
    MPIDI_VC_t    *vc = NULL;

#ifdef DEBUG
   fprintf(stdout,"========> Completing Recv req  %p (match is %lx) \n",req,match_info);
#endif

    NEM_NMAD_MATCH_GET_RANK(match_info,req->status.MPI_SOURCE);
    NEM_NMAD_MATCH_GET_TAG(match_info,req->status.MPI_TAG);
    req->status.count = size;
    req->dev.recv_data_sz = size;

    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, is_contig, capacity, dtype, true_lb);

    if (size > capacity)
    {
        MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                                            "receive buffer too small; message truncated, msg_sz="
                                            MPIDI_MSG_SZ_FMT ", userbuf_sz="
                                            MPIDI_MSG_SZ_FMT,
                                            req->dev.recv_data_sz, capacity));
        req->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
                                                     "**truncate", "**truncate %d %d %d %d",
                                                     req->status.MPI_SOURCE, req->status.MPI_TAG,
                                                     req->dev.recv_data_sz, capacity );
        req->status.count = capacity;
        usable_sz = capacity;
    }
    else
    {
        usable_sz = req->dev.recv_data_sz;
    }

    if ((req->dev.tmpbuf != NULL)&&(!is_contig))
    {
        MPIDI_msg_sz_t unpacked = req->dev.recv_data_sz;

        MPID_Segment_unpack( req->dev.segment_ptr, 0, &unpacked, req->dev.tmpbuf);
        MPIU_Free(req->dev.tmpbuf);
        if (unpacked != usable_sz) {
            req->status.count = (int)unpacked;
            /* a short unpack of a message that fitted means the types disagree */
            if (req->dev.recv_data_sz <= capacity) {
                MPIU_ERR_SETSIMPLE(req->status.MPI_ERROR,MPI_ERR_TYPE,"**dtypemismatch");
            }
        }
    }

    if (REQ_FIELD(req,iov) != NULL)
        MPIU_Free(REQ_FIELD(req,iov));

    MPIDI_Comm_get_vc_set_active(req->comm, req->status.MPI_SOURCE, &vc);
    MPIDI_CH3U_Handle_recv_req(vc, req, &done);
    MPIU_Assert(done == TRUE);

#ifdef DEBUG
   fprintf(stdout,"========> Completing Recv req  %p done \n",req);
#endif

    return mpi_errno;
}
예제 #12
0
int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPID_IOV r_cookie = req->ch.lmt_tmp_cookie;
    MPID_nem_ib_lmt_cookie_t *r_cookie_buf = r_cookie.iov_base;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND);

    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    void *write_from_buf;
    if (dt_contig) {
        write_from_buf = req->dev.user_buf;
    }
    else {
        /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");

        MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
                          req->dev.segment_ptr, 0);
        req->dev.segment_first = 0;
        req->dev.segment_size = data_sz;

        MPIDI_msg_sz_t last;
        last = req->dev.segment_size;   /* segment_size is byte offset */
        MPIU_Assert(last > 0);

        REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc(data_sz);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");

        MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last,
                          (char *) (REQ_FIELD(req, lmt_pack_buf)));
        MPIU_Assert(last == req->dev.segment_size);

        write_from_buf = REQ_FIELD(req, lmt_pack_buf);
    }

    //assert(dt_true_lb == 0);
    uint8_t *tailp =
        (uint8_t *) ((uint8_t *) write_from_buf /*+ dt_true_lb */  + data_sz - sizeof(uint8_t));
#if 0
    *is_end_flag_same = (r_cookie_buf->tail == *tailp) ? 1 : 0;
#else
    REQ_FIELD(req, lmt_receiver_tail) = r_cookie_buf->tail;
    REQ_FIELD(req, lmt_sender_tail) = *tailp;
    dprintf("lmt_switch_send,tail on sender=%02x,tail onreceiver=%02x,req=%p\n", *tailp,
            r_cookie_buf->tail, req);
#ifdef MPID_NEM_IB_DEBUG_LMT
    uint8_t *tail_wordp = (uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint32_t) * 2);
#endif
    dprintf("lmt_switch_send,tail on sender=%d\n", *tail_wordp);
    fflush(stdout);
#endif

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
예제 #13
0
/*
 * Start an LMT (rendezvous) send: build the cookie that describes the send
 * buffer and ship it to the receiver as the payload of the RTS packet.
 *
 * Steps:
 *   1. allocate the cookie and remember it in req->ch.s_cookie
 *      (MPID_nem_ib_lmt_done_send frees it);
 *   2. locate the send buffer -- the user buffer plus dt_true_lb for a
 *      contiguous datatype, otherwise a freshly packed lmt_pack_buf;
 *   3. store the last payload byte ("tail") in the cookie;
 *   4. query the connection's maximum message size and derive the number of
 *      segments needed, recording both in the cookie and in the request;
 *   5. fetch a memory registration for the first segment and put its rkey
 *      (and, with HAVE_LIBDCFA, its host address) in the cookie;
 *   6. send the RTS with the cookie attached.
 *
 * Returns MPI_SUCCESS, or an MPI_ERR_OTHER-class code when an allocation or
 * the memory-registration lookup fails.
 *
 * NOTE(review): the result of MPID_nem_ib_com_get_info_conn() is not
 * checked and a max_msg_sz of 0 would divide by zero below.
 */
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
                                 struct MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
#if 0
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
#endif

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);

    dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* obtain dt_true_lb */
    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    /* The cookie must live on the heap: the send does not copy the payload,
     * so an automatic variable would be gone before it is transmitted. */
    MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
        (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t));

    /* remember address to "free" when receiving DONE from receiver */
    req->ch.s_cookie = s_cookie_buf;
    /* the cookie is written to below, so a failed allocation must stop here */
    MPIU_ERR_CHKANDJUMP(!s_cookie_buf, mpi_errno, MPI_ERR_OTHER, "**outofmemory");

    /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */
    //assert(dt_true_lb == 0);
    void *write_from_buf;
    if (dt_contig) {
        write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
    }
    else {
        /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");

        MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
                          req->dev.segment_ptr, 0);
        req->dev.segment_first = 0;
        req->dev.segment_size = data_sz;

        MPIDI_msg_sz_t last;
        last = req->dev.segment_size;   /* segment_size is byte offset */
        MPIU_Assert(last > 0);
        REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last,
                          (char *) (REQ_FIELD(req, lmt_pack_buf)));
        MPIU_Assert(last == req->dev.segment_size);
        write_from_buf = REQ_FIELD(req, lmt_pack_buf);
    }
    dprintf
        ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
         dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf));

#ifdef HAVE_LIBDCFA
#else
    s_cookie_buf->addr = write_from_buf;
#endif
    /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
    /* TODO remove sz field
     *   pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
     * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */
    //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz;

    /* preserve and put tail, because tail magic is written on the tail of payload
     * because we don't want to add another SGE or RDMA command */
    MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz);
    s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));
    /* prepare magic */
    //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;

#if 0   /* moving to packet header */   /* embed RDMA-write-to buffer occupancy information */
    dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
    /* embed RDMA-write-to buffer occupancy information */
    s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;

    /* remember the last one sent */
    vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail;
#endif

    int post_num;
    uint32_t max_msg_sz;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
                                  &max_msg_sz, sizeof(uint32_t));

    /* Type of max_msg_sz is uint32_t. */
    /* number of segments = ceil(data_sz / max_msg_sz) */
    post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;

    s_cookie_buf->max_msg_sz = max_msg_sz;
    s_cookie_buf->seg_seq_num = 1;
    s_cookie_buf->seg_num = post_num;

    REQ_FIELD(req, buf.from) = write_from_buf;
    REQ_FIELD(req, data_sz) = data_sz;
    REQ_FIELD(req, seg_seq_num) = 1;    // only send 1st-segment, even if there are some segments.
    REQ_FIELD(req, seg_num) = post_num;
    REQ_FIELD(req, max_msg_sz) = max_msg_sz;

    /* only the first segment is registered here */
    long length;
    if (post_num > 1) {
        length = max_msg_sz;
    }
    else {
        length = data_sz;
    }
    /* put IB rkey */
    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
        MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
    MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
    struct ibv_mr *mr = mr_cache->mr;
    REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
#ifdef HAVE_LIBDCFA
    s_cookie_buf->addr = (void *) mr->host_addr;
    dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr);
#endif
    s_cookie_buf->rkey = mr->rkey;
    /* byte-pointer arithmetic: arithmetic on void * is not standard C */
    dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n",
            s_cookie_buf->tail,
            (void *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)),
            *((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)), data_sz,
            s_cookie_buf->addr, s_cookie_buf->rkey);
    /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */
    MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf,
                          sizeof(MPID_nem_ib_lmt_cookie_t));

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
예제 #14
0
/*
 * Post the MPID_nem_ib_com_lrecv() request(s) that bring one segment of an
 * LMT transfer from the remote address raddr (protected by rkey) into
 * write_to_buf.
 *
 * max_msg_sz is the initiator's maximum message size; the local maximum is
 * queried from the connection.  The segment is split into
 * ceil(max_msg_sz / r_max_msg_sz) parts, each posted separately and tagged
 * as "part of segment", "last part of this segment" or, when `end` is
 * non-zero, "last packet".  `len` is the number of bytes still to be
 * transferred; it is asserted to reach exactly zero after the loop.
 *
 * Returns MPI_SUCCESS, or an MPI_ERR_OTHER-class code when a post fails.
 *
 * NOTE(review): the result of MPID_nem_ib_com_get_info_conn() is not
 * checked; an r_max_msg_sz of 0 would divide by zero.
 * NOTE(review): when a post fails part-way through the loop, the jump to
 * fn_fail skips the MPID_nem_ib_ncqe update (and the enqueue) for the parts
 * already posted -- confirm this is intended.
 */
int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
                                    void *write_to_buf, uint32_t max_msg_sz, int end)
{
    int mpi_errno = MPI_SUCCESS;
    int ibcom_errno;
    struct MPIDI_VC *vc = req->ch.vc;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
    int i;
    int divide;
    int posted_num;
    int last;
    uint32_t r_max_msg_sz;      /* responder's max_msg_sz */
    void *write_pos;
    void *addr;
    long data_sz;
    MPIDI_msg_sz_t rest_data_sz;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);

    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
                                  &r_max_msg_sz, sizeof(uint32_t));

    /* number of parts = ceil(max_msg_sz / r_max_msg_sz) */
    divide = (max_msg_sz + r_max_msg_sz - 1) / r_max_msg_sz;

    write_pos = write_to_buf;
    posted_num = 0;
    last = MPID_NEM_IB_LMT_PART_OF_SEGMENT;
    rest_data_sz = len;
    addr = raddr;

    for (i = 0; i < divide; i++) {
        /* every part but the last is a full r_max_msg_sz; the last part
         * takes what remains of the initiator's max_msg_sz */
        if (i == divide - 1)
            data_sz = max_msg_sz - i * r_max_msg_sz;
        else
            data_sz = r_max_msg_sz;

        if (i == divide - 1) {
            if (end)
                last = MPID_NEM_IB_LMT_LAST_PKT;        /* last part of last segment packet */
            else
                last = MPID_NEM_IB_LMT_SEGMENT_LAST;    /* last part of this segment */

            /* last data may be smaller than initiator's max_msg_sz */
            if (rest_data_sz < max_msg_sz)
                data_sz = rest_data_sz;
        }

        ibcom_errno =
            MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, addr, data_sz, rkey,
                                  write_pos, last);
        MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");

        /* update position */
        /* local destination and remote source advance in lock-step */
        write_pos = (void *) ((char *) write_pos + data_sz);
        addr = (void *) ((char *) addr + data_sz);

        /* update rest data size */
        rest_data_sz -= data_sz;

        /* count request number */
        posted_num++;
    }

    /* the parts posted above must cover `len` exactly */
    MPIU_Assert(rest_data_sz == 0);
    MPID_nem_ib_ncqe += posted_num;
    //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
    /* NOTE(review): the tail expressions below do arithmetic on void *,
     * which is a compiler extension rather than standard C */
    dprintf
        ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
         req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
         raddr, rkey, write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
         *((uint8_t *) (write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t))));
    //fflush(stdout);

#ifdef MPID_NEM_IB_LMT_GET_CQE
    MPID_nem_ib_ncqe_to_drain += posted_num;    /* use CQE instead of polling */
#else
    /* drain_scq and ib_poll is not ordered, so both can decrement ref_count */
    MPIR_Request_add_ref(req);

    /* register to poll list in ib_poll() */
    /* don't use req->dev.next because it causes unknown problem */
    MPID_nem_ib_lmtq_enqueue(&MPID_nem_ib_lmtq, req);
    dprintf("lmt_start_recv_core,lmtq enqueue\n");
    //volatile uint8_t* tailmagic = (uint8_t*)((void*)req->dev.user_buf + req->ch.lmt_data_sz - sizeof(uint8_t));
    //dprintf("start_recv_core,cur_tail=%02x,lmt_receiver_tail=%02x\n", *tailmagic, REQ_FIELD(req, lmt_receiver_tail));
#endif

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
예제 #15
0
/*
 * Content handler that streams the collected request statistics.
 *
 * The zones to display come from the location configuration, falling back
 * to the main configuration's monitor list; with neither present a 204
 * response is sent.  Otherwise a 200 header is sent and, for every node
 * queued in every zone, one line is emitted: the node's key followed by each
 * counter listed in ngx_http_reqstat_fields, comma separated and terminated
 * by a newline.  Line buffers are 512 bytes and are recycled through the
 * free/busy chains.  A final buffer flagged last_buf closes the response.
 *
 * Returns the result of the header/output filters, or
 * NGX_HTTP_INTERNAL_SERVER_ERROR on allocation or filter failure.
 */
static ngx_int_t
ngx_http_reqstat_show_handler(ngx_http_request_t *r)
{
    ngx_int_t                     rc;
    ngx_buf_t                    *buf;
    ngx_uint_t                    zone_idx, field_idx, nfields;
    ngx_array_t                  *zones;
    ngx_chain_t                  *cl, *free_chain, *busy_chain;
    ngx_queue_t                  *entry;
    ngx_shm_zone_t              **shm_zone;
    ngx_http_reqstat_ctx_t       *ctx;
    ngx_http_reqstat_conf_t      *slcf;
    ngx_http_reqstat_conf_t      *smcf;
    ngx_http_reqstat_rbnode_t    *node;

    slcf = ngx_http_get_module_loc_conf(r, ngx_http_reqstat_module);
    smcf = ngx_http_get_module_main_conf(r, ngx_http_reqstat_module);

    /* location-level display list wins over the main-level monitor list */
    zones = slcf->display;
    if (zones == NULL) {
        zones = smcf->monitor;
    }

    if (zones == NULL) {
        r->headers_out.status = NGX_HTTP_NO_CONTENT;
        return ngx_http_send_header(r);
    }

    r->headers_out.status = NGX_HTTP_OK;
    ngx_http_clear_content_length(r);

    rc = ngx_http_send_header(r);
    if (rc == NGX_ERROR || rc > NGX_OK || r->header_only) {
        return rc;
    }

    shm_zone = zones->elts;
    nfields = sizeof(ngx_http_reqstat_fields) / sizeof(off_t);
    free_chain = NULL;
    busy_chain = NULL;

    for (zone_idx = 0; zone_idx < zones->nelts; zone_idx++) {

        ctx = shm_zone[zone_idx]->data;

        entry = ngx_queue_head(&ctx->sh->queue);

        while (entry != ngx_queue_sentinel(&ctx->sh->queue)) {

            node = ngx_queue_data(entry, ngx_http_reqstat_rbnode_t, queue);

            cl = ngx_chain_get_free_buf(r->pool, &free_chain);
            if (cl == NULL) {
                return NGX_HTTP_INTERNAL_SERVER_ERROR;
            }

            buf = cl->buf;

            /* a brand-new buffer has no storage yet; a recycled one keeps it */
            if (buf->start == NULL) {
                buf->start = ngx_pcalloc(r->pool, 512);
                if (buf->start == NULL) {
                    return NGX_HTTP_INTERNAL_SERVER_ERROR;
                }

                buf->end = buf->start + 512;
            }

            buf->pos = buf->start;
            buf->last = buf->start;
            buf->memory = 1;
            buf->temporary = 1;

            buf->last = ngx_slprintf(buf->last, buf->end, "%*s,",
                                     (size_t) node->len, node->data);

            for (field_idx = 0; field_idx < nfields; field_idx++) {
                buf->last = ngx_slprintf(buf->last, buf->end, "%uA,",
                                         *REQ_FIELD(node,
                                                    ngx_http_reqstat_fields[field_idx]));
            }

            /* turn the trailing comma into the line terminator */
            *(buf->last - 1) = '\n';

            if (ngx_http_output_filter(r, cl) == NGX_ERROR) {
                return NGX_HTTP_INTERNAL_SERVER_ERROR;
            }

#if nginx_version >= 1002000
            ngx_chain_update_chains(r->pool, &free_chain, &busy_chain, &cl,
                                    (ngx_buf_tag_t) &ngx_http_reqstat_module);
#else
            ngx_chain_update_chains(&free_chain, &busy_chain, &cl,
                                    (ngx_buf_tag_t) &ngx_http_reqstat_module);
#endif

            entry = ngx_queue_next(entry);
        }
    }

    /* empty buffer that only marks the end of the response */
    cl = ngx_chain_get_free_buf(r->pool, &free_chain);
    if (cl == NULL) {
        return NGX_HTTP_INTERNAL_SERVER_ERROR;
    }

    buf = cl->buf;
    buf->last_buf = 1;

    return ngx_http_output_filter(r, cl);
}
예제 #16
0
/*
 * Post a receive for rreq directly to newmad.
 *
 * For a non-local VC, a newmad match value/mask is built from the request's
 * source rank, context id and tag (the tag is wildcarded for MPI_ANY_TAG)
 * and a receive is posted on the VC's gate: a plain buffer receive for a
 * contiguous datatype, an iovec receive otherwise.  The iovec (or NULL for
 * the contiguous case) is stored in the request.  For a local VC the
 * request's newmad handle is simply zeroed.
 *
 * Always returns MPI_SUCCESS.
 *
 * NOTE(review): the return code of the newmad post and the result of the
 * iovec allocation are not checked.
 */
int MPID_nem_newmad_directRecv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);

    if (!VC_CH(vc)->is_local)
    {
        nm_tag_t          match_info = 0;
        nm_tag_t          match_mask = NEM_NMAD_MATCH_FULL_MASK;
        MPIR_Rank_t       source     = rreq->dev.match.parts.rank;
        MPIR_Context_id_t context    = rreq->dev.match.parts.context_id;
        Nmad_Nem_tag_t    tag        = rreq->dev.match.parts.tag;
        int               ret;
        MPIDI_msg_sz_t    data_sz;
        int               dt_contig;
        MPI_Aint          dt_true_lb;
        MPID_Datatype    *dt_ptr;

        NEM_NMAD_DIRECT_MATCH(match_info,0,source,context);
        if (tag != MPI_ANY_TAG)
        {
            NEM_NMAD_SET_TAG(match_info,tag);
        }
        else
        {
            NEM_NMAD_SET_ANYTAG(match_info);
            NEM_NMAD_SET_ANYTAG(match_mask);
        }

#ifdef DEBUG
        fprintf(stdout,"========> Posting Recv req  %p (match is %lx) \n",rreq,match_info);
#endif
        MPIDI_Datatype_get_info(rreq->dev.user_count,rreq->dev.datatype, dt_contig, data_sz, dt_ptr,dt_true_lb);
        rreq->dev.OnDataAvail = NULL;

        if (dt_contig)
        {
            ret = nm_sr_irecv_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
                                              (char*)(rreq->dev.user_buf) + dt_true_lb,data_sz,
                                              &(REQ_FIELD(rreq,newmad_req)),(void*)rreq);
            REQ_FIELD(rreq,iov) = NULL;
        }
        else
        {
            int           num_seg        = 0;
            struct iovec *newmad_iov     = (struct iovec *)MPIU_Malloc(NMAD_IOV_MAX_DEPTH*sizeof(struct iovec));
            struct iovec *newmad_iov_ptr = &(newmad_iov[0]);

            /* the datatype helper fills the iovec and reports how many entries it used */
            MPID_nem_newmad_process_rdtype(&rreq,dt_ptr,data_sz,&newmad_iov_ptr,&num_seg);
            MPIU_Assert(num_seg <= NMAD_IOV_MAX_DEPTH);
#ifdef DEBUG
            {
                int seg;
                for(seg = 0; seg < num_seg ; seg++)
                {
                    fprintf(stdout,"======================\n");
                    /* iov_len is a size_t: print it through an explicit
                     * unsigned long cast rather than %i */
                    fprintf(stdout,"RECV nmad_iov[%i]: [base %p][len %lu]\n",seg,
                            newmad_iov[seg].iov_base,(unsigned long)newmad_iov[seg].iov_len);
                }
            }
#endif
            ret = nm_sr_irecv_iov_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
                                                  newmad_iov,num_seg,&(REQ_FIELD(rreq,newmad_req)),(void*)rreq);
            REQ_FIELD(rreq,iov) = newmad_iov;
        }
    }
    else
    {
        /* Fixme : this might not work in the case of multiple netmods */
        memset((&(REQ_FIELD(rreq,newmad_req))),0,sizeof(nm_sr_request_t));
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);
    return mpi_errno;
 fn_fail:  ATTRIBUTE((unused))
    goto fn_exit;
}
예제 #17
0
/*
 * MPID_nem_ib_lmt_start_recv - begin receiving an LMT message described by
 * the cookie in s_cookie.
 *
 * vc        virtual connection the message arrives on; stored in req->ch.vc.
 * req       receive request; several REQ_FIELD() slots are filled in here.
 * s_cookie  IOV whose iov_base points at an MPID_nem_ib_lmt_cookie_t.
 *
 * Steps:
 *   1. Pick the destination buffer: user_buf + dt_true_lb for a contiguous
 *      datatype, otherwise a staging buffer of req->ch.lmt_data_sz bytes
 *      from MPID_nem_ib_stmalloc (stored in REQ_FIELD(req, lmt_pack_buf)).
 *   2. Unless MPID_NEM_IB_LMT_GET_CQE is defined, write the bitwise
 *      complement of the cookie's tail byte into the last byte of the
 *      destination, and remember the tail in REQ_FIELD(req, lmt_tail).
 *   3. Issue the read through MPID_nem_ib_lmt_start_recv_core right away if
 *      the send queue is empty and the SQ/CQ counters leave room; otherwise
 *      save the parameters in the request and enqueue it on the send queue.
 *   4. Update the occupancy notification policy (unless disabled at compile
 *      time) and drive MPID_nem_ib_send_progress if the queue head is ready.
 *
 * Returns MPI_SUCCESS, or an MPI error code if the staging buffer cannot be
 * allocated or MPID_nem_ib_lmt_start_recv_core fails.
 */
int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);

    dprintf("lmt_start_recv,enter,%d<-%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* obtain dt_true_lb */
    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    MPID_nem_ib_lmt_cookie_t *s_cookie_buf = s_cookie.iov_base;

    /* stash vc for ib_poll */
    req->ch.vc = vc;

    void *write_to_buf;
    if (dt_contig) {
        /* contiguous data lands directly in the user buffer */
        write_to_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
    }
    else {
        /* non-contiguous data is received into a staging buffer first */
        REQ_FIELD(req, lmt_pack_buf) = MPID_nem_ib_stmalloc((size_t) req->ch.lmt_data_sz);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        write_to_buf = REQ_FIELD(req, lmt_pack_buf);
    }

    REQ_FIELD(req, buf.to) = write_to_buf;

    /* The address of the last byte of the destination is computed on a
     * uint8_t pointer below: arithmetic directly on void * is a compiler
     * extension, not ISO C. */
#ifndef MPID_NEM_IB_LMT_GET_CQE
    /* unmark magic */
    *((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)) = ~s_cookie_buf->tail;  /* size in cookie was not set */
#endif
    dprintf
        ("lmt_start_recv,dt_contig=%d,write_to_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p,marked-tail=%02x,unmarked-tail=%02x\n",
         dt_contig, write_to_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf),
         s_cookie_buf->tail,
         *((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)));

    /* stash tail for poll because do_cts in mpid_nem_lmt.c frees s_cookie_buf just after this function */
    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
    dprintf("lmt_start_recv,mem-tail=%p,%02x\n",
            (void *) ((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)),
            *((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)));

    /* By default this is the last segment and covers the whole message; if
     * the cookie's sequence number differs from its segment count, read one
     * max_msg_sz-sized segment and mark it as not last. */
    int last = 1;
    long length = req->ch.lmt_data_sz;

    if (s_cookie_buf->seg_seq_num != s_cookie_buf->seg_num) {
        last = 0;
        length = s_cookie_buf->max_msg_sz;
    }

    REQ_FIELD(req, max_msg_sz) = s_cookie_buf->max_msg_sz; /* store initiator's max_msg_sz */
    REQ_FIELD(req, seg_num) = s_cookie_buf->seg_num; /* store number of segments */

    /* try to issue RDMA-read command */
    int slack = 1;              /* slack for control packet bringing sequence number */
    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
        mpi_errno =
            MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey, length,
                                            write_to_buf, s_cookie_buf->max_msg_sz, last);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        /* enqueue command into send_queue */
        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
                MPID_nem_ib_sendq_empty(vc_ib->sendq),
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);

        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;
        REQ_FIELD(req, lmt_szsend) = length;
        REQ_FIELD(req, last) = last;

        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
    }

#if 0   /* moving to packet header */
    /* extract embedded RDMA-write-to buffer occupancy information */
    dprintf("lmt_start_recv,old lsr_seq_num=%d,s_cookie_buf->seq_num_tail=%d\n",
            vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
    vc_ib->ibcom->lsr_seq_num_tail = s_cookie_buf->seq_num_tail;
#endif

#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
    /* change remote notification policy of RDMA-write-to buf */
    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
#endif

    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
        dprintf("lmt_start_recv,ncom=%d,ncqe=%d,diff=%d\n",
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
                MPID_nem_ib_diff16(vc_ib->ibcom->sseq_num,
                                   vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG);
    }
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
        dprintf("lmt_start_recv,send_progress\n");
        fflush(stdout);
        MPID_nem_ib_send_progress(vc);
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}