/*
 * MPID_nem_mx_cancel_send - try to cancel a send request posted through MX.
 *
 * vc   - virtual connection the request was sent on
 * sreq - the send request to cancel
 *
 * Nothing is done for a local VC.  For a non-local VC the MX request stored
 * in sreq is handed to mx_cancel().  When MX reports that the cancel took
 * effect the request is marked cancelled, its completion counter is zeroed,
 * its reference count is reset to 1 and the pending-send counter is
 * decremented; otherwise the request is marked as not cancelled.
 *
 * Returns TRUE when the VC is non-local and mx_cancel() succeeded, FALSE
 * otherwise.  The value returned is the "handled" flag: mpi_errno is set
 * locally when mx_cancel() fails but is not returned to the caller.
 */
int MPID_nem_mx_cancel_send(MPIDI_VC_t *vc, MPID_Request *sreq)
{
    int mpi_errno = MPI_SUCCESS;
    int handled = FALSE;
    mx_return_t mx_rc;
    uint32_t cancel_ok;

    /* Local (shared-memory) connections are not handled by this netmod. */
    if (VC_CH(vc)->is_local)
        goto fn_exit;

    mx_rc = mx_cancel(MPID_nem_mx_local_endpoint, &(REQ_FIELD(sreq, mx_request)), &cancel_ok);
    MPIU_ERR_CHKANDJUMP1(mx_rc != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_cancel", "**mx_cancel %s", mx_strerror(mx_rc));

    if (!cancel_ok) {
        sreq->status.cancelled = FALSE;
    }
    else {
        /* The send was withdrawn: complete the request here. */
        sreq->status.cancelled = TRUE;
        sreq->cc = 0;
        MPIU_Object_set_ref(sreq, 1);
        MPID_nem_mx_pending_send_req--;
    }
    handled = TRUE;

 fn_exit:
    return handled;
 fn_fail:
    goto fn_exit;
}
int MPIDI_nem_ckpt_start(void) { int mpi_errno = MPI_SUCCESS; int i; MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_START); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_START); if (checkpointing) goto fn_exit; checkpointing = TRUE; marker_count = MPIDI_Process.my_pg->size - 1; /* We won't receive a marker from ourselves. */ ++current_wave; /* send markers to all other processes */ /* FIXME: we're only handling processes in our pg, so no dynamic connections */ for (i = 0; i < MPIDI_Process.my_pg->size; ++i) { MPID_Request *req; MPIDI_VC_t *vc; MPIDI_CH3I_VC *vc_ch; MPID_PKT_DECL_CAST(upkt, MPID_nem_pkt_ckpt_marker_t, ckpt_pkt); /* Don't send a marker to ourselves. */ if (i == MPIDI_Process.my_pg_rank) continue; MPIDI_PG_Get_vc_set_active(MPIDI_Process.my_pg, i, &vc); vc_ch = VC_CH(vc); MPIDI_Pkt_init(ckpt_pkt, MPIDI_NEM_PKT_CKPT_MARKER); ckpt_pkt->wave = current_wave; mpi_errno = MPIDI_CH3_iStartMsg(vc, ckpt_pkt, sizeof(ckpt_pkt), &req); MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ckptpkt"); if (req != NULL) { MPIU_ERR_CHKANDJUMP(req->status.MPI_ERROR, mpi_errno, MPI_ERR_OTHER, "**ckptpkt"); MPID_Request_release(req); } if (!vc_ch->is_local) { mpi_errno = vc_ch->ckpt_pause_send_vc(vc); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_START); return mpi_errno; fn_fail: goto fn_exit; }
/*
 * MPID_nem_mx_vc_init - initialise the MX-specific part of a virtual connection.
 *
 * With ONDEMAND defined only the local/remote "connected" flags are cleared.
 * Otherwise the peer's business card is fetched through vc->pg->getConnInfo,
 * the remote endpoint id and NIC id are parsed out of it, mx_connect() is
 * called (with MX_INFINITE as timeout argument) and the VC is moved to the
 * ACTIVE state.  In both cases the eager threshold is taken from the
 * MX_COPY_SEND_MAX value reported by mx_get_info() and the netmod send
 * function pointers are installed.
 *
 * The temporary business-card buffer is released on every path, including
 * the error paths.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_mx_vc_init (MPIDI_VC_t *vc)
{
    uint32_t threshold;
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
    char *business_card = NULL;   /* freed at fn_exit if still allocated */
    int mpi_errno = MPI_SUCCESS;

    /* first make sure that our private fields in the vc fit into the area provided  */
    MPIU_Assert(sizeof(MPID_nem_mx_vc_area) <= MPID_NEM_VC_NETMOD_AREA_LEN);

#ifdef ONDEMAND
    VC_FIELD(vc, local_connected)  = 0;
    VC_FIELD(vc, remote_connected) = 0;
#else
    {
        int val_max_sz;
        int ret;

#ifdef USE_PMI2_API
        val_max_sz = PMI2_MAX_VALLEN;
#else
        {
            /* A failed query leaves val_max_sz unset, so do not go on to
             * allocate with it. */
            int pmi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
            MPIU_ERR_CHKANDJUMP1(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max", "**pmi_kvs_get_value_length_max %d", pmi_errno);
        }
#endif
        business_card = (char *)MPIU_Malloc(val_max_sz);
        MPIU_ERR_CHKANDJUMP(business_card == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");

        mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card, val_max_sz, vc->pg);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        mpi_errno = MPID_nem_mx_get_from_bc (business_card, &VC_FIELD(vc, remote_endpoint_id), &VC_FIELD(vc, remote_nic_id));
        if (mpi_errno) MPIU_ERR_POP (mpi_errno);

        /* The card is no longer needed; release it before connecting. */
        MPIU_Free(business_card);
        business_card = NULL;

        ret = mx_connect(MPID_nem_mx_local_endpoint, VC_FIELD(vc, remote_nic_id), VC_FIELD(vc, remote_endpoint_id),
                         MPID_NEM_MX_FILTER, MX_INFINITE, &(VC_FIELD(vc, remote_endpoint_addr)));
        MPIU_ERR_CHKANDJUMP1 (ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mx_connect", "**mx_connect %s", mx_strerror (ret));
        mx_set_endpoint_addr_context(VC_FIELD(vc, remote_endpoint_addr), (void *)vc);

        MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
    }
#endif

    mx_get_info(MPID_nem_mx_local_endpoint, MX_COPY_SEND_MAX, NULL, 0, &threshold, sizeof(uint32_t));

    vc->eager_max_msg_sz = threshold;
    vc->rndvSend_fn      = NULL;
    vc->sendNoncontig_fn = MPID_nem_mx_SendNoncontig;
    vc->comm_ops         = &comm_ops;

    vc_ch->iStartContigMsg = MPID_nem_mx_iStartContigMsg;
    vc_ch->iSendContig     = MPID_nem_mx_iSendContig;

 fn_exit:
    if (business_card != NULL)
        MPIU_Free(business_card);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq) { int mpi_errno = MPI_SUCCESS; MPIDI_CH3I_VC *vc_ch = VC_CH(vc); MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV); dprintf("lmt_done_recv,enter,rreq=%p,head=%p\n", rreq, MPID_nem_ib_lmtq.head); int is_contig; MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig); if (!is_contig) { dprintf("lmt_done_recv,copying noncontiguous data to user buffer\n"); /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */ /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */ MPIDI_msg_sz_t unpack_sz = rreq->ch.lmt_data_sz; MPID_Segment seg; MPI_Aint last; MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, &seg, 0); last = unpack_sz; MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(rreq, lmt_pack_buf)); if (last != unpack_sz) { /* --BEGIN ERROR HANDLING-- */ /* received data was not entirely consumed by unpack() * because too few bytes remained to fill the next basic * datatype */ MPIR_STATUS_SET_COUNT(rreq->status, last); rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE, "**MPID_nem_ib_lmt_done_recv", 0); /* --END ERROR HANDLING-- */ } //MPIU_Free(REQ_FIELD(rreq, lmt_pack_buf)); MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz); } dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v); MPIDI_CH3U_Request_complete(rreq); dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV); return mpi_errno; //fn_fail: goto fn_exit; }
int MPIDI_nem_ckpt_finish(void) { int mpi_errno = MPI_SUCCESS; int i; int ret; MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_FINISH); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_FINISH); /* Since we're checkpointing the shared memory region (i.e., the channels between local procs), we don't have to flush those channels, just make sure no one is sending or receiving during the checkpoint */ mpi_errno = MPID_nem_barrier(); if (mpi_errno) MPIU_ERR_POP(mpi_errno); ret = sem_post(&ckpt_sem); MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_post", "**sem_post %s", MPIU_Strerror(errno)); ret = sem_wait(&cont_sem); MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_wait", "**sem_wait %s", MPIU_Strerror(errno)); mpi_errno = MPID_nem_barrier(); if (mpi_errno) MPIU_ERR_POP(mpi_errno); if (ckpt_result == CKPT_CONTINUE) { for (i = 0; i < MPIDI_Process.my_pg->size; ++i) { MPIDI_VC_t *vc; MPIDI_CH3I_VC *vc_ch; /* We didn't send a marker to ourselves. */ if (i == MPIDI_Process.my_pg_rank) continue; MPIDI_PG_Get_vc(MPIDI_Process.my_pg, i, &vc); vc_ch = VC_CH(vc); if (!vc_ch->is_local) { mpi_errno = vc_ch->ckpt_continue_vc(vc); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } } checkpointing = FALSE; fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_FINISH); return mpi_errno; fn_fail: goto fn_exit; }
/*
 * MPID_nem_newmad_vc_init - initialise the NewMadeleine part of a virtual
 * connection.
 *
 * Fetches the peer's business card through vc->pg->getConnInfo, extracts the
 * peer URL into the VC, connects a gate with nm_session_connect(), attaches
 * the VC to the gate as its reference, marks the VC ACTIVE and installs the
 * netmod send functions.  The eager limit is set to 32768 bytes.
 *
 * A non-success return from nm_session_connect() is only reported on stdout;
 * initialisation carries on regardless (behaviour kept from the original).
 *
 * The temporary business-card buffer is released on every path, including
 * the error paths.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_newmad_vc_init (MPIDI_VC_t *vc)
{
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
    char *business_card = NULL;   /* freed at fn_exit if still allocated */
    int   mpi_errno = MPI_SUCCESS;
    int   val_max_sz;
    int   ret;

#ifdef USE_PMI2_API
    val_max_sz = PMI2_MAX_VALLEN;
#else
    {
        /* A failed query leaves val_max_sz unset, so do not go on to
         * allocate with it. */
        int pmi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
        MPIU_ERR_CHKANDJUMP1(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max", "**pmi_kvs_get_value_length_max %d", pmi_errno);
    }
#endif

    business_card = (char *)MPIU_Malloc(val_max_sz);
    MPIU_ERR_CHKANDJUMP(business_card == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");

    mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card, val_max_sz, vc->pg);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Very important */
    memset(VC_FIELD(vc, url), 0, MPID_NEM_NMAD_MAX_SIZE);

    mpi_errno = MPID_nem_newmad_get_from_bc (business_card, VC_FIELD(vc, url));
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

    /* The card is no longer needed once the URL has been copied out. */
    MPIU_Free(business_card);
    business_card = NULL;

    ret = nm_session_connect(mpid_nem_newmad_session, &(VC_FIELD(vc,p_gate)), VC_FIELD(vc, url));
    if (ret != NM_ESUCCESS)
        fprintf(stdout,"nm_session_connect returned ret = %d\n", ret);

    nm_gate_ref_set(VC_FIELD(vc, p_gate), (void*)vc);
    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    vc->eager_max_msg_sz = 32768;
    vc->rndvSend_fn      = NULL;
    vc->sendNoncontig_fn = MPID_nem_newmad_SendNoncontig;
    vc->comm_ops         = &comm_ops;

    vc_ch->iStartContigMsg = MPID_nem_newmad_iStartContigMsg;
    vc_ch->iSendContig     = MPID_nem_newmad_iSendContig;

 fn_exit:
    if (business_card != NULL)
        MPIU_Free(business_card);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/*
 * MPID_nem_newmad_directRecv - post a receive directly to NewMadeleine.
 *
 * vc   - virtual connection the message is expected on
 * rreq - the receive request (supplies match info, buffer and datatype)
 *
 * For a non-local VC a match tag/mask pair is built from the request's rank,
 * context id and tag (MPI_ANY_TAG widens both the tag and the mask), and a
 * tagged receive is posted on the VC's gate with rreq attached as reference:
 *   - contiguous datatype: a single-buffer receive into
 *     user_buf + dt_true_lb; the request's iov field is set to NULL;
 *   - non-contiguous datatype: an iovec array of NMAD_IOV_MAX_DEPTH entries
 *     is allocated, filled by MPID_nem_newmad_process_rdtype(), posted as an
 *     iov receive and remembered in the request's iov field.
 * For a local VC the request's newmad_req field is zeroed instead.
 *
 * Returns MPI_SUCCESS, or an MPI error code if the iovec array cannot be
 * allocated.
 */
int MPID_nem_newmad_directRecv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);

    if (!VC_CH(vc)->is_local)
    {
        nm_tag_t          match_info = 0;
        nm_tag_t          match_mask = NEM_NMAD_MATCH_FULL_MASK;
        MPIR_Rank_t       source     = rreq->dev.match.parts.rank;
        MPIR_Context_id_t context    = rreq->dev.match.parts.context_id;
        Nmad_Nem_tag_t    tag        = rreq->dev.match.parts.tag;
        int               ret;
        MPIDI_msg_sz_t    data_sz;
        int               dt_contig;
        MPI_Aint          dt_true_lb;
        MPID_Datatype    *dt_ptr;

        NEM_NMAD_DIRECT_MATCH(match_info,0,source,context);
        if (tag != MPI_ANY_TAG)
        {
            NEM_NMAD_SET_TAG(match_info,tag);
        }
        else
        {
            NEM_NMAD_SET_ANYTAG(match_info);
            NEM_NMAD_SET_ANYTAG(match_mask);
        }

#ifdef DEBUG
        fprintf(stdout,"========> Posting Recv req  %p (match is %lx) \n",rreq,match_info);
#endif
        MPIDI_Datatype_get_info(rreq->dev.user_count,rreq->dev.datatype, dt_contig, data_sz, dt_ptr,dt_true_lb);
        rreq->dev.OnDataAvail = NULL;

        /* NOTE(review): the value returned by the nm_sr_irecv_* calls below
         * is stored in ret but never inspected - confirm whether a failure
         * can be reported here. */
        if (dt_contig)
        {
            ret = nm_sr_irecv_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
                                              (char*)(rreq->dev.user_buf) + dt_true_lb,data_sz,
                                              &(REQ_FIELD(rreq,newmad_req)),(void*)rreq);
            REQ_FIELD(rreq,iov) = NULL;
        }
        else
        {
            int           num_seg        = 0;
            struct iovec *newmad_iov     = (struct iovec *)MPIU_Malloc(NMAD_IOV_MAX_DEPTH*sizeof(struct iovec));
            struct iovec *newmad_iov_ptr;

            /* Do not hand a NULL array to the datatype processing below. */
            MPIU_ERR_CHKANDJUMP(newmad_iov == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
            newmad_iov_ptr = &(newmad_iov[0]);

            MPID_nem_newmad_process_rdtype(&rreq,dt_ptr,data_sz,&newmad_iov_ptr,&num_seg);
            MPIU_Assert(num_seg <= NMAD_IOV_MAX_DEPTH);
#ifdef DEBUG
            {
                int index;
                for(index = 0; index < num_seg ; index++)
                {
                    fprintf(stdout,"======================\n");
                    fprintf(stdout,"RECV nmad_iov[%i]: [base %p][len %i]\n",index,
                            newmad_iov[index].iov_base,newmad_iov[index].iov_len);
                }
            }
#endif
            ret = nm_sr_irecv_iov_with_ref_tagged(mpid_nem_newmad_session,VC_FIELD(vc,p_gate),match_info,match_mask,
                                                  newmad_iov,num_seg,&(REQ_FIELD(rreq,newmad_req)),(void*)rreq);
            /* Remember the array in the request; it is not freed here. */
            REQ_FIELD(rreq,iov) = newmad_iov;
        }
    }
    else
    {
        /* Fixme : this might not work in the case of multiple netmods */
        memset((&(REQ_FIELD(rreq,newmad_req))),0,sizeof(nm_sr_request_t));
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_DIRECTRECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPIDI_CH3I_VC *vc_ch = VC_CH(vc); MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req); /* obtain dt_true_lb */ /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */ MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */ MPID_nem_ib_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t)); /* remember address to "free" when receiving DONE from receiver */ req->ch.s_cookie = s_cookie_buf; /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */ //assert(dt_true_lb == 0); void *write_from_buf; if (dt_contig) { write_from_buf = req->dev.user_buf; } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char *) (REQ_FIELD(req, lmt_pack_buf))); 
MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } dprintf ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n", dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf)); #ifdef HAVE_LIBDCFA #else s_cookie_buf->addr = write_from_buf; #endif /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */ /* TODO remove sz field * /* pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */ //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz; /* preserve and put tail, because tail magic is written on the tail of payload * because we don't want to add another SGE or RDMA command */ MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz); s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t))); /* prepare magic */ //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC; #if 1 /* embed RDMA-write-to buffer occupancy information */ dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail); /* embed RDMA-write-to buffer occupancy information */ s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; /* remember the last one sent */ vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; #endif /* put IB rkey */ struct ibv_mr *mr = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, data_sz); MPIU_ERR_CHKANDJUMP(!mr, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch"); #ifdef HAVE_LIBDCFA s_cookie_buf->addr = (void *) mr->host_addr; dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr); #endif s_cookie_buf->rkey = mr->rkey; dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n", s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t), *((uint8_t *) (write_from_buf + 
data_sz - sizeof(uint8_t))), data_sz, s_cookie_buf->addr, s_cookie_buf->rkey); /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */ MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf, sizeof(MPID_nem_ib_lmt_cookie_t)); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); return mpi_errno; fn_fail: goto fn_exit; }
/*
 * MPID_nem_ib_lmt_start_recv - receiver side: react to an RTS by starting
 * (or queueing) the RDMA read of the sender's buffer.
 *
 * vc       - virtual connection to the sender
 * req      - the receive request
 * s_cookie - IOV whose iov_base points at the sender's cookie
 *
 * Steps:
 *   1. Choose the destination: the user buffer for contiguous data,
 *      otherwise a staging buffer from MPID_nem_ib_stmalloc().
 *   2. Unless MPID_NEM_IB_LMT_GET_CQE is defined, write the bitwise
 *      complement of the sender's tail byte into the last destination byte,
 *      and stash the sender's tail byte in the request (the cookie is freed
 *      by the caller right after this function returns).
 *   3. If the send queue is empty and there is room in the send and
 *      completion queues, issue the read immediately through
 *      MPID_nem_ib_lmt_start_recv_core(); otherwise save address, rkey and
 *      destination in the request and enqueue it.
 *   4. Update lsr_seq_num_tail from the cookie and, if the send queue is
 *      non-empty and its head is ready, drive send progress.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_ib_lmt_start_recv(struct MPIDI_VC *vc, struct MPID_Request *req, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);

    dprintf("lmt_start_recv,enter,%d<-%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    MPID_nem_ib_lmt_cookie_t *s_cookie_buf = s_cookie.iov_base;

    /* stash vc for ib_poll */
    req->ch.vc = vc;

    void *write_to_buf;
    if (dt_contig) {
        /* NOTE(review): the true lower bound is not applied here; code
         * outside this function may rely on the same address - confirm
         * before changing. */
        write_to_buf = (void *) ((char *) req->dev.user_buf);
    }
    else {
        REQ_FIELD(req, lmt_pack_buf) = MPID_nem_ib_stmalloc((size_t) req->ch.lmt_data_sz);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        write_to_buf = REQ_FIELD(req, lmt_pack_buf);
    }

    /* Address of the last destination byte; a byte pointer is used because
     * arithmetic on void * is not standard C. */
    uint8_t *tail_ptr = (uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t);

#ifndef MPID_NEM_IB_LMT_GET_CQE
    /* unmark magic */
    *tail_ptr = ~s_cookie_buf->tail;    /* size in cookie was not set */
#endif
    dprintf
        ("lmt_start_recv,dt_contig=%d,write_to_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p,marked-tail=%02x,unmarked-tail=%02x\n",
         dt_contig, write_to_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf),
         s_cookie_buf->tail, *tail_ptr);

    /* stash tail for poll because do_cts in mpid_nem_lmt.c free s_cookie_buf just after this function */
    REQ_FIELD(req, lmt_tail) = s_cookie_buf->tail;
    dprintf("lmt_start_recv,mem-tail=%p,%02x\n", (void *) tail_ptr, *tail_ptr);

    /* try to issue RDMA-read command */
    int slack = 1;              /* slack for control packet bringing sequence number */
    if (MPID_nem_ib_sendq_empty(vc_ib->sendq) &&
        vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY - slack &&
        MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY - slack) {
        /* fast path not storing raddr and rkey */
        mpi_errno = MPID_nem_ib_lmt_start_recv_core(req, s_cookie_buf->addr, s_cookie_buf->rkey,
                                                    write_to_buf);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }
    else {
        /* enqueue command into send_queue */
        dprintf("lmt_start_recv, enqueuing,sendq_empty=%d,ncom=%d,ncqe=%d\n",
                MPID_nem_ib_sendq_empty(vc_ib->sendq),
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY);

        /* make raddr, (sz is in rreq->ch.lmt_data_sz), rkey, (user_buf is in req->dev.user_buf) survive enqueue, free cookie, dequeue */
        REQ_FIELD(req, lmt_raddr) = s_cookie_buf->addr;
        REQ_FIELD(req, lmt_rkey) = s_cookie_buf->rkey;
        REQ_FIELD(req, lmt_write_to_buf) = write_to_buf;

        MPID_nem_ib_sendq_enqueue(&vc_ib->sendq, req);
    }

    /* extract embeded RDMA-write-to buffer occupancy information */
    dprintf("lmt_start_recv,old lsr_seq_num=%d,s_cookie_buf->seq_num_tail=%d\n",
            vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);
    vc_ib->ibcom->lsr_seq_num_tail =
        MPID_NEM_IB_MAX(vc_ib->ibcom->lsr_seq_num_tail, s_cookie_buf->seq_num_tail);

#ifndef MPID_NEM_IB_DISABLE_VAR_OCC_NOTIFY_RATE
    /* change remote notification policy of RDMA-write-to buf */
    MPID_nem_ib_change_rdmabuf_occupancy_notify_policy_lw(vc_ib, &vc_ib->ibcom->lsr_seq_num_tail);
#endif

    /* try to send from sendq because at least one RDMA-write-to buffer has been released */
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq)) {
        dprintf("lmt_start_recv,ncom=%d,ncqe=%d,diff=%d\n",
                vc_ib->ibcom->ncom < MPID_NEM_IB_COM_MAX_SQ_CAPACITY,
                MPID_nem_ib_ncqe < MPID_NEM_IB_COM_MAX_CQ_CAPACITY,
                MPID_nem_ib_diff32(vc_ib->ibcom->sseq_num,
                                   vc_ib->ibcom->lsr_seq_num_tail) < MPID_NEM_IB_COM_RDMABUF_NSEG);
    }
    if (!MPID_nem_ib_sendq_empty(vc_ib->sendq) && MPID_nem_ib_sendq_ready_to_send_head(vc_ib)) {
        dprintf("lmt_start_recv,send_progress\n");
        fflush(stdout);
        MPID_nem_ib_send_progress(vc_ib);
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}