int MPIR_Finalize_async_thread(void)
{
    int mpi_errno = MPI_SUCCESS;
#if MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE
    MPIR_Request *request_ptr = NULL;
    MPI_Request request;
    MPI_Status status;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);

    /* Wake the progress thread by matching the zero-byte WAKE_TAG receive
     * it posted on the private progress communicator. */
    mpi_errno = MPID_Isend(NULL, 0, MPI_CHAR, 0, WAKE_TAG, progress_comm_ptr,
                           MPIR_CONTEXT_INTRA_PT2PT, &request_ptr);
    MPIR_Assert(!mpi_errno);
    request = request_ptr->handle;
    mpi_errno = MPIR_Wait_impl(&request, &status);
    MPIR_Assert(!mpi_errno);

    /* XXX DJG why is this unlock/lock necessary?  Should we just YIELD here
     * or later? */
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* Wait until the progress thread announces that it has finished. */
    MPID_Thread_mutex_lock(&progress_mutex, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    while (!progress_thread_done) {
        MPID_Thread_cond_wait(&progress_cond, &progress_mutex, &mpi_errno);
        MPIR_Assert(!mpi_errno);
    }

    MPID_Thread_mutex_unlock(&progress_mutex, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    mpi_errno = MPIR_Comm_free_impl(progress_comm_ptr);
    MPIR_Assert(!mpi_errno);

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    MPID_Thread_cond_destroy(&progress_cond, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    MPID_Thread_mutex_destroy(&progress_mutex, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);
#endif /* MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE */

    return mpi_errno;
}
static void progress_fn(void *data)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Request *request_ptr = NULL;
    MPI_Request request;
    MPI_Status status;

    /* Explicitly add CS_ENTER/EXIT since this thread is created from
     * within an internal function and will call NMPI functions
     * directly. */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* FIXME: We assume that waiting on some request forces progress
     * on all requests.  With fine-grained threads, will this still
     * work as expected?  We can imagine an approach where a request on
     * a non-conflicting communicator would not touch the remaining
     * requests to avoid locking issues.  Once the fine-grained threads
     * code is fully functional, we need to revisit this and, if
     * appropriate, either change what we do in this thread, or delete
     * this comment. */

    mpi_errno = MPID_Irecv(NULL, 0, MPI_CHAR, 0, WAKE_TAG, progress_comm_ptr,
                           MPIR_CONTEXT_INTRA_PT2PT, &request_ptr);
    MPIR_Assert(!mpi_errno);
    request = request_ptr->handle;
    mpi_errno = MPIR_Wait_impl(&request, &status);
    MPIR_Assert(!mpi_errno);

    /* Send a signal to the main thread saying we are done */
    MPID_Thread_mutex_lock(&progress_mutex, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    progress_thread_done = 1;

    MPID_Thread_mutex_unlock(&progress_mutex, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    MPID_Thread_cond_signal(&progress_cond, &mpi_errno);
    MPIR_Assert(!mpi_errno);

    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    return;
}
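/*
 * For context, a minimal sketch of the setup routine that the two functions
 * above assume: something in the style of MPIR_Init_async_thread() has to
 * create the private progress communicator, the condition-variable/mutex
 * pair, and the thread that runs progress_fn().  The exact internal calls
 * (MPIR_Comm_dup_impl, MPID_Thread_create, etc.) differ between MPICH
 * versions, so every name below is an assumption rather than the actual
 * implementation.
 */
#if 0   /* illustrative sketch only, not part of the build */
static MPID_Thread_id_t progress_thread_id;

int MPIR_Init_async_thread_sketch(void)
{
    int mpi_errno = MPI_SUCCESS;
#if MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE
    int err = 0;

    /* Private communicator so the WAKE_TAG receive can never match
     * application traffic. */
    mpi_errno = MPIR_Comm_dup_impl(MPIR_Process.comm_self, &progress_comm_ptr);
    if (mpi_errno)
        return mpi_errno;

    /* Objects used by the shutdown handshake in MPIR_Finalize_async_thread(). */
    progress_thread_done = 0;
    MPID_Thread_cond_create(&progress_cond, &err);
    MPID_Thread_mutex_create(&progress_mutex, &err);

    /* Spawn the thread; it blocks in progress_fn() until finalize wakes it. */
    MPID_Thread_create(progress_fn, NULL, &progress_thread_id, &err);
#endif
    return mpi_errno;
}
#endif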
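/*
 * The blocking branch of MPIDI_CH3I_MRAILI_Cq_poll() below sleeps in
 * ibv_get_cq_event() and then re-arms the CQ with ibv_req_notify_cq().
 * That only works if each CQ was created on a completion channel and armed
 * once during setup.  The following is a minimal, self-contained sketch of
 * that setup using only standard libibverbs calls; the helper name
 * setup_blocking_cq() is illustrative and not part of MVAPICH2.
 */
#if 0   /* illustrative sketch only, not part of the build */
#include <infiniband/verbs.h>
#include <stddef.h>

static struct ibv_cq *setup_blocking_cq(struct ibv_context *ctx, int cqe,
                                        struct ibv_comp_channel **channel_out)
{
    /* Completion channel: the file descriptor the process sleeps on once it
     * decides to stop spinning. */
    struct ibv_comp_channel *channel = ibv_create_comp_channel(ctx);
    if (!channel)
        return NULL;

    /* CQ bound to the channel; the last argument selects the completion
     * vector. */
    struct ibv_cq *cq = ibv_create_cq(ctx, cqe, NULL, channel, 0);
    if (!cq)
        return NULL;

    /* Arm the CQ once up front; after every wakeup the poller must call
     * ibv_ack_cq_events() and ibv_req_notify_cq() again, exactly as the
     * blocking branch of Cq_poll() does. */
    if (ibv_req_notify_cq(cq, 0))
        return NULL;

    *channel_out = channel;
    return cq;
}
#endif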
int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **vbuf_handle, MPIDI_VC_t *vc_req,
                              int receiving, int is_blocking)
{
    int ne, ret;
    MPIDI_VC_t *vc = NULL;
    struct ibv_wc wc;
    vbuf *v;
    int i = 0;
    int cq_choice = 0;
    int num_cqs = 0;
    int needed;
    int is_send_completion;
    int type = T_CHANNEL_NO_ARRIVE;
    static unsigned long nspin = 0;
    struct ibv_cq *ev_cq;
    struct ibv_cq *chosen_cq;
    void *ev_ctx;
    MPIDI_CH3I_MRAILI_Pkt_comm_header *p;
    int myrank;
    MPIDI_STATE_DECL(MPID_GEN2_MRAILI_CQ_POLL);
    MPIDI_FUNC_ENTER(MPID_GEN2_MRAILI_CQ_POLL);

    /* PMI_Get_rank() returns an error code; the rank comes back through the
     * pointer argument, so do not overwrite it with the return value. */
    PMI_Get_rank(&myrank);

    *vbuf_handle = NULL;
    needed = 0;

    if (!receiving && !vc_req) {
        type = MPIDI_CH3I_MRAILI_Test_pkt(vbuf_handle);
        if (type == T_CHANNEL_EXACT_ARRIVE ||
            type == T_CHANNEL_CONTROL_MSG_ARRIVE)
            goto fn_exit;
    }

    if (rdma_iwarp_use_multiple_cq &&
        MV2_IS_CHELSIO_IWARP_CARD(MPIDI_CH3I_RDMA_Process.hca_type) &&
        (MPIDI_CH3I_RDMA_Process.cluster_size != VERY_SMALL_CLUSTER)) {
        num_cqs = 2;
    } else {
        num_cqs = 1;
    }

    for (; i < rdma_num_hcas; ++i) {
        for (cq_choice = 0; cq_choice < num_cqs; ++cq_choice) {
            if (1 == num_cqs) {
                chosen_cq = MPIDI_CH3I_RDMA_Process.cq_hndl[i];
            } else {
                if (0 == cq_choice) {
                    chosen_cq = MPIDI_CH3I_RDMA_Process.send_cq_hndl[i];
                } else {
                    chosen_cq = MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i];
                }
            }

            ne = ibv_poll_cq(chosen_cq, 1, &wc);
            if (ne < 0) {
                ibv_error_abort(IBV_RETURN_ERR, "Fail to poll cq\n");
            } else if (ne) {
                v = (vbuf *) ((uintptr_t) wc.wr_id);
                vc = (MPIDI_VC_t *) (v->vc);
                cq_poll_completion = 1;

                if (wc.status != IBV_WC_SUCCESS) {
                    if (wc.opcode == IBV_WC_SEND ||
                        wc.opcode == IBV_WC_RDMA_WRITE) {
                        fprintf(stderr, "[%d->%d] send desc error, wc_opcode=%d\n",
                                myrank, vc->pg_rank, wc.opcode);
                    } else {
                        fprintf(stderr, "[%d<-%d] recv desc error, wc_opcode=%d\n",
                                myrank, vc->pg_rank, wc.opcode);
                    }
                    fprintf(stderr,
                            "[%d->%d] wc.status=%d, wc.wr_id=%p, wc.opcode=%d, "
                            "vbuf->phead->type=%d = %s\n",
                            myrank, vc->pg_rank, wc.status, v, wc.opcode,
                            ((MPIDI_CH3I_MRAILI_Pkt_comm_header *) v->pheader)->type,
                            MPIDI_CH3_Pkt_type_to_string[
                                ((MPIDI_CH3I_MRAILI_Pkt_comm_header *) v->pheader)->type]);
                    ibv_va_error_abort(IBV_STATUS_ERR,
                            "[] Got completion with error %d, "
                            "vendor code=0x%x, dest rank=%d\n",
                            wc.status, wc.vendor_err,
                            ((MPIDI_VC_t *) v->vc)->pg_rank);
                }

                is_send_completion = (wc.opcode == IBV_WC_SEND ||
                                      wc.opcode == IBV_WC_RDMA_WRITE ||
                                      wc.opcode == IBV_WC_RDMA_READ);

                if (2 == num_cqs) {
                    if (0 == cq_choice) {
                        if (MPIDI_CH3I_RDMA_Process.global_used_send_cq) {
                            MPIDI_CH3I_RDMA_Process.global_used_send_cq--;
                        } else {
                            DEBUG_PRINT("[%d] Possibly received a duplicate "
                                        "send completion event\n",
                                        MPIDI_Process.my_pg_rank);
                        }
                    }
                } else {
                    if (is_send_completion &&
                        (MPIDI_CH3I_RDMA_Process.global_used_send_cq > 0)) {
                        MPIDI_CH3I_RDMA_Process.global_used_send_cq--;
                    } else {
                        DEBUG_PRINT("[%d] Possibly received a duplicate "
                                    "send completion event\n",
                                    MPIDI_Process.my_pg_rank);
                    }
                }

                if (!is_send_completion &&
                    (MPIDI_CH3I_RDMA_Process.has_srq ||
                     v->transport == IB_TRANSPORT_UD)) {
                    SET_PKT_LEN_HEADER(v, wc);
                    SET_PKT_HEADER_OFFSET(v);
                    p = v->pheader;
#ifdef _ENABLE_UD_
                    MPIDI_PG_Get_vc(MPIDI_Process.my_pg, p->src.rank, &vc);
#else
                    vc = (MPIDI_VC_t *) p->src.vc_addr;
#endif
                    v->vc = vc;
                    v->rail = p->rail;
                }

                /* get the VC and increase its wqe */
                if (is_send_completion) {
#ifdef _ENABLE_UD_
                    if (rdma_enable_hybrid) {
                        if (v->transport == IB_TRANSPORT_RC ||
                            (v->pheader && IS_CNTL_MSG(v->pheader))) {
                            MRAILI_Process_send(v);
                        }
                        if (v->transport == IB_TRANSPORT_UD) {
                            mv2_ud_update_send_credits(v);
                        }
                        if (v->transport == IB_TRANSPORT_UD &&
                            v->flags & UD_VBUF_SEND_INPROGRESS) {
                            v->flags &= ~(UD_VBUF_SEND_INPROGRESS);
                            if (v->flags & UD_VBUF_FREE_PENIDING) {
                                v->flags &= ~(UD_VBUF_FREE_PENIDING);
                                MRAILI_Release_vbuf(v);
                            }
                        }
                    } else
#endif
                    {
                        MRAILI_Process_send(v);
                    }
                    type = T_CHANNEL_NO_ARRIVE;
                    *vbuf_handle = NULL;
                } else if ((NULL == vc_req || vc_req == vc) && 0 == receiving) {
                    /* In this case, we should return the vbuf any way if it
                     * is next expected */
                    int seqnum = GetSeqNumVbuf(v);

                    *vbuf_handle = v;
                    SET_PKT_LEN_HEADER(v, wc);
                    SET_PKT_HEADER_OFFSET(v);
                    v->seqnum = seqnum;
                    p = v->pheader;
                    PRINT_DEBUG(DEBUG_UD_verbose > 1,
                            "Received from rank:%d seqnum:%d ack:%d size:%d "
                            "type:%d transport:%d\n",
                            vc->pg_rank, v->seqnum, p->acknum, v->content_size,
                            p->type, v->transport);
#ifdef _ENABLE_UD_
                    if (v->transport == IB_TRANSPORT_UD) {
                        mv2_ud_ctx_t *ud_ctx = MPIDI_CH3I_RDMA_Process.ud_rails[i];
                        --ud_ctx->num_recvs_posted;
                        if (ud_ctx->num_recvs_posted < ud_ctx->credit_preserve) {
                            ud_ctx->num_recvs_posted += mv2_post_ud_recv_buffers(
                                    (rdma_default_max_ud_recv_wqe -
                                     ud_ctx->num_recvs_posted), ud_ctx);
                        }
                    } else
#endif
                    if (MPIDI_CH3I_RDMA_Process.has_srq) {
                        pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock);

                        if (v->padding == NORMAL_VBUF_FLAG) {
                            /* Can only be from SRQ path */
                            --MPIDI_CH3I_RDMA_Process.posted_bufs[i];
                        }

                        if (MPIDI_CH3I_RDMA_Process.posted_bufs[i] <= rdma_credit_preserve) {
                            /* Need to post more to the SRQ */
                            MPIDI_CH3I_RDMA_Process.posted_bufs[i] +=
                                viadev_post_srq_buffers(viadev_srq_fill_size -
                                        MPIDI_CH3I_RDMA_Process.posted_bufs[i], i);
                        }

                        pthread_spin_unlock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock);

                        /* Check if we need to release the SRQ limit thread */
                        if (MPIDI_CH3I_RDMA_Process.srq_zero_post_counter[i] >= 1) {
                            pthread_mutex_lock(
                                    &MPIDI_CH3I_RDMA_Process.srq_post_mutex_lock[i]);
                            MPIDI_CH3I_RDMA_Process.srq_zero_post_counter[i] = 0;
                            pthread_cond_signal(
                                    &MPIDI_CH3I_RDMA_Process.srq_post_cond[i]);
                            pthread_mutex_unlock(
                                    &MPIDI_CH3I_RDMA_Process.srq_post_mutex_lock[i]);
                        }
                    } else {
                        --vc->mrail.srp.credits[v->rail].preposts;
                        needed = rdma_prepost_depth + rdma_prepost_noop_extra +
                            MIN(rdma_prepost_rendezvous_extra,
                                vc->mrail.srp.credits[v->rail].rendezvous_packets_expected);
                    }

#ifdef _ENABLE_UD_
                    if (rdma_enable_hybrid) {
                        if (IS_CNTL_MSG(p)) {
                            type = T_CHANNEL_CONTROL_MSG_ARRIVE;
                        } else {
                            type = T_CHANNEL_HYBRID_MSG_ARRIVE;
                        }
                    } else
#endif
                    {
                        if (seqnum == PKT_NO_SEQ_NUM) {
                            type = T_CHANNEL_CONTROL_MSG_ARRIVE;
                        } else if (seqnum == vc->mrail.seqnum_next_torecv) {
                            vc->mrail.seqnum_next_toack = vc->mrail.seqnum_next_torecv;
                            ++vc->mrail.seqnum_next_torecv;
                            type = T_CHANNEL_EXACT_ARRIVE;
                            DEBUG_PRINT("[channel manager] get one with exact seqnum\n");
                        } else {
                            type = T_CHANNEL_OUT_OF_ORDER_ARRIVE;
                            VQUEUE_ENQUEUE(&vc->mrail.cmanager,
                                    INDEX_GLOBAL(&vc->mrail.cmanager, v->rail), v);
                            DEBUG_PRINT("get recv %d (%d)\n",
                                    seqnum, vc->mrail.seqnum_next_torecv);
                        }
                    }

                    if (!MPIDI_CH3I_RDMA_Process.has_srq &&
                        v->transport != IB_TRANSPORT_UD) {
                        if (PKT_IS_NOOP(v)) {
                            PREPOST_VBUF_RECV(vc, v->rail);
                            /* noops don't count for credits */
                            --vc->mrail.srp.credits[v->rail].local_credit;
                        } else if ((vc->mrail.srp.credits[v->rail].preposts < rdma_rq_size) &&
                                   (vc->mrail.srp.credits[v->rail].preposts +
                                    rdma_prepost_threshold < needed)) {
                            do {
                                PREPOST_VBUF_RECV(vc, v->rail);
                            } while (vc->mrail.srp.credits[v->rail].preposts < rdma_rq_size &&
                                     vc->mrail.srp.credits[v->rail].preposts < needed);
                        }
                        MRAILI_Send_noop_if_needed(vc, v->rail);
                    }

                    if (type == T_CHANNEL_CONTROL_MSG_ARRIVE ||
                        type == T_CHANNEL_EXACT_ARRIVE ||
                        type == T_CHANNEL_HYBRID_MSG_ARRIVE ||
                        type == T_CHANNEL_OUT_OF_ORDER_ARRIVE) {
                        goto fn_exit;
                    }
                } else {
                    /* Commenting out the assert - possible coding error
                     * MPIU_Assert(0); */
                    /* Now since this is not the packet we want, we have to
                     * enqueue it */
                    type = T_CHANNEL_OUT_OF_ORDER_ARRIVE;
                    *vbuf_handle = NULL;
                    v->content_size = wc.byte_len;
                    VQUEUE_ENQUEUE(&vc->mrail.cmanager,
                            INDEX_GLOBAL(&vc->mrail.cmanager, v->rail), v);

                    if (v->transport != IB_TRANSPORT_UD) {
                        if (MPIDI_CH3I_RDMA_Process.has_srq) {
                            pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock);

                            if (v->padding == NORMAL_VBUF_FLAG) {
                                /* Can only be from SRQ path */
                                --MPIDI_CH3I_RDMA_Process.posted_bufs[i];
                            }

                            if (MPIDI_CH3I_RDMA_Process.posted_bufs[i] <= rdma_credit_preserve) {
                                /* Need to post more to the SRQ */
                                MPIDI_CH3I_RDMA_Process.posted_bufs[i] +=
                                    viadev_post_srq_buffers(viadev_srq_fill_size -
                                            MPIDI_CH3I_RDMA_Process.posted_bufs[i], i);
                            }

                            pthread_spin_unlock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock);
                        } else {
                            --vc->mrail.srp.credits[v->rail].preposts;
                            needed = rdma_prepost_depth + rdma_prepost_noop_extra +
                                MIN(rdma_prepost_rendezvous_extra,
                                    vc->mrail.srp.credits[v->rail].rendezvous_packets_expected);

                            if (PKT_IS_NOOP(v)) {
                                PREPOST_VBUF_RECV(vc, v->rail);
                                --vc->mrail.srp.credits[v->rail].local_credit;
                            } else if ((vc->mrail.srp.credits[v->rail].preposts < rdma_rq_size) &&
                                       (vc->mrail.srp.credits[v->rail].preposts +
                                        rdma_prepost_threshold < needed)) {
                                do {
                                    PREPOST_VBUF_RECV(vc, v->rail);
                                } while (vc->mrail.srp.credits[v->rail].preposts < rdma_rq_size &&
                                         vc->mrail.srp.credits[v->rail].preposts < needed);
                            }
                            MRAILI_Send_noop_if_needed(vc, v->rail);
                        }
                    }
                }
            } else {
                *vbuf_handle = NULL;
                type = T_CHANNEL_NO_ARRIVE;
                ++nspin;

                /* Blocking mode progress */
                if (rdma_use_blocking && is_blocking &&
                    nspin >= rdma_blocking_spin_count_threshold) {
                    /* Okay ... spun long enough, now time to go to sleep! */
#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
                    MPIU_THREAD_CHECK_BEGIN
                    MPID_Thread_mutex_unlock(&MPIR_ThreadInfo.global_mutex);
                    MPIU_THREAD_CHECK_END
#endif
                    do {
                        ret = ibv_get_cq_event(
                                MPIDI_CH3I_RDMA_Process.comp_channel[i],
                                &ev_cq, &ev_ctx);
                        if (ret && errno != EINTR) {
                            ibv_va_error_abort(IBV_RETURN_ERR,
                                    "Failed to get cq event: %d\n", ret);
                        }
                    } while (ret && errno == EINTR);
#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
                    MPIU_THREAD_CHECK_BEGIN
                    MPID_Thread_mutex_lock(&MPIR_ThreadInfo.global_mutex);
                    MPIU_THREAD_CHECK_END
#endif

                    if (num_cqs == 1) {
                        if (ev_cq != MPIDI_CH3I_RDMA_Process.cq_hndl[i]) {
                            ibv_error_abort(IBV_STATUS_ERR, "Event in unknown CQ\n");
                        }

                        ibv_ack_cq_events(MPIDI_CH3I_RDMA_Process.cq_hndl[i], 1);

                        if (ibv_req_notify_cq(MPIDI_CH3I_RDMA_Process.cq_hndl[i], 0)) {
                            ibv_error_abort(IBV_RETURN_ERR,
                                    "Couldn't request for CQ notification\n");
                        }
                    } else {
                        if (ev_cq == MPIDI_CH3I_RDMA_Process.send_cq_hndl[i]) {
                            ibv_ack_cq_events(
                                    MPIDI_CH3I_RDMA_Process.send_cq_hndl[i], 1);

                            if (ibv_req_notify_cq(
                                        MPIDI_CH3I_RDMA_Process.send_cq_hndl[i], 0)) {
                                ibv_error_abort(IBV_RETURN_ERR,
                                        "Couldn't request for CQ notification\n");
                            }
                        } else if (ev_cq == MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i]) {
                            ibv_ack_cq_events(
                                    MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i], 1);

                            if (ibv_req_notify_cq(
                                        MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i], 0)) {
                                ibv_error_abort(IBV_RETURN_ERR,
                                        "Couldn't request for CQ notification\n");
                            }
                        } else {
                            ibv_error_abort(IBV_STATUS_ERR, "Event in unknown CQ\n");
                        }
                    }
                    nspin = 0;
                }
            }
        }
    }