Example #1
0
int MPIR_Finalize_async_thread(void)
{
    int rc = MPI_SUCCESS;
#if MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE
    MPIR_Request *sreq = NULL;
    MPI_Request sreq_handle;
    MPI_Status send_status;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);

    /* Wake the progress thread: a zero-byte send on the private progress
     * communicator matches the receive it is blocked in, telling it to
     * shut down. */
    rc = MPID_Isend(NULL, 0, MPI_CHAR, 0, WAKE_TAG, progress_comm_ptr,
                    MPIR_CONTEXT_INTRA_PT2PT, &sreq);
    MPIR_Assert(!rc);
    sreq_handle = sreq->handle;
    rc = MPIR_Wait_impl(&sreq_handle, &send_status);
    MPIR_Assert(!rc);

    /* Drop the global critical section while we block on the condition
     * variable, so the progress thread can run to completion.
     * (XXX DJG why is this unlock/lock necessary?  Should we just YIELD
     * here or later?) */
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    MPID_Thread_mutex_lock(&progress_mutex, &rc);
    MPIR_Assert(!rc);

    /* Block until the progress thread reports that it has finished. */
    while (!progress_thread_done) {
        MPID_Thread_cond_wait(&progress_cond, &progress_mutex, &rc);
        MPIR_Assert(!rc);
    }

    MPID_Thread_mutex_unlock(&progress_mutex, &rc);
    MPIR_Assert(!rc);

    rc = MPIR_Comm_free_impl(progress_comm_ptr);
    MPIR_Assert(!rc);

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* The progress thread is gone; tear down its synchronization objects. */
    MPID_Thread_cond_destroy(&progress_cond, &rc);
    MPIR_Assert(!rc);

    MPID_Thread_mutex_destroy(&progress_mutex, &rc);
    MPIR_Assert(!rc);

    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_FINALIZE_ASYNC_THREAD);

#endif /* MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE */
    return rc;
}
Example #2
0
static void progress_fn(void * data)
{
    int rc = MPI_SUCCESS;
    MPIR_Request *rreq = NULL;
    MPI_Request rreq_handle;
    MPI_Status recv_status;

    /* Explicitly add CS_ENTER/EXIT since this thread is created from
     * within an internal function and will call NMPI functions
     * directly. */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* FIXME: We assume that waiting on some request forces progress
     * on all requests. With fine-grained threads, will this still
     * work as expected? We can imagine an approach where a request on
     * a non-conflicting communicator would not touch the remaining
     * requests to avoid locking issues. Once the fine-grained threads
     * code is fully functional, we need to revisit this and, if
     * appropriate, either change what we do in this thread, or delete
     * this comment. */

    /* Park in a zero-byte receive on the private progress communicator;
     * waiting on it drives progress until the finalize path posts the
     * matching wake-up send. */
    rc = MPID_Irecv(NULL, 0, MPI_CHAR, 0, WAKE_TAG, progress_comm_ptr,
                    MPIR_CONTEXT_INTRA_PT2PT, &rreq);
    MPIR_Assert(!rc);
    rreq_handle = rreq->handle;
    rc = MPIR_Wait_impl(&rreq_handle, &recv_status);
    MPIR_Assert(!rc);

    /* Publish completion to the main thread: set the flag under the
     * mutex, then signal the condition variable. */
    MPID_Thread_mutex_lock(&progress_mutex, &rc);
    MPIR_Assert(!rc);

    progress_thread_done = 1;

    MPID_Thread_mutex_unlock(&progress_mutex, &rc);
    MPIR_Assert(!rc);

    MPID_Thread_cond_signal(&progress_cond, &rc);
    MPIR_Assert(!rc);

    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    return;
}
Example #3
0
int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **vbuf_handle, 
        MPIDI_VC_t * vc_req, int receiving, int is_blocking)
{
    int ne, ret;
    MPIDI_VC_t *vc = NULL;
    struct ibv_wc wc;
    vbuf *v;
    int i = 0;
    int cq_choice = 0;
    int num_cqs = 0;
    int needed;
    int is_send_completion;
    int type = T_CHANNEL_NO_ARRIVE;
    static unsigned long nspin = 0;
    struct ibv_cq *ev_cq; 
    struct ibv_cq *chosen_cq; 
    void *ev_ctx;
    MPIDI_CH3I_MRAILI_Pkt_comm_header *p;

    int myrank;
    MPIDI_STATE_DECL(MPID_GEN2_MRAILI_CQ_POLL);
    MPIDI_FUNC_ENTER(MPID_GEN2_MRAILI_CQ_POLL);
    myrank = PMI_Get_rank(&myrank);

    *vbuf_handle = NULL;
    needed = 0;

    if (!receiving && !vc_req) {
        type = MPIDI_CH3I_MRAILI_Test_pkt(vbuf_handle);
        if (type == T_CHANNEL_EXACT_ARRIVE 
                || type == T_CHANNEL_CONTROL_MSG_ARRIVE)
            goto fn_exit;
    }

    if (rdma_iwarp_use_multiple_cq &&
        MV2_IS_CHELSIO_IWARP_CARD(MPIDI_CH3I_RDMA_Process.hca_type) &&
        (MPIDI_CH3I_RDMA_Process.cluster_size != VERY_SMALL_CLUSTER)) {
        num_cqs = 2;
    } else {
        num_cqs = 1;
    }

    for (; i < rdma_num_hcas; ++i) {
        for (cq_choice = 0; cq_choice < num_cqs; ++cq_choice) {
            if (1 == num_cqs) {
	            chosen_cq = MPIDI_CH3I_RDMA_Process.cq_hndl[i];
	        } else {
	            if (0 == cq_choice) {
	                chosen_cq = MPIDI_CH3I_RDMA_Process.send_cq_hndl[i];
                } else {
	                chosen_cq = MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i];
                }
	        }
	        ne = ibv_poll_cq(chosen_cq, 1, &wc);
	        if (ne < 0 ) {
	            ibv_error_abort(IBV_RETURN_ERR, "Fail to poll cq\n");
	        } else if (ne) {         
	            v = (vbuf *) ((uintptr_t) wc.wr_id);
	
	            vc = (MPIDI_VC_t *) (v->vc);
                cq_poll_completion = 1;
	
	            if (wc.status != IBV_WC_SUCCESS) {
	                if (wc.opcode == IBV_WC_SEND ||
	                    wc.opcode == IBV_WC_RDMA_WRITE ) {
			    		fprintf(stderr, "[%d->%d] send desc error, wc_opcode=%d\n",myrank, vc->pg_rank, wc.opcode );
	                } else {
			    		fprintf(stderr, "[%d<-%d] recv desc error, wc_opcode=%d\n",myrank, vc->pg_rank, wc.opcode);
					}
                    fprintf(stderr, "[%d->%d] wc.status=%d, wc.wr_id=%p, wc.opcode=%d, vbuf->phead->type=%d = %s\n", 
                           myrank, vc->pg_rank, wc.status, v, 
			               wc.opcode,((MPIDI_CH3I_MRAILI_Pkt_comm_header*)v->pheader)->type, 
           			MPIDI_CH3_Pkt_type_to_string[((MPIDI_CH3I_MRAILI_Pkt_comm_header*)v->pheader)->type] );
	
	                ibv_va_error_abort(IBV_STATUS_ERR,
	                        "[] Got completion with error %d, "
	                        "vendor code=0x%x, dest rank=%d\n",
	                        wc.status,    
	                        wc.vendor_err, 
	                        ((MPIDI_VC_t *)v->vc)->pg_rank
	                        );
	            }

                is_send_completion = (wc.opcode == IBV_WC_SEND
                    || wc.opcode == IBV_WC_RDMA_WRITE
                    || wc.opcode == IBV_WC_RDMA_READ);
	
                if (2 == num_cqs) {
    	            if (0 == cq_choice) {
    	                if (MPIDI_CH3I_RDMA_Process.global_used_send_cq) {
                             MPIDI_CH3I_RDMA_Process.global_used_send_cq--;
    	                } else {
                            DEBUG_PRINT("[%d] Possibly received a duplicate \
                                       send completion event \n", 
                                       MPIDI_Process.my_pg_rank);
    	                }
    	            } 
                } else {
                       if(is_send_completion && 
                              (MPIDI_CH3I_RDMA_Process.global_used_send_cq > 0)) {
                             MPIDI_CH3I_RDMA_Process.global_used_send_cq--;
                       } else {
                            DEBUG_PRINT("[%d] Possibly received a duplicate \
                                       send completion event \n",
                                       MPIDI_Process.my_pg_rank);
                       }     
                }
 
	            if(!is_send_completion && (MPIDI_CH3I_RDMA_Process.has_srq
                                    || v->transport == IB_TRANSPORT_UD)) {
                    SET_PKT_LEN_HEADER(v, wc);
                    SET_PKT_HEADER_OFFSET(v);
                    p = v->pheader;
#ifdef _ENABLE_UD_
                    MPIDI_PG_Get_vc(MPIDI_Process.my_pg, p->src.rank, &vc);
#else
                    vc = (MPIDI_VC_t *)p->src.vc_addr;
#endif
                    v->vc = vc;
                    v->rail = p->rail;
	            } 
	            
	            /* get the VC and increase its wqe */
	            if (is_send_completion) {
#ifdef _ENABLE_UD_
                if (rdma_enable_hybrid) {
                    if(v->transport == IB_TRANSPORT_RC  || 
                        (v->pheader && IS_CNTL_MSG(v->pheader))) {
                        MRAILI_Process_send(v);
                    }
                    if (v->transport == IB_TRANSPORT_UD) {
                        mv2_ud_update_send_credits(v);
                    }
                    if(v->transport == IB_TRANSPORT_UD &&
                            v->flags & UD_VBUF_SEND_INPROGRESS) {
                        v->flags &= ~(UD_VBUF_SEND_INPROGRESS);
                        if (v->flags & UD_VBUF_FREE_PENIDING) {
                            v->flags &= ~(UD_VBUF_FREE_PENIDING);
                            MRAILI_Release_vbuf(v);
                        }
                    }
                }
                else
#endif
                {
	                MRAILI_Process_send(v);
                }
                    type = T_CHANNEL_NO_ARRIVE;
                    *vbuf_handle = NULL;
	            } else if ((NULL == vc_req || vc_req == vc) && 0 == receiving ){
	                /* In this case, we should return the vbuf 
	                 * any way if it is next expected*/
	                int seqnum = GetSeqNumVbuf(v);
	                *vbuf_handle = v; 
                    SET_PKT_LEN_HEADER(v, wc);
                    SET_PKT_HEADER_OFFSET(v);
                    v->seqnum =  seqnum;
                    p = v->pheader;
                    PRINT_DEBUG(DEBUG_UD_verbose>1,"Received from rank:%d seqnum :%d ack:%d size:%d type:%d trasport :%d \n",vc->pg_rank, v->seqnum, p->acknum, v->content_size, p->type, v->transport);
#ifdef _ENABLE_UD_
                    if (v->transport == IB_TRANSPORT_UD)
                    {
                        mv2_ud_ctx_t *ud_ctx = 
                            MPIDI_CH3I_RDMA_Process.ud_rails[i];
                        --ud_ctx->num_recvs_posted;
                        if(ud_ctx->num_recvs_posted < ud_ctx->credit_preserve) {
                            ud_ctx->num_recvs_posted += mv2_post_ud_recv_buffers(
                                    (rdma_default_max_ud_recv_wqe - ud_ctx->num_recvs_posted), ud_ctx);
                        }
                    }
                    else
#endif 
                    if (MPIDI_CH3I_RDMA_Process.has_srq) {
	                    pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.
	                            srq_post_spin_lock);
	
	                    if(v->padding == NORMAL_VBUF_FLAG) {
	                        /* Can only be from SRQ path */
	                        --MPIDI_CH3I_RDMA_Process.posted_bufs[i];
	                    }
	
	                    if(MPIDI_CH3I_RDMA_Process.posted_bufs[i] <= 
	                            rdma_credit_preserve) {
	                        /* Need to post more to the SRQ */
	                        MPIDI_CH3I_RDMA_Process.posted_bufs[i] +=
	                            viadev_post_srq_buffers(viadev_srq_fill_size - 
	                                MPIDI_CH3I_RDMA_Process.posted_bufs[i], i);
	
	                    }
	
	                    pthread_spin_unlock(&MPIDI_CH3I_RDMA_Process.
	                            srq_post_spin_lock);
	
	                    /* Check if we need to release the SRQ limit thread */
	                    if (MPIDI_CH3I_RDMA_Process.
	                            srq_zero_post_counter[i] >= 1) {
	                        pthread_mutex_lock(
	                                &MPIDI_CH3I_RDMA_Process.
	                                srq_post_mutex_lock[i]);
	                        MPIDI_CH3I_RDMA_Process.srq_zero_post_counter[i] = 0;
	                        pthread_cond_signal(&MPIDI_CH3I_RDMA_Process.
	                                srq_post_cond[i]);
	                        pthread_mutex_unlock(
	                                &MPIDI_CH3I_RDMA_Process.
	                                srq_post_mutex_lock[i]);
	                    }
	
	                }
	                else
	                {
	                    --vc->mrail.srp.credits[v->rail].preposts;
	
	                    needed = rdma_prepost_depth + rdma_prepost_noop_extra
	                             + MIN(rdma_prepost_rendezvous_extra,
	                                   vc->mrail.srp.credits[v->rail].
	                                   rendezvous_packets_expected);
	                }
#ifdef _ENABLE_UD_
                    if (rdma_enable_hybrid){
                        if (IS_CNTL_MSG(p)){
                            type = T_CHANNEL_CONTROL_MSG_ARRIVE;
                        } else {
                            type = T_CHANNEL_HYBRID_MSG_ARRIVE;
                        }
                    }
                    else
#endif
                    {
                        if (seqnum == PKT_NO_SEQ_NUM){
                            type = T_CHANNEL_CONTROL_MSG_ARRIVE;
                        } else if (seqnum == vc->mrail.seqnum_next_torecv) {
                            vc->mrail.seqnum_next_toack = vc->mrail.seqnum_next_torecv;
                            ++vc->mrail.seqnum_next_torecv;
                            type = T_CHANNEL_EXACT_ARRIVE;
                            DEBUG_PRINT("[channel manager] get one with exact seqnum\n");
                        } else {
                            type = T_CHANNEL_OUT_OF_ORDER_ARRIVE;
                            VQUEUE_ENQUEUE(&vc->mrail.cmanager, 
                                    INDEX_GLOBAL(&vc->mrail.cmanager, v->rail),
                                    v);
                            DEBUG_PRINT("get recv %d (%d)\n", seqnum, vc->mrail.seqnum_next_torecv);
                        }
                    }
	                if (!MPIDI_CH3I_RDMA_Process.has_srq && v->transport != IB_TRANSPORT_UD) {
                          
	                    if (PKT_IS_NOOP(v)) {
	                        PREPOST_VBUF_RECV(vc, v->rail);
	                        /* noops don't count for credits */
	                        --vc->mrail.srp.credits[v->rail].local_credit;
	                    } 
	                    else if ((vc->mrail.srp.credits[v->rail].preposts 
                                 < rdma_rq_size) &&
	                             (vc->mrail.srp.credits[v->rail].preposts + 
	                             rdma_prepost_threshold < needed))
	                    {
	                        do {
	                            PREPOST_VBUF_RECV(vc, v->rail);
	                        } while (vc->mrail.srp.credits[v->rail].preposts 
                                     < rdma_rq_size &&
	                                 vc->mrail.srp.credits[v->rail].preposts 
                                     < needed);
	                    }
	
	                    MRAILI_Send_noop_if_needed(vc, v->rail);
	                }
	
	                if (type == T_CHANNEL_CONTROL_MSG_ARRIVE || 
	                        type == T_CHANNEL_EXACT_ARRIVE ||
                            type == T_CHANNEL_HYBRID_MSG_ARRIVE || 
	                        type == T_CHANNEL_OUT_OF_ORDER_ARRIVE) {
	                    goto fn_exit;
	                }
	            } else {
	                /* Commenting out the assert - possible coding error
	                 * MPIU_Assert(0);
	                 */
	                /* Now since this is not the packet we want, we have to 
                     * enqueue it */
	                type = T_CHANNEL_OUT_OF_ORDER_ARRIVE;
	                *vbuf_handle = NULL;
	                v->content_size = wc.byte_len;
	                VQUEUE_ENQUEUE(&vc->mrail.cmanager,
	                        INDEX_GLOBAL(&vc->mrail.cmanager, v->rail),
	                        v);
                    if (v->transport != IB_TRANSPORT_UD) {
                        if (MPIDI_CH3I_RDMA_Process.has_srq) {
                            pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock);

                            if(v->padding == NORMAL_VBUF_FLAG ) {
                                /* Can only be from SRQ path */
                                --MPIDI_CH3I_RDMA_Process.posted_bufs[i];
                            }

                            if(MPIDI_CH3I_RDMA_Process.posted_bufs[i] <= rdma_credit_preserve) {
                                /* Need to post more to the SRQ */
                                MPIDI_CH3I_RDMA_Process.posted_bufs[i] +=
                                    viadev_post_srq_buffers(viadev_srq_fill_size - 
                                            MPIDI_CH3I_RDMA_Process.posted_bufs[i], i);

                            }

                            pthread_spin_unlock(&MPIDI_CH3I_RDMA_Process.
                                    srq_post_spin_lock);
                        } else {
                            --vc->mrail.srp.credits[v->rail].preposts;

                            needed = rdma_prepost_depth + rdma_prepost_noop_extra
                                + MIN(rdma_prepost_rendezvous_extra,
                                        vc->mrail.srp.credits[v->rail].
                                        rendezvous_packets_expected);

                            if (PKT_IS_NOOP(v)) {
                                PREPOST_VBUF_RECV(vc, v->rail);
                                --vc->mrail.srp.credits[v->rail].local_credit;
                            }
                            else if ((vc->mrail.srp.credits[v->rail].preposts 
                                        < rdma_rq_size) &&
                                    (vc->mrail.srp.credits[v->rail].preposts + 
                                     rdma_prepost_threshold < needed)) {
                                do {
                                    PREPOST_VBUF_RECV(vc, v->rail);
                                } while (vc->mrail.srp.credits[v->rail].preposts 
                                        < rdma_rq_size && 
                                        vc->mrail.srp.credits[v->rail].preposts 
                                        < needed);
                            }
                            MRAILI_Send_noop_if_needed(vc, v->rail);
                        }
                    }
	            }
	        } else {
	            *vbuf_handle = NULL;
	            type = T_CHANNEL_NO_ARRIVE;
	            ++nspin;
	
	            /* Blocking mode progress */
	            if(rdma_use_blocking && is_blocking && nspin >= rdma_blocking_spin_count_threshold){
	                /* Okay ... spun long enough, now time to go to sleep! */
	
	#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
	                MPIU_THREAD_CHECK_BEGIN
	                MPID_Thread_mutex_unlock(&MPIR_ThreadInfo.global_mutex);
	                MPIU_THREAD_CHECK_END
	#endif
	                do {    
	                    ret = ibv_get_cq_event(
	                            MPIDI_CH3I_RDMA_Process.comp_channel[i], 
	                            &ev_cq, &ev_ctx);
	                    if (ret && errno != EINTR) {
	                        ibv_va_error_abort(IBV_RETURN_ERR,
	                                "Failed to get cq event: %d\n", ret);
	                    }       
	                } while (ret && errno == EINTR); 
	#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
	                MPIU_THREAD_CHECK_BEGIN
	                MPID_Thread_mutex_lock(&MPIR_ThreadInfo.global_mutex);
	                MPIU_THREAD_CHECK_END
	#endif
	
                    if (num_cqs == 1) {
		                if (ev_cq != MPIDI_CH3I_RDMA_Process.cq_hndl[i]) {
		                    ibv_error_abort(IBV_STATUS_ERR,
                                             "Event in unknown CQ\n");
		                }
		
	                   ibv_ack_cq_events(MPIDI_CH3I_RDMA_Process.cq_hndl[i], 1);
		
		                if (ibv_req_notify_cq(
                                    MPIDI_CH3I_RDMA_Process.cq_hndl[i], 0)) {
		                    ibv_error_abort(IBV_RETURN_ERR,
		                            "Couldn't request for CQ notification\n");
		                }
                    } else {
		                if (ev_cq == MPIDI_CH3I_RDMA_Process.send_cq_hndl[i]) {
	                        ibv_ack_cq_events(
                                    MPIDI_CH3I_RDMA_Process.send_cq_hndl[i], 1);
		
		                    if (ibv_req_notify_cq(
                                  MPIDI_CH3I_RDMA_Process.send_cq_hndl[i], 0)) {
		                        ibv_error_abort(IBV_RETURN_ERR,
		                           "Couldn't request for CQ notification\n");
		                    }
                        } else if (ev_cq == 
                                    MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i]) {
	                        ibv_ack_cq_events(
                                    MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i], 1);
		
		                    if (ibv_req_notify_cq(
                                  MPIDI_CH3I_RDMA_Process.recv_cq_hndl[i], 0)) {
		                        ibv_error_abort(IBV_RETURN_ERR,
		                           "Couldn't request for CQ notification\n");
		                    }
		                } else {
		                   ibv_error_abort(IBV_STATUS_ERR,
                                             "Event in unknown CQ\n");
                        }
                    }
	                nspin = 0;
	            }
	        }
        }
    }