mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    uint8_t order,
    size_t size,
    uint32_t flags)
{
    mca_btl_tcp_frag_t* frag = NULL;

    /* Pick the smallest fragment pool that can hold the payload; anything
     * larger than the max send size cannot be allocated here. */
    if( size <= btl->btl_eager_limit ) {
        MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag);
    } else if( size <= btl->btl_max_send_size ) {
        MCA_BTL_TCP_FRAG_ALLOC_MAX(frag);
    }
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    /* The payload area starts immediately after the fragment structure. */
    frag->segments[0].seg_len = size;
    frag->segments[0].seg_addr.pval = frag + 1;

    frag->base.des_segments = frag->segments;
    frag->base.des_segment_count = 1;
    frag->base.des_flags = flags;
    frag->base.order = MCA_BTL_NO_ORDER;
    frag->btl = (mca_btl_tcp_module_t*)btl;
    return (mca_btl_base_descriptor_t*)frag;
}
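/*
 * Hedged usage sketch, not part of the original source: a typical upper-layer
 * caller allocates an eager descriptor, copies its payload into the single
 * inline segment, and hands the descriptor to btl_send().  The `example_`
 * name and the tag parameter are illustrative assumptions; `memcpy` assumes
 * <string.h> is already included.
 */
static inline int example_send_inline(struct mca_btl_base_module_t* btl,
                                      struct mca_btl_base_endpoint_t* endpoint,
                                      const void* payload, size_t len,
                                      mca_btl_base_tag_t tag)
{
    mca_btl_base_descriptor_t* des =
        mca_btl_tcp_alloc(btl, endpoint, MCA_BTL_NO_ORDER, len,
                          MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if( NULL == des ) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* mca_btl_tcp_alloc points seg_addr just past the fragment structure
     * (frag + 1), so the payload is copied into the fragment in place. */
    memcpy(des->des_segments[0].seg_addr.pval, payload, len);
    return btl->btl_send(btl, endpoint, des, tag);
}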
/**
 * Pack data and return a descriptor that can be
 * used for send/put.
 *
 * @param btl (IN)       BTL module
 * @param endpoint (IN)  BTL peer addressing
 * @param convertor (IN) Data type convertor used to pack the payload
 * @param order (IN)     Ordering channel (ignored by TCP)
 * @param reserve (IN)   Bytes to reserve at the head of the first segment
 * @param size (IN/OUT)  Requested payload size; on return, the packed size
 * @param flags (IN)     Descriptor flags
 */
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    mca_btl_tcp_frag_t* frag;
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = *size;
    int rc;

    if( OPAL_UNLIKELY(max_data > UINT32_MAX) ) {
        /* limit the size to what we support */
        max_data = (size_t)UINT32_MAX;
    }

    /*
     * If we aren't pinning the data and the requested size is less
     * than the eager limit, pack into a fragment from the eager pool.
     */
    if( max_data + reserve <= btl->btl_eager_limit ) {
        MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag);
    } else {
        /*
         * Otherwise pack as much data as we can into a fragment
         * that is the max send size.
         */
        MCA_BTL_TCP_FRAG_ALLOC_MAX(frag);
    }
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    /* The first segment holds the reserved (header) bytes. */
    frag->segments[0].seg_addr.pval = (frag + 1);
    frag->segments[0].seg_len = reserve;
    frag->base.des_segment_count = 1;

    if( opal_convertor_need_buffers(convertor) ) {
        /* Non-contiguous data: pack it into the fragment, directly
         * after the reserved bytes. */
        if( max_data + reserve > frag->size ) {
            max_data = frag->size - reserve;
        }
        iov.iov_len = max_data;
        iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segments[0].seg_addr.pval)) + reserve);

        rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
        if( OPAL_UNLIKELY(rc < 0) ) {
            mca_btl_tcp_free(btl, &frag->base);
            return NULL;
        }
        frag->segments[0].seg_len += max_data;
    } else {
        /* Contiguous data: let the convertor hand back a pointer into
         * the user buffer and send it in place as a second segment. */
        iov.iov_len = max_data;
        iov.iov_base = NULL;

        rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
        if( OPAL_UNLIKELY(rc < 0) ) {
            mca_btl_tcp_free(btl, &frag->base);
            return NULL;
        }

        frag->segments[1].seg_addr.pval = iov.iov_base;
        frag->segments[1].seg_len = max_data;
        frag->base.des_segment_count = 2;
    }

    frag->base.des_segments = frag->segments;
    frag->base.des_flags = flags;
    frag->base.order = MCA_BTL_NO_ORDER;
    *size = max_data;
    return &frag->base;
}
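/*
 * Hedged sketch, not in the original file: building a send descriptor with
 * room for a `header_len`-byte protocol header.  For contiguous data the
 * convertor hands back a pointer into the user buffer, so the returned
 * descriptor carries two segments: [0] the reserved header bytes inside the
 * fragment and [1] the user data, sent in place with no copy.  The
 * `example_` names are illustrative assumptions.
 */
static mca_btl_base_descriptor_t*
example_prepare(struct mca_btl_base_module_t* btl,
                struct mca_btl_base_endpoint_t* endpoint,
                struct opal_convertor_t* convertor,
                size_t header_len, size_t payload_len)
{
    size_t size = payload_len;  /* IN: requested size, OUT: bytes packed */
    mca_btl_base_descriptor_t* des =
        mca_btl_tcp_prepare_src(btl, endpoint, convertor, MCA_BTL_NO_ORDER,
                                header_len, &size,
                                MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if( NULL != des ) {
        /* `size` may come back smaller than payload_len if the request was
         * truncated to the fragment size; the caller must send the rest in
         * a subsequent fragment. */
        assert( des->des_segment_count >= 1 );
    }
    return des;
}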
static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user)
{
    mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t*)user;

    /* Make sure we don't have a race between a thread that removes the
     * recv event and an event that is already scheduled. */
    if( sd != btl_endpoint->endpoint_sd )
        return;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    switch(btl_endpoint->endpoint_state) {
    case MCA_BTL_TCP_CONNECT_ACK:
        {
            int rc = mca_btl_tcp2_endpoint_recv_connect_ack(btl_endpoint);
            if( OMPI_SUCCESS == rc ) {
                /* we are now connected. Start sending the data */
                OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
                mca_btl_tcp2_endpoint_connected(btl_endpoint);
                OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
                mca_btl_tcp2_endpoint_dump(btl_endpoint, "connected");
#endif
            }
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return;
        }
    case MCA_BTL_TCP_CONNECTED:
        {
            mca_btl_tcp2_frag_t* frag;

            frag = btl_endpoint->endpoint_recv_frag;
            if( NULL == frag ) {
                /* No partially received fragment outstanding: allocate one
                 * from whichever pool has the larger fragment size. */
                if( mca_btl_tcp2_module.super.btl_max_send_size >
                    mca_btl_tcp2_module.super.btl_eager_limit ) {
                    MCA_BTL_TCP_FRAG_ALLOC_MAX(frag);
                } else {
                    MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag);
                }
                if( NULL == frag ) {
                    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
                    return;
                }
                MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
            }

#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
 data_still_pending_on_endpoint:
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

            /* check for completion of non-blocking recv on the current fragment */
            if( mca_btl_tcp2_frag_recv(frag, btl_endpoint->endpoint_sd) == false ) {
                btl_endpoint->endpoint_recv_frag = frag;
            } else {
                btl_endpoint->endpoint_recv_frag = NULL;
                if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) {
                    /* Deliver the completed fragment to the active-message
                     * callback registered for its tag. */
                    mca_btl_active_message_callback_t* reg;
                    reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag;
                    reg->cbfunc(&frag->btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata);
                }

#if MCA_BTL_TCP_ENDPOINT_CACHE
                if( 0 != btl_endpoint->endpoint_cache_length ) {
                    /* If the cache still contains some data, we can reuse the
                     * same fragment until we flush it completely. */
                    MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
                    goto data_still_pending_on_endpoint;
                }
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
                MCA_BTL_TCP_FRAG_RETURN(frag);
            }
#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            break;
        }
    case MCA_BTL_TCP_CLOSED:
        /* This is a thread-safety issue. As multiple threads are allowed
         * to generate events (in libevent), we end up with several
         * threads executing the receive callback when we reach the end
         * of MPI_Finalize. The first one will close the connection and
         * all others will complain. */
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        break;
    default:
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
        mca_btl_tcp2_endpoint_close(btl_endpoint);
        break;
    }
}
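/*
 * Hedged sketch, not part of the original source: how the active-message
 * dispatch in the MCA_BTL_TCP_CONNECTED branch above is wired up.  An upper
 * layer registers a callback for its tag in the global trigger table; the
 * handler then indexes that table with frag->hdr.base.tag and invokes
 * reg->cbfunc.  The callback signature below mirrors the call site
 * (btl, tag, descriptor, cbdata); the tag value and the `example_` names
 * are illustrative assumptions.
 */
#define EXAMPLE_AM_TAG 42  /* hypothetical tag; real values are mca_btl_base_tag_t tags */

static void example_am_callback(struct mca_btl_base_module_t* btl,
                                mca_btl_base_tag_t tag,
                                mca_btl_base_descriptor_t* descriptor,
                                void* cbdata)
{
    (void)btl; (void)tag; (void)descriptor; (void)cbdata;
    /* Consume the received data described by descriptor->des_segments[0]. */
}

static void example_register_am(void)
{
    mca_btl_active_message_callback_t* reg =
        mca_btl_base_active_message_trigger + EXAMPLE_AM_TAG;
    reg->cbfunc = example_am_callback;
    reg->cbdata = NULL;
}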