/*
 * Blocking receive layered on top of a non-blocking socket.
 *
 * Keeps calling recv() on the endpoint's socket until exactly `size` bytes
 * have been stored in `data`.  Used for the small amount of connection
 * information that identifies the peer endpoint.
 *
 * Returns the number of bytes received (equal to `size`) on success.
 * Returns -1, after closing the endpoint, when the remote side closed the
 * connection or recv() failed with an error other than EINTR, EAGAIN or
 * EWOULDBLOCK.
 */
static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
{
    unsigned char* buf = (unsigned char*)data;
    size_t received = 0;

    while (received < size) {
        int rc = recv(btl_endpoint->endpoint_sd, (char *)buf + received, size - received, 0);

        /* got some data: account for it and keep reading */
        if (rc > 0) {
            received += rc;
            continue;
        }

        /* remote closed connection */
        if (0 == rc) {
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return -1;
        }

        /* socket is non-blocking, so "try again" conditions just spin */
        if (opal_socket_errno == EINTR ||
            opal_socket_errno == EAGAIN ||
            opal_socket_errno == EWOULDBLOCK) {
            continue;
        }

        BTL_ERROR(("recv(%d) failed: %s (%d)",
                   btl_endpoint->endpoint_sd, strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return -1;
    }
    return received;
}
/*
 * Attempt to write the pending iovec entries of `frag` to socket `sd`
 * with a single non-blocking writev() (retried only when interrupted).
 *
 * After the write, the fragment's iovec cursor (iov_ptr / iov_idx /
 * iov_cnt) is advanced past every fully written entry, and a partially
 * written entry is trimmed so that it describes only the unsent bytes.
 *
 * Returns true when no iovec entries remain to be sent.  Returns false
 * when the socket would block, when data is still pending, or when
 * writev() failed; on failure the endpoint is marked
 * MCA_BTL_TCP_FAILED and closed.
 */
bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
{
    int written;
    size_t remaining;

    /* non-blocking write, but continue if interrupted */
    for (;;) {
        written = writev(sd, frag->iov_ptr, frag->iov_cnt);
        if (written >= 0) {
            break;
        }

        switch (opal_socket_errno) {
        case EINTR:
            continue;
        case EWOULDBLOCK:
            return false;
        case EFAULT:
            BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
                       frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
                       strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
            frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
            mca_btl_tcp_endpoint_close(frag->endpoint);
            return false;
        default:
            BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
                       strerror(opal_socket_errno),
                       opal_socket_errno));
            frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
            mca_btl_tcp_endpoint_close(frag->endpoint);
            return false;
        }
    }

    /* consume the written byte count against the iovec entries, in order */
    for (remaining = frag->iov_cnt; remaining > 0; remaining--) {
        if (written < (int)frag->iov_ptr->iov_len) {
            /* this entry was only partially sent: keep its unsent tail */
            frag->iov_ptr->iov_base = (opal_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + written);
            frag->iov_ptr->iov_len -= written;
            OPAL_OUTPUT_VERBOSE((100, opal_btl_base_framework.framework_output,
                                 "%s:%d write %d bytes on socket %d\n",
                                 __FILE__, __LINE__, written, sd));
            break;
        }

        /* this entry went out completely: step over it */
        written -= frag->iov_ptr->iov_len;
        frag->iov_ptr++;
        frag->iov_idx++;
        frag->iov_cnt--;
    }

    return (0 == frag->iov_cnt);
}
/*
 * Attempt to write the pending iovec entries of `frag` to socket `sd`
 * with a single non-blocking writev() (retried only when interrupted).
 *
 * After the write, the fragment's iovec cursor (iov_ptr / iov_idx /
 * iov_cnt) is advanced past every fully written entry, and a partially
 * written entry is trimmed so that it describes only the unsent bytes.
 *
 * Returns true when no iovec entries remain to be sent.  Returns false
 * when the socket would block, when data is still pending, or when
 * writev() failed (in which case the endpoint is closed).
 */
bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
{
    int cnt=-1;
    size_t i, num_vecs;

    /* non-blocking write, but continue if interrupted */
    while(cnt < 0) {
        cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
        if(cnt < 0) {
            switch(opal_socket_errno) {
            case EINTR:
                continue;
            case EWOULDBLOCK:
                return false;
            case EFAULT:
                /* iov_len is a size_t: print it (and the iovec count) as
                 * unsigned long so the conversion specifiers match the
                 * argument types instead of passing them to %d. */
                BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
                           frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
                           strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            default:
                BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
                           strerror(opal_socket_errno),
                           opal_socket_errno));
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            }
        }
    }

    /* if the write didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for(i=0; i<num_vecs; i++) {
        if(cnt >= (int)frag->iov_ptr->iov_len) {
            /* entry fully written: step over it */
            cnt -= frag->iov_ptr->iov_len;
            frag->iov_ptr++;
            frag->iov_idx++;
            frag->iov_cnt--;
        } else {
            /* entry partially written: keep only its unsent tail */
            frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            break;
        }
    }
    return (frag->iov_cnt == 0);
}
/*
 * Destructor for an endpoint object.
 *
 * Removes the endpoint from its proc, closes it, and then destructs the
 * embedded endpoint_frags, endpoint_send_lock and endpoint_recv_lock
 * members.
 */
static void mca_btl_tcp_endpoint_destruct(mca_btl_tcp_endpoint_t* endpoint)
{
    /* detach from the owning proc, then close the endpoint */
    mca_btl_tcp_proc_remove(endpoint->endpoint_proc, endpoint);
    mca_btl_tcp_endpoint_close(endpoint);

    /* release the embedded objects */
    OBJ_DESTRUCT(&endpoint->endpoint_frags);
    OBJ_DESTRUCT(&endpoint->endpoint_send_lock);
    OBJ_DESTRUCT(&endpoint->endpoint_recv_lock);
}
/*
 * Decide whether an incoming connection on socket `sd` should be adopted
 * by this endpoint.
 *
 * The connection is taken when the endpoint has an address and either
 * has no socket yet (endpoint_sd < 0), or is not in the
 * MCA_BTL_TCP_CONNECTED state and the peer's process name compares lower
 * than the local one.  In that case the endpoint is closed, `sd` is
 * installed as its socket, the connect ack is sent, the recv event is
 * registered and the endpoint is moved to the connected state.
 *
 * Both the recv and the send lock of the endpoint are held for the whole
 * decision.  Returns true if the connection was accepted, false
 * otherwise (including when sending the connect ack fails).
 */
bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, struct sockaddr* addr, int sd)
{
    mca_btl_tcp_proc_t* local_proc = mca_btl_tcp_proc_local();
    mca_btl_tcp_proc_t* peer_proc = btl_endpoint->endpoint_proc;
    bool accepted = false;
    int order;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);

    /* nothing to match the connection against */
    if (NULL == btl_endpoint->endpoint_addr) {
        goto unlock;
    }

    order = orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                          &peer_proc->proc_ompi->proc_name,
                                          &local_proc->proc_ompi->proc_name);

    /* keep the current socket when one exists and either it is already
     * connected or the name ordering does not favour the peer */
    if ((btl_endpoint->endpoint_sd >= 0) &&
        (MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state || order >= 0)) {
        goto unlock;
    }

    /* drop whatever was in progress and adopt the incoming socket */
    mca_btl_tcp_endpoint_close(btl_endpoint);
    btl_endpoint->endpoint_sd = sd;
    if (OMPI_SUCCESS != mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) {
        mca_btl_tcp_endpoint_close(btl_endpoint);
        goto unlock;
    }

    mca_btl_tcp_endpoint_event_init(btl_endpoint);
    opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    mca_btl_tcp_endpoint_connected(btl_endpoint);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
    mca_btl_tcp_endpoint_dump(btl_endpoint, "accepted");
#endif
    accepted = true;

unlock:
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
    return accepted;
}
/*
 * Finish a non-blocking connect() on the endpoint's socket.
 *
 * Queries SO_ERROR to learn the outcome of the connect:
 *   - if the query itself or the connect failed, the error is logged and
 *     the endpoint is closed;
 *   - if the connect is still in progress, the send event is re-armed and
 *     the function returns to wait for the next notification;
 *   - otherwise this process's identifier is sent to the peer, the
 *     endpoint moves to MCA_BTL_TCP_CONNECT_ACK and the recv event is
 *     registered (the endpoint is closed if that send fails).
 */
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int sock_err = 0;
    opal_socklen_t optlen = sizeof(sock_err);
    struct sockaddr_storage peer_addr;

    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &peer_addr);

    /* unregister from receiving event notifications */
    opal_event_del(&btl_endpoint->endpoint_send_event);

    /* check connect completion status */
    if (getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR,
                   (char *)&sock_err, &optlen) < 0) {
        BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
                   opal_net_get_hostname((struct sockaddr*) &peer_addr),
                   strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
    }

    /* not finished yet: wait for the next writable notification */
    if (EINPROGRESS == sock_err || EWOULDBLOCK == sock_err) {
        opal_event_add(&btl_endpoint->endpoint_send_event, 0);
        return;
    }

    /* the connect attempt itself failed */
    if (0 != sock_err) {
        BTL_ERROR(("connect() to %s failed: %s (%d)",
                   opal_net_get_hostname((struct sockaddr*) &peer_addr),
                   strerror(sock_err), sock_err));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
    }

    /* connected: identify ourselves and wait for the peer's answer */
    if (OMPI_SUCCESS != mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) {
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
    }
    btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
    opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
}
/*
 * Blocking send layered on top of a non-blocking socket.
 *
 * Keeps calling send() on the endpoint's socket until all `size` bytes of
 * `data` have been handed to the socket.  Used for the small amount of
 * connection information that identifies this process to the peer.
 *
 * Returns the number of bytes sent (equal to `size`) on success.
 * Returns -1, after closing the endpoint, when send() failed with an
 * error other than EINTR, EAGAIN or EWOULDBLOCK.
 */
static int mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
{
    unsigned char* buf = (unsigned char*)data;
    size_t sent = 0;

    while (sent < size) {
        int rc = send(btl_endpoint->endpoint_sd, (const char *)buf + sent, size - sent, 0);

        /* progress (possibly zero bytes): account for it and go on */
        if (rc >= 0) {
            sent += rc;
            continue;
        }

        /* socket is non-blocking, so "try again" conditions just spin */
        if (opal_socket_errno == EINTR ||
            opal_socket_errno == EAGAIN ||
            opal_socket_errno == EWOULDBLOCK) {
            continue;
        }

        BTL_ERROR(("send() failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return -1;
    }
    return sent;
}
/*
 * Read the peer's process identifier from a newly connected socket and
 * check that it is the one this endpoint expects.
 *
 * Returns OMPI_SUCCESS when the received name matches the name of the
 * endpoint's proc.  Returns OMPI_ERR_UNREACH when the identifier could
 * not be received in full, or when it does not match (in which case the
 * mismatch is logged and the endpoint is closed).
 */
static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
    mca_btl_tcp_proc_t* expected_proc = btl_endpoint->endpoint_proc;
    orte_process_name_t peer_name;
    int nread;

    nread = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &peer_name,
                                               sizeof(orte_process_name_t));
    if (nread != sizeof(orte_process_name_t)) {
        return OMPI_ERR_UNREACH;
    }

    /* the name travels in network byte order */
    ORTE_PROCESS_NAME_NTOH(peer_name);

    /* compare this to the expected values */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                    &expected_proc->proc_ompi->proc_name,
                                                    &peer_name)) {
        return OMPI_SUCCESS;
    }

    BTL_ERROR(("received unexpected process identifier %s",
               ORTE_NAME_PRINT(&peer_name)));
    mca_btl_tcp_endpoint_close(btl_endpoint);
    return OMPI_ERR_UNREACH;
}
/*
 * Make progress on receiving fragment `frag` from socket `sd`.
 *
 * The fragment's iovec cursor (iov_ptr / iov_idx / iov_cnt) describes what
 * still has to be filled.  Data is taken from the endpoint cache first
 * (when MCA_BTL_TCP_ENDPOINT_CACHE is enabled and the cache is non-empty),
 * otherwise from a non-blocking readv().  Once the current set of iovec
 * entries is full, the header type decides whether more entries must be
 * queued (payload for SEND, segment descriptors and then segment data for
 * PUT), in which case the function jumps back to `repeat`.
 *
 * Returns true when every queued iovec entry has been filled and the
 * header requires nothing further.  Returns false when data is still
 * missing, when the socket would block, when the peer closed the
 * connection (readv() returned 0) or when readv() failed; in the last two
 * cases the endpoint is marked MCA_BTL_TCP_FAILED and closed.
 */
bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
{
    mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
    int i, num_vecs, dont_copy_data = 0;
    ssize_t cnt;

 repeat:
    num_vecs = frag->iov_cnt;
#if MCA_BTL_TCP_ENDPOINT_CACHE
    if( 0 != btl_endpoint->endpoint_cache_length ) {
        size_t length;
        /* It looks strange at first, but cnt has to be set to the full amount of
         * data available. After jumping to advance_iov_position, cnt is used to
         * detect whether some data is still pending.
         */
        cnt = length = btl_endpoint->endpoint_cache_length;
        for( i = 0; i < (int)frag->iov_cnt; i++ ) {
            /* never hand out more than this iovec entry can hold */
            if( length > frag->iov_ptr[i].iov_len )
                length = frag->iov_ptr[i].iov_len;
            /* copy out of the cache unless copying was disabled (dont_copy_data)
             * and the cache holds the whole entry; in that case point the
             * segment and the iovec entry directly at the cached bytes */
            if( (0 == dont_copy_data) || (length < frag->iov_ptr[i].iov_len) ) {
                memcpy( frag->iov_ptr[i].iov_base, btl_endpoint->endpoint_cache_pos, length );
            } else {
                frag->segments[0].seg_addr.pval = btl_endpoint->endpoint_cache_pos;
                frag->iov_ptr[i].iov_base = btl_endpoint->endpoint_cache_pos;
            }
            btl_endpoint->endpoint_cache_pos += length;
            btl_endpoint->endpoint_cache_length -= length;
            length = btl_endpoint->endpoint_cache_length;
            if( 0 == length ) {
                /* cache drained: rewind the read position to its start */
                btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
                break;
            }
        }
        goto advance_iov_position;
    }
    /* What happens if all iovecs are used by the fragment? It still works, as
     * one iovec is reserved for the caching in the fragment structure (the +1).
     */
    /* append the free part of the endpoint cache as one extra iovec entry so
     * that readv() can deposit surplus bytes there */
    frag->iov_ptr[num_vecs].iov_base = btl_endpoint->endpoint_cache_pos;
    frag->iov_ptr[num_vecs].iov_len = mca_btl_tcp_component.tcp_endpoint_cache - btl_endpoint->endpoint_cache_length;
    num_vecs++;
#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* non-blocking read, but continue if interrupted */
    cnt = -1;
    while( cnt < 0 ) {
        cnt = readv(sd, frag->iov_ptr, num_vecs);
        if( 0 < cnt ) goto advance_iov_position;
        /* a zero-byte read is treated as a failed/closed connection */
        if( cnt == 0 ) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
        switch(opal_socket_errno) {
        case EINTR:
            continue;
        case EWOULDBLOCK:
            /* nothing to read right now; the caller retries later */
            return false;
        case EFAULT:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n", frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len, strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        default:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
    }

 advance_iov_position:
    /* if the read didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for( i = 0; i < num_vecs; i++ ) {
        if( cnt < (int)frag->iov_ptr->iov_len ) {
            /* entry only partially filled: shrink it to the missing tail */
            frag->iov_ptr->iov_base = (opal_iov_base_ptr_t) (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            cnt = 0;
            break;
        }
        /* entry completely filled: step over it */
        cnt -= frag->iov_ptr->iov_len;
        frag->iov_idx++;
        frag->iov_ptr++;
        frag->iov_cnt--;
    }
#if MCA_BTL_TCP_ENDPOINT_CACHE
    /* whatever was not consumed by the fragment's iovecs stays in the cache */
    btl_endpoint->endpoint_cache_length = cnt;
#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* read header */
    if(frag->iov_cnt == 0) {
        /* convert the header once, right after its iovec entry (index 0)
         * has been completed, when endpoint_nbo is set */
        if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
        switch(frag->hdr.type) {
        case MCA_BTL_TCP_HDR_TYPE_SEND:
            if(frag->iov_idx == 1 && frag->hdr.size) {
                /* header done and a payload follows: queue one iovec entry
                 * that targets the memory right behind the fragment */
                frag->segments[0].seg_addr.pval = frag+1;
                frag->segments[0].seg_len = frag->hdr.size;
                frag->iov[1].iov_base = (IOVBASE_TYPE*)(frag->segments[0].seg_addr.pval);
                frag->iov[1].iov_len = frag->hdr.size;
                frag->iov_cnt++;
#ifndef __sparc
#if !MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD
                /* The following cannot be done for sparc code
                 * because it causes alignment errors when accessing
                 * structures later on in the btl and pml code.
                 */
                dont_copy_data = 1;
#endif
#endif
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_PUT:
            if(frag->iov_idx == 1) {
                /* header done: next receive the hdr.count segment descriptors */
                frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->segments;
                frag->iov[1].iov_len = frag->hdr.count * sizeof(mca_btl_base_segment_t);
                frag->iov_cnt++;
                goto repeat;
            } else if (frag->iov_idx == 2) {
                /* descriptors done: queue one iovec entry per described segment */
                for( i = 0; i < frag->hdr.count; i++ ) {
                    frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
                    frag->iov[i+2].iov_len = frag->segments[i].seg_len;
                }
                frag->iov_cnt += frag->hdr.count;
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_GET:
        default:
            break;
        }
        return true;
    }
    return false;
}
static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) { mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t *)user; /* Make sure we don't have a race between a thread that remove the * recv event, and one event already scheduled. */ if( sd != btl_endpoint->endpoint_sd ) return; OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); switch(btl_endpoint->endpoint_state) { case MCA_BTL_TCP_CONNECT_ACK: { int rc = OMPI_ERROR; rc = mca_btl_tcp2_endpoint_recv_connect_ack(btl_endpoint); if( OMPI_SUCCESS == rc ) { /* we are now connected. Start sending the data */ OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); mca_btl_tcp2_endpoint_connected(btl_endpoint); OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); #if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP mca_btl_tcp2_endpoint_dump(btl_endpoint, "connected"); #endif } OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); return; } case MCA_BTL_TCP_CONNECTED: { mca_btl_tcp2_frag_t* frag; frag = btl_endpoint->endpoint_recv_frag; data_still_pending_on_endpoint: if(NULL == frag) { if(mca_btl_tcp_module.super.btl_max_send_size > mca_btl_tcp_module.super.btl_eager_limit) { MCA_BTL_TCP_FRAG_ALLOC_MAX(frag); } else { MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag); } if(NULL == frag) { OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); return; } MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint); } /* check for completion of non-blocking recv on the current fragment */ if( mca_btl_tcp_frag_recv(frag, btl_endpoint->endpoint_sd) == false ) { btl_endpoint->endpoint_recv_frag = frag; } else { btl_endpoint->endpoint_recv_frag = NULL; TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag); #if MCA_BTL_TCP_ENDPOINT_CACHE if( 0 != btl_endpoint->endpoint_cache_length ) { #if MCA_BTL_TCP_USES_PROGRESS_THREAD /* Get a new fragment and try again */ frag = NULL; #else /* If the cache still contain some data we can reuse the same fragment * until we flush it completly. 
*/ MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint); #endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ goto data_still_pending_on_endpoint; } #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ #if !MCA_BTL_TCP_USES_PROGRESS_THREAD MCA_BTL_TCP_FRAG_RETURN(frag); #endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ } #if MCA_BTL_TCP_ENDPOINT_CACHE assert( 0 == btl_endpoint->endpoint_cache_length ); #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); break; } case MCA_BTL_TCP_CLOSED: /* This is a thread-safety issue. As multiple threads are allowed * to generate events (in the lib event) we endup with several * threads executing the receive callback, when we reach the end * of the MPI_Finalize. The first one will close the connections, * and all others will complain. */ break; default: OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; mca_btl_tcp_endpoint_close(btl_endpoint); OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); break; } }
/*
 * Start a connection to the endpoint. This will likely not complete,
 * as the socket is set to non-blocking, so register for event
 * notification of connect completion. On connection we send
 * our globally unique process identifier to the endpoint and wait for
 * the endpoint's response.
 *
 * Returns OMPI_SUCCESS when the connect is in progress (state
 * MCA_BTL_TCP_CONNECTING, send event registered) or when it completed
 * immediately and the connect ack was sent (state
 * MCA_BTL_TCP_CONNECT_ACK, recv event registered).  Returns
 * OMPI_ERR_UNREACH, after incrementing endpoint_retries, when the socket
 * could not be created or connect() failed outright.  If sending the
 * connect ack fails, the endpoint is closed and that error code is
 * returned.
 */
static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int rc,flags;
    struct sockaddr_storage endpoint_addr;
    /* By default consider a IPv4 connection */
    uint16_t af_family = AF_INET;
    opal_socklen_t addrlen = sizeof(struct sockaddr_in);

#if OPAL_WANT_IPV6
    if (AF_INET6 == btl_endpoint->endpoint_addr->addr_family) {
        af_family = AF_INET6;
        addrlen = sizeof (struct sockaddr_in6);
    }
#endif

    btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
    if (btl_endpoint->endpoint_sd < 0) {
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* setup socket buffer sizes */
    mca_btl_tcp_set_socket_options(btl_endpoint->endpoint_sd);

    /* setup event callbacks */
    mca_btl_tcp_endpoint_event_init(btl_endpoint);

    /* setup the socket as non-blocking (failures are logged, not fatal) */
    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
    }

    /* start the connect - will likely fail with EINPROGRESS */
    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    opal_output_verbose(20, mca_btl_base_output,
                        "btl: tcp: attempting to connect() to %s address %s on port %d",
                        ORTE_NAME_PRINT(&btl_endpoint->endpoint_proc->proc_ompi->proc_name),
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(btl_endpoint->endpoint_addr->addr_port));

    if(connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen) < 0) {
        /* capture the error right away so that later calls (e.g. the
         * hostname lookup below) cannot overwrite it before it is reported */
        int connect_errno = opal_socket_errno;

        /* non-blocking so wait for completion */
        if(connect_errno == EINPROGRESS || connect_errno == EWOULDBLOCK) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
            opal_event_add(&btl_endpoint->endpoint_send_event, 0);
            return OMPI_SUCCESS;
        }
        {
            char *address;
            address = opal_net_get_hostname((struct sockaddr*) &endpoint_addr);
            /* addr_port is kept in network byte order (see the ntohs() in
             * the verbose message above), so convert it before printing */
            BTL_PEER_ERROR( btl_endpoint->endpoint_proc->proc_ompi,
                          ( "Unable to connect to the peer %s on port %d: %s\n",
                            address,
                            ntohs(btl_endpoint->endpoint_addr->addr_port), strerror(connect_errno) ) );
        }
        mca_btl_tcp_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* send our globally unique process identifier to the endpoint */
    if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    } else {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
    return rc;
}