/**
 * Establish the MX connection for a BTL endpoint.
 *
 * The endpoint is first marked MCA_BTL_MX_CONNECTION_PENDING, then
 * mx_connect() is called with the peer's NIC id and endpoint id together
 * with the component-wide filter and timeout.  An attempt that fails with
 * MX_TIMEOUT is retried up to mca_btl_mx_component.mx_connection_retries
 * more times; any other failure is reported right away.
 *
 * @param module_endpoint  Endpoint to connect.  Its status field is always
 *                         updated; mx_peer_addr is written on success only.
 *
 * @return OMPI_SUCCESS when connected (status == MCA_BTL_MX_CONNECTED),
 *         OMPI_ERROR otherwise (status == MCA_BTL_MX_NOT_REACHEABLE).
 */
int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
{
    int num_retry = 0;
    mx_return_t mx_status;
    mx_endpoint_addr_t mx_remote_addr;

    module_endpoint->status = MCA_BTL_MX_CONNECTION_PENDING;

 retry_connect:
    mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
                            module_endpoint->mx_peer->nic_id,
                            module_endpoint->mx_peer->endpoint_id,
                            mca_btl_mx_component.mx_filter,
                            mca_btl_mx_component.mx_timeout,
                            &mx_remote_addr );
    if( MX_SUCCESS != mx_status ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Only a timeout is worth retrying, and only a bounded number
         * of times. */
        if( MX_TIMEOUT == mx_status ) {
            if( num_retry++ < mca_btl_mx_component.mx_connection_retries ) {
                goto retry_connect;
            }
        }

        /* Fall back to printing the raw NIC id when it cannot be resolved
         * to a host name.  The write is bounded by the buffer size. */
        if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer->nic_id,
                                                 peer_name ) ) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)module_endpoint->mx_peer->nic_id );
        }
        opal_output( 0, "mx_connect fail for %s with key %x (error %s)\n\tUnique ID (local %x remote %x)\n",
                     peer_name, mca_btl_mx_component.mx_filter, mx_strerror(mx_status),
                     module_endpoint->endpoint_btl->mx_unique_network_id,
                     module_endpoint->mx_peer->unique_network_id );

        module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE;
        return OMPI_ERROR;
    }

    module_endpoint->mx_peer_addr = mx_remote_addr;
    module_endpoint->status       = MCA_BTL_MX_CONNECTED;

    return OMPI_SUCCESS;
}
/**
 * Create an MTL endpoint for a remote process and connect it through MX.
 *
 * The peer's MX address is obtained with ompi_modex_recv(); a new endpoint
 * object is then allocated and mx_connect() is called with the peer's NIC
 * id and endpoint id.  An attempt that fails with MX_TIMEOUT is retried up
 * to ompi_mtl_mx.mx_retries more times.
 *
 * @param ompi_proc  Remote process to build the endpoint for.
 *
 * @return The connected endpoint, or NULL if the modex lookup fails or
 *         returns data of unexpected size, if the endpoint cannot be
 *         allocated, or if the connection cannot be established.
 */
mca_mtl_mx_endpoint_t* mca_mtl_mx_endpoint_create(ompi_proc_t* ompi_proc)
{
    mca_mtl_mx_endpoint_t* mtl_mx_endpoint = NULL;
    int rc;
    mca_mtl_mx_addr_t *mx_peer;
    size_t size;
    mx_return_t mx_return;
    int num_retry = 0;

    /* get the remote proc's address (only one) */
    rc = ompi_modex_recv(&mca_mtl_mx_component.super.mtl_version,
                         ompi_proc, (void**)&mx_peer, &size);
    if( rc != OMPI_SUCCESS || size != sizeof(mca_mtl_mx_addr_t) ) {
        /* NOTE(review): the buffer handed back by ompi_modex_recv() is not
         * released on any error path of this function - confirm who owns
         * it. */
        return NULL;
    }

    mtl_mx_endpoint = (mca_mtl_mx_endpoint_t*) OBJ_NEW(mca_mtl_mx_endpoint_t);
    if( NULL == mtl_mx_endpoint ) {
        return NULL;
    }
    mtl_mx_endpoint->mx_peer = mx_peer;

 retry_connect:
    mx_return = mx_connect(ompi_mtl_mx.mx_endpoint,
                           mx_peer->nic_id,
                           mx_peer->endpoint_id,
                           ompi_mtl_mx.mx_filter,
                           ompi_mtl_mx.mx_timeout,
                           &mtl_mx_endpoint->mx_peer_addr);
    if( MX_SUCCESS != mx_return ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Only a timeout is retried, and only a bounded number of times. */
        if( MX_TIMEOUT == mx_return ) {
            if( num_retry++ < ompi_mtl_mx.mx_retries ) {
                goto retry_connect;
            }
        }

        /* Fall back to the raw NIC id when it cannot be resolved to a host
         * name.  The write is bounded by the buffer size. */
        if( MX_SUCCESS != mx_nic_id_to_hostname( mx_peer->nic_id, peer_name ) ) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_output,
                    "mx_connect fail for %s with key %x (error %s)\n",
                    peer_name, ompi_mtl_mx.mx_filter, mx_strerror(mx_return) );

        /* The endpoint is never handed to the caller on this path, so drop
         * the reference taken by OBJ_NEW instead of leaking the object. */
        OBJ_RELEASE(mtl_mx_endpoint);
        return NULL;
    }

    return mtl_mx_endpoint;
}
/**
 * Blocking MX send.
 *
 * Packs the convertor's data into a single MX segment, posts it with
 * mx_issend() (synchronous mode) or mx_isend() (every other mode) and then
 * polls mx_test() until the request reports completion.  The request
 * descriptor lives on this function's stack.
 *
 * @param mtl        MTL module; asserted to be &ompi_mtl_mx.super.
 * @param comm       Communicator used for the peer lookup and match bits.
 * @param dest       Rank of the destination within comm.
 * @param tag        Message tag folded into the match bits.
 * @param convertor  Convertor describing the data to send.
 * @param mode       Send mode; MCA_PML_BASE_SEND_SYNCHRONOUS selects
 *                   mx_issend().
 *
 * @return OMPI_SUCCESS on completion, OMPI_ERROR if the send could not be
 *         posted.  A failing mx_test() call or a completion status other
 *         than MX_STATUS_SUCCESS calls abort().
 */
int ompi_mtl_mx_send(struct mca_mtl_base_module_t* mtl,
                     struct ompi_communicator_t* comm,
                     int dest,
                     int tag,
                     struct opal_convertor_t *convertor,
                     mca_pml_base_send_mode_t mode)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t mtl_mx_request;
    size_t length;
    mx_status_t mx_status;
    uint32_t result;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint =
        (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    const char *where;   /* name of the MX call used, for diagnostics */

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request.mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request.free_after);

    mtl_mx_request.mx_segment[0].segment_length = length;
    mtl_mx_request.convertor = convertor;
    mtl_mx_request.type = OMPI_MTL_MX_ISEND;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "issend bits: 0x%016" PRIu64 "\n", match_bits));

    if( mode == MCA_PML_BASE_SEND_SYNCHRONOUS ) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request.mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               &mtl_mx_request,
                               &mtl_mx_request.mx_request );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request.mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              &mtl_mx_request,
                              &mtl_mx_request.mx_request );
        where = "mx_isend";
    }

    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when it cannot be resolved to a host
         * name.  The write is bounded by the buffer size. */
        if( MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name ) ) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);

        /* Free buffer if needed */
        if( mtl_mx_request.free_after ) {
            free(mtl_mx_request.mx_segment[0].segment_ptr);
        }
        return OMPI_ERROR;
    }

    /* Poll until MX reports the request as complete.
     * NOTE(review): the two diagnostics below say "mx_wait" although the
     * call being made is mx_test(); the text is left unchanged to keep the
     * log output stable. */
    do {
        mx_return = mx_test(ompi_mtl_mx.mx_endpoint,
                            &mtl_mx_request.mx_request,
                            &mx_status,
                            &result);
        if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in mx_wait (error %s)\n", mx_strerror(mx_return));
            abort();
        }
        /* mx_status is only inspected once result says the request is done. */
        if( OPAL_UNLIKELY(result && mx_status.code != MX_STATUS_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in ompi_mtl_mx_send, mx_wait returned something other than MX_STATUS_SUCCESS: mx_status.code = %d.\n",
                        mx_status.code);
            abort();
        }
    } while( !result );

    /* Free buffer if needed */
    if( mtl_mx_request.free_after ) {
        free(mtl_mx_request.mx_segment[0].segment_ptr);
    }

    return OMPI_SUCCESS;
}
/**
 * Non-blocking MX send.
 *
 * Packs the convertor's data into a single MX segment stored in the
 * caller-supplied request and posts it with mx_issend() (synchronous mode)
 * or mx_isend() (every other mode).  This function does not wait for the
 * request to complete.
 *
 * @param mtl          MTL module; asserted to be &ompi_mtl_mx.super.
 * @param comm         Communicator used for the peer lookup and match bits.
 * @param dest         Rank of the destination within comm.
 * @param tag          Message tag folded into the match bits.
 * @param convertor    Convertor describing the data to send.
 * @param mode         Send mode; MCA_PML_BASE_SEND_SYNCHRONOUS selects
 *                     mx_issend().
 * @param blocking     Not used by this function.
 * @param mtl_request  Request storage; treated as a mca_mtl_mx_request_t.
 *
 * @return OMPI_SUCCESS if the send was posted, OMPI_ERROR otherwise.  On
 *         error any temporary pack buffer is released and the request's
 *         free_after flag and segment pointer are cleared.
 */
int ompi_mtl_mx_isend(struct mca_mtl_base_module_t* mtl,
                      struct ompi_communicator_t* comm,
                      int dest,
                      int tag,
                      struct opal_convertor_t *convertor,
                      mca_pml_base_send_mode_t mode,
                      bool blocking,
                      mca_mtl_request_t * mtl_request)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t * mtl_mx_request = (mca_mtl_mx_request_t*) mtl_request;
    size_t length;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint =
        (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    const char *where;   /* name of the MX call used, for diagnostics */

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request->mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request->free_after);

    mtl_mx_request->mx_segment[0].segment_length = length;
    mtl_mx_request->convertor = convertor;
    mtl_mx_request->type = OMPI_MTL_MX_ISEND;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "issend bits: 0x%016" PRIu64 "\n", match_bits));

    if( mode == MCA_PML_BASE_SEND_SYNCHRONOUS ) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request->mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               mtl_mx_request,
                               &mtl_mx_request->mx_request );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request->mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              mtl_mx_request,
                              &mtl_mx_request->mx_request );
        where = "mx_isend";
    }

    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when it cannot be resolved to a host
         * name.  The write is bounded by the buffer size. */
        if( MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name ) ) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);

        /* The send was never posted, so release the temporary pack buffer
         * here, as the blocking send path does.  The flag and pointer are
         * cleared so a later cleanup of the request cannot free it again. */
        if( mtl_mx_request->free_after ) {
            free(mtl_mx_request->mx_segment[0].segment_ptr);
            mtl_mx_request->mx_segment[0].segment_ptr = NULL;
            mtl_mx_request->free_after = false;
        }
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}