/**
 * Establish the MX connection for a BTL endpoint.
 *
 * The endpoint is first marked MCA_BTL_MX_CONNECTION_PENDING, then
 * mx_connect() is called with the peer's NIC id and endpoint id.  A call
 * that ends with MX_TIMEOUT is repeated until
 * mca_btl_mx_component.mx_connection_retries additional attempts have been
 * made; any other failure is reported immediately.
 *
 * @param module_endpoint  endpoint to connect; its status and mx_peer_addr
 *                         fields are updated by this function.
 *
 * @return OMPI_SUCCESS when the connection is established (status becomes
 *         MCA_BTL_MX_CONNECTED), OMPI_ERROR otherwise (status becomes
 *         MCA_BTL_MX_NOT_REACHEABLE and a diagnostic is emitted).
 */
int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
{
    int num_retry = 0;
    mx_return_t mx_status;
    mx_endpoint_addr_t mx_remote_addr;

    module_endpoint->status = MCA_BTL_MX_CONNECTION_PENDING;

    /* Only a timeout is worth retrying; the short-circuit keeps num_retry
     * untouched for every other return code. */
    do {
        mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
                                module_endpoint->mx_peer->nic_id,
                                module_endpoint->mx_peer->endpoint_id,
                                mca_btl_mx_component.mx_filter,
                                mca_btl_mx_component.mx_timeout,
                                &mx_remote_addr );
    } while( (MX_TIMEOUT == mx_status) &&
             (num_retry++ < mca_btl_mx_component.mx_connection_retries) );

    if( MX_SUCCESS != mx_status ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when the hostname cannot be resolved.
         * snprintf bounds the write to the buffer and always terminates it. */
        if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer->nic_id, peer_name ) ) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)module_endpoint->mx_peer->nic_id );
        }

        opal_output( 0, "mx_connect fail for %s with key %x (error %s)\n\tUnique ID (local %x remote %x)\n",
                     peer_name, mca_btl_mx_component.mx_filter, mx_strerror(mx_status),
                     module_endpoint->endpoint_btl->mx_unique_network_id,
                     module_endpoint->mx_peer->unique_network_id );

        module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE;
        return OMPI_ERROR;
    }

    module_endpoint->mx_peer_addr = mx_remote_addr;
    module_endpoint->status       = MCA_BTL_MX_CONNECTED;

    return OMPI_SUCCESS;
}
/**
 * Create an MTL endpoint for a remote process and connect it through MX.
 *
 * The peer's MX address is fetched with ompi_modex_recv(); a new endpoint
 * object is then allocated and mx_connect() is called towards the peer.
 * A call that ends with MX_TIMEOUT is repeated until ompi_mtl_mx.mx_retries
 * additional attempts have been made.
 *
 * @param ompi_proc  remote process the endpoint is created for.
 *
 * @return the connected endpoint, or NULL when the peer address could not be
 *         retrieved (or has an unexpected size), when the endpoint object
 *         could not be allocated, or when mx_connect() failed.
 */
mca_mtl_mx_endpoint_t* mca_mtl_mx_endpoint_create(ompi_proc_t* ompi_proc) {
    mca_mtl_mx_endpoint_t* mtl_mx_endpoint = NULL;
    mca_mtl_mx_addr_t *mx_peer = NULL;
    size_t size = 0;
    mx_return_t mx_return;
    int num_retry = 0;
    int rc;

    /* get the remote proc's address (only one) */
    rc = ompi_modex_recv(&mca_mtl_mx_component.super.mtl_version,
                         ompi_proc, (void**)&mx_peer, &size);
    if( rc != OMPI_SUCCESS || size != sizeof(mca_mtl_mx_addr_t)) {
        return NULL;
    }

    mtl_mx_endpoint = (mca_mtl_mx_endpoint_t*) OBJ_NEW(mca_mtl_mx_endpoint_t);
    if( NULL == mtl_mx_endpoint ) {
        /* Nothing to connect if the object could not be allocated. */
        return NULL;
    }
    mtl_mx_endpoint->mx_peer = mx_peer;

    /* Only a timeout is worth retrying; the short-circuit keeps num_retry
     * untouched for every other return code. */
    do {
        mx_return = mx_connect(ompi_mtl_mx.mx_endpoint,
                               mx_peer->nic_id,
                               mx_peer->endpoint_id,
                               ompi_mtl_mx.mx_filter,
                               ompi_mtl_mx.mx_timeout,
                               &mtl_mx_endpoint->mx_peer_addr);
    } while( (MX_TIMEOUT == mx_return) &&
             (num_retry++ < ompi_mtl_mx.mx_retries) );

    if(MX_SUCCESS != mx_return) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when the hostname cannot be resolved.
         * snprintf bounds the write to the buffer and always terminates it. */
        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_peer->nic_id, peer_name)) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_output,
                    "mx_connect fail for %s with key %x (error %s)\n",
                    peer_name, ompi_mtl_mx.mx_filter, mx_strerror(mx_return) );

        /* The caller only ever sees NULL on this path, so the endpoint has
         * to be released here or it is lost.
         * NOTE(review): ownership of the mx_peer buffer handed back by
         * ompi_modex_recv() is not visible from here - confirm whether it
         * must be freed on this path as well. */
        OBJ_RELEASE(mtl_mx_endpoint);
        return NULL;
    }

    return mtl_mx_endpoint;
}
Example #3
0
/**
 * Blocking MX send.
 *
 * Packs the convertor's data into a single MX segment, posts it with
 * mx_issend() (synchronous mode) or mx_isend() (every other mode) and then
 * polls mx_test() until the request completes.
 *
 * @param mtl        must be &ompi_mtl_mx.super (asserted).
 * @param comm       communicator; its context id and local rank are encoded
 *                   into the match bits.
 * @param dest       rank of the destination in comm.
 * @param tag        message tag, encoded into the match bits.
 * @param convertor  describes the data to pack and send.
 * @param mode       MCA_PML_BASE_SEND_SYNCHRONOUS selects mx_issend().
 *
 * @return OMPI_SUCCESS once the send has completed, OMPI_ERROR when the send
 *         could not be posted.  A failing mx_test() call or a completion
 *         status other than MX_STATUS_SUCCESS terminates the process with
 *         abort().
 */
int
ompi_mtl_mx_send(struct mca_mtl_base_module_t* mtl,
                 struct ompi_communicator_t* comm,
                 int dest,
                 int tag,
                 struct opal_convertor_t *convertor,
                 mca_pml_base_send_mode_t mode)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t mtl_mx_request;
    size_t length;
    mx_status_t mx_status;
    uint32_t result;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint = (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    const char* where;   /* name of the MX call used, for diagnostics */

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    /* NOTE(review): any status reported by ompi_mtl_datatype_pack() is not
     * inspected here - confirm whether it can fail. */
    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request.mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request.free_after);

    mtl_mx_request.mx_segment[0].segment_length = length;
    mtl_mx_request.convertor = convertor;
    mtl_mx_request.type = OMPI_MTL_MX_ISEND;

    /* The "0x" prefix requires a hexadecimal conversion. */
    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "issend bits: 0x%016" PRIx64 "\n",
                         match_bits));

    if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request.mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               &mtl_mx_request,
                               &mtl_mx_request.mx_request
                               );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request.mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              &mtl_mx_request,
                              &mtl_mx_request.mx_request
                              );
        where = "mx_isend";
    }
    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when the hostname cannot be resolved.
         * snprintf bounds the write to the buffer and always terminates it. */
        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output, "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);

        /* Free buffer if needed */
        if(mtl_mx_request.free_after) {
            free(mtl_mx_request.mx_segment[0].segment_ptr);
        }
        return OMPI_ERROR;
    }

    /* Poll until MX reports the request as complete (result != 0). */
    do {
        mx_return = mx_test(ompi_mtl_mx.mx_endpoint,
                            &mtl_mx_request.mx_request,
                            &mx_status,
                            &result);
        if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output, "Error in mx_test (error %s)\n", mx_strerror(mx_return));
            abort();
        }
        /* mx_status is only examined once the request has completed. */
        if( OPAL_UNLIKELY(result && mx_status.code != MX_STATUS_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in ompi_mtl_mx_send, mx_test returned something other than MX_STATUS_SUCCESS: mx_status.code = %d.\n",
                        mx_status.code);
            abort();
        }
    } while(!result);

    /* Free buffer if needed */
    if(mtl_mx_request.free_after) {
        free(mtl_mx_request.mx_segment[0].segment_ptr);
    }

    return OMPI_SUCCESS;
}
Example #4
0
/**
 * Non-blocking MX send.
 *
 * Packs the convertor's data into the single MX segment of the caller's
 * request and posts it with mx_issend() (synchronous mode) or mx_isend()
 * (every other mode).  This function does not wait for completion.
 *
 * @param mtl          must be &ompi_mtl_mx.super (asserted).
 * @param comm         communicator; its context id and local rank are
 *                     encoded into the match bits.
 * @param dest         rank of the destination in comm.
 * @param tag          message tag, encoded into the match bits.
 * @param convertor    describes the data to pack and send.
 * @param mode         MCA_PML_BASE_SEND_SYNCHRONOUS selects mx_issend().
 * @param blocking     not used by this function.
 * @param mtl_request  request storage, used as a mca_mtl_mx_request_t.
 *
 * @return OMPI_SUCCESS when the send was posted, OMPI_ERROR otherwise.  On
 *         error a buffer allocated by the pack step is released and the
 *         request no longer refers to it.
 */
int
ompi_mtl_mx_isend(struct mca_mtl_base_module_t* mtl,
                  struct ompi_communicator_t* comm,
                  int dest,
                  int tag,
                  struct opal_convertor_t *convertor,
                  mca_pml_base_send_mode_t mode,
                  bool blocking,
                  mca_mtl_request_t * mtl_request)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t * mtl_mx_request = (mca_mtl_mx_request_t*) mtl_request;
    size_t length;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint = (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    const char* where;   /* name of the MX call used, for diagnostics */

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    /* NOTE(review): any status reported by ompi_mtl_datatype_pack() is not
     * inspected here - confirm whether it can fail. */
    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request->mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request->free_after);
    mtl_mx_request->mx_segment[0].segment_length = length;
    mtl_mx_request->convertor = convertor;
    mtl_mx_request->type = OMPI_MTL_MX_ISEND;

    /* The "0x" prefix requires a hexadecimal conversion. */
    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "issend bits: 0x%016" PRIx64 "\n", match_bits));

    if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request->mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               mtl_mx_request,
                               &mtl_mx_request->mx_request
                               );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request->mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              mtl_mx_request,
                              &mtl_mx_request->mx_request
                              );
        where = "mx_isend";
    }
    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* Fall back to the raw NIC id when the hostname cannot be resolved.
         * snprintf bounds the write to the buffer and always terminates it. */
        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) {
            snprintf( peer_name, sizeof(peer_name), "unknown %lx nic_id",
                      (unsigned long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output, "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);

        /* The send was never posted, so no completion will come along to
         * release the packed buffer: free it here, as the blocking send
         * does, and clear the request so it cannot be freed a second time. */
        if(mtl_mx_request->free_after) {
            free(mtl_mx_request->mx_segment[0].segment_ptr);
            mtl_mx_request->mx_segment[0].segment_ptr = NULL;
            mtl_mx_request->free_after = false;
        }
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}