int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
{
    int num_retry = 0;
    mx_return_t mx_status;
    mx_endpoint_addr_t mx_remote_addr;

    module_endpoint->status = MCA_BTL_MX_CONNECTION_PENDING;

 retry_connect:
    mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
                            module_endpoint->mx_peer->nic_id,
                            module_endpoint->mx_peer->endpoint_id,
                            mca_btl_mx_component.mx_filter,
                            mca_btl_mx_component.mx_timeout,
                            &mx_remote_addr );
    if( MX_SUCCESS != mx_status ) {
        if( MX_TIMEOUT == mx_status ) {
            if( num_retry++ < mca_btl_mx_component.mx_connection_retries )
                goto retry_connect;
        }
        {
            char peer_name[MX_MAX_HOSTNAME_LEN];

            if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer->nic_id, peer_name ) )
                sprintf( peer_name, "unknown %lx nic_id", (long)module_endpoint->mx_peer->nic_id );

            opal_output( 0, "mx_connect fail for %s with key %x (error %s)\n\tUnique ID (local %x remote %x)\n",
                         peer_name, mca_btl_mx_component.mx_filter, mx_strerror(mx_status),
                         module_endpoint->endpoint_btl->mx_unique_network_id,
                         module_endpoint->mx_peer->unique_network_id );
        }
        module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE;
        return OMPI_ERROR;
    }
    module_endpoint->mx_peer_addr = mx_remote_addr;
    module_endpoint->status = MCA_BTL_MX_CONNECTED;

    return OMPI_SUCCESS;
}
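/* A minimal, self-contained sketch of the connect-with-retry pattern used
 * above, written against the public MX API only.  The helper name and the
 * retry/timeout parameters are illustrative assumptions, not part of the
 * original BTL code. */
#include <stdio.h>
#include "myriexpress.h"

static mx_return_t connect_with_retry( mx_endpoint_t ep, uint64_t nic_id,
                                        uint32_t eid, uint32_t filter,
                                        uint32_t timeout_ms, int max_retries,
                                        mx_endpoint_addr_t* addr )
{
    int attempt = 0;
    mx_return_t rc;

    /* Retry only on MX_TIMEOUT; any other error is returned immediately. */
    do {
        rc = mx_connect( ep, nic_id, eid, filter, timeout_ms, addr );
    } while( MX_TIMEOUT == rc && attempt++ < max_retries );

    if( MX_SUCCESS != rc ) {
        fprintf( stderr, "mx_connect failed after %d attempt(s): %s\n",
                 attempt + 1, mx_strerror(rc) );
    }
    return rc;
}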
int ompi_common_mx_initialize(void)
{
    mx_return_t mx_return;
    struct mca_mpool_base_resources_t mpool_resources;
    int index, value;

    ompi_common_mx_initialize_ref_cnt++;

    if(ompi_common_mx_initialize_ref_cnt == 1) {
        /* set the MX error handle to always return. This function is the
         * only MX function allowed to be called before mx_init in order
         * to make sure that if the MX is not up and running the MX
         * library does not exit the application. */
        mx_set_error_handler(MX_ERRORS_RETURN);

        /* If we have a memory manager available, and mpi_leave_pinned == -1,
           then set mpi_leave_pinned to 1.

           We have a memory manager if:
           - we have both FREE and MUNMAP support
           - we have MUNMAP support and the linux mallopt */
        value = opal_mem_hooks_support_level();
        if ((value & (OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT))
            == (OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)) {
            index = mca_base_param_find("mpi", NULL, "leave_pinned");
            if (index >= 0) {
                if ((mca_base_param_lookup_int(index, &value) == OPAL_SUCCESS)
                    && (value == -1)) {
                    ompi_mpi_leave_pinned = 1;
                    setenv("MX_RCACHE", "2", 1);
                    mpool_resources.regcache_clean = mx__regcache_clean;
                    ompi_common_mx_fake_mpool =
                        mca_mpool_base_module_create("fake", NULL, &mpool_resources);
                    if (!ompi_common_mx_fake_mpool) {
                        ompi_mpi_leave_pinned = 0;
                        setenv("MX_RCACHE", "0", 1);
                        opal_output(0, "Error creating fake mpool (error %s)\n",
                                    strerror(errno));
                    }
                }
            }
        }

        /* initialize the mx library */
        mx_return = mx_init();
        if(MX_SUCCESS != mx_return) {
            opal_output(0, "Error in mx_init (error %s)\n", mx_strerror(mx_return));
            return OMPI_ERR_NOT_AVAILABLE;
        }
    }

    return OMPI_SUCCESS;
}
mca_btl_base_descriptor_t* mca_btl_mx_prepare_dst( struct mca_btl_base_module_t* btl,
                                                   struct mca_btl_base_endpoint_t* endpoint,
                                                   struct mca_mpool_base_registration_t* registration,
                                                   struct ompi_convertor_t* convertor,
                                                   uint8_t order,
                                                   size_t reserve,
                                                   size_t* size,
                                                   uint32_t flags)
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*)btl;
    mca_btl_mx_frag_t* frag;
    mx_return_t mx_return;
    mx_segment_t mx_segment;
    int rc;

    MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc);
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    frag->segment[0].seg_len       = *size;
    ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment[0].seg_addr.pval) );
    frag->segment[0].seg_key.key64 = (uint64_t)(intptr_t)frag;

    mx_segment.segment_ptr    = frag->segment[0].seg_addr.pval;
    mx_segment.segment_length = frag->segment[0].seg_len;
    mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1,
                          frag->segment[0].seg_key.key64, BTL_MX_PUT_MASK,
                          NULL, &(frag->mx_request) );
    if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
        opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" );
        MCA_BTL_MX_FRAG_RETURN( btl, frag );
        return NULL;
    }

#ifdef HAVE_MX_FORGET
    {
        mx_return = mx_forget( mx_btl->mx_endpoint, &(frag->mx_request) );
        if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
            opal_output( 0, "mx_forget failed in mca_btl_mx_prepare_dst with error %d (%s)\n",
                         mx_return, mx_strerror(mx_return) );
            return NULL;
        }
    }
#endif

    /* Allow the fragment to be recycled using the mca_btl_mx_free function */
    frag->type = MCA_BTL_MX_SEND;
    frag->base.des_dst     = frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_flags   = flags;
    frag->base.order       = MCA_BTL_NO_ORDER;

    return &frag->base;
}
int ompi_mtl_mx_module_init()
{
    mx_param_t mx_param;
    mx_return_t mx_return;

    /* setup params */
    mx_param.key                 = MX_PARAM_UNEXP_QUEUE_MAX;
    mx_param.val.unexp_queue_max = ompi_mtl_mx.mx_unexp_queue_max;

    /* get a local endpoint */
    mx_return = mx_open_endpoint(MX_ANY_NIC,
                                 MX_ANY_ENDPOINT,
                                 ompi_mtl_mx.mx_filter,
                                 NULL,
                                 0,
                                 &ompi_mtl_mx.mx_endpoint);
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_output, "Error in mx_open_endpoint (error %s)\n",
                    mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    /* get the endpoint address */
    mx_return = mx_get_endpoint_addr( ompi_mtl_mx.mx_endpoint,
                                      &ompi_mtl_mx.mx_endpoint_addr);
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_output, "Error in mx_get_endpoint_addr (error %s)\n",
                    mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    mx_return = mx_decompose_endpoint_addr( ompi_mtl_mx.mx_endpoint_addr,
                                            &(ompi_mtl_mx.mx_addr.nic_id),
                                            &(ompi_mtl_mx.mx_addr.endpoint_id) );
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_output, "Error in mx_decompose_endpoint_addr (error %s)\n",
                    mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    ompi_modex_send( &mca_mtl_mx_component.super.mtl_version,
                     &ompi_mtl_mx.mx_addr,
                     sizeof(mca_mtl_mx_addr_t));

    /* register the mtl mx progress function */
    opal_progress_register(ompi_mtl_mx_progress);

    return OMPI_SUCCESS;
}
int ompi_mtl_mx_finalize(struct mca_mtl_base_module_t* mtl)
{
    mx_return_t mx_return;

    opal_progress_unregister(ompi_mtl_mx_progress);

    /* free resources */
    mx_return = mx_close_endpoint(ompi_mtl_mx.mx_endpoint);
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_output, "Error in mx_close_endpoint (error %s)\n",
                    mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    return ompi_common_mx_finalize();
}
mca_mtl_mx_endpoint_t* mca_mtl_mx_endpoint_create(ompi_proc_t* ompi_proc)
{
    mca_mtl_mx_endpoint_t* mtl_mx_endpoint = NULL;
    int rc;
    mca_mtl_mx_addr_t *mx_peer;
    size_t size;
    mx_return_t mx_return;
    int num_retry = 0;

    /* get the remote proc's address (only one) */
    rc = ompi_modex_recv(&mca_mtl_mx_component.super.mtl_version,
                         ompi_proc, (void**)&mx_peer, &size);
    if( rc != OMPI_SUCCESS || size != sizeof(mca_mtl_mx_addr_t)) {
        return NULL;
    }

    mtl_mx_endpoint = (mca_mtl_mx_endpoint_t*) OBJ_NEW(mca_mtl_mx_endpoint_t);
    mtl_mx_endpoint->mx_peer = mx_peer;

 retry_connect:
    mx_return = mx_connect(ompi_mtl_mx.mx_endpoint,
                           mx_peer->nic_id,
                           mx_peer->endpoint_id,
                           ompi_mtl_mx.mx_filter,
                           ompi_mtl_mx.mx_timeout,
                           &mtl_mx_endpoint->mx_peer_addr);
    if(MX_SUCCESS != mx_return) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        if(MX_TIMEOUT == mx_return) {
            if( num_retry++ < ompi_mtl_mx.mx_retries ) {
                goto retry_connect;
            }
        }

        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_peer->nic_id, peer_name)) {
            sprintf( peer_name, "unknown %lx nic_id", (long)mx_peer->nic_id );
        }

        opal_output(ompi_mtl_base_output, "mx_connect fail for %s with key %x (error %s)\n",
                    peer_name, ompi_mtl_mx.mx_filter, mx_strerror(mx_return) );
        return NULL;
    }

    return mtl_mx_endpoint;
}
int ompi_common_mx_finalize(void)
{
    mx_return_t mx_return;

    ompi_common_mx_initialize_ref_cnt--;
    if( 0 == ompi_common_mx_initialize_ref_cnt ) {
        if (ompi_common_mx_fake_mpool) {
            mca_mpool_base_module_destroy(ompi_common_mx_fake_mpool);
        }

        mx_return = mx_finalize();
        if(mx_return != MX_SUCCESS) {
            opal_output(0, "Error in mx_finalize (error %s)\n", mx_strerror(mx_return));
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}
int ompi_mtl_mx_iprobe(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t *comm,
                       int src,
                       int tag,
                       int *flag,
                       struct ompi_status_public_t *status)
{
    uint32_t result;
    mx_return_t ret;
    mx_status_t mx_status;
    uint64_t match_bits;
    uint64_t mask_bits;

    MX_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag);

    ret = mx_iprobe(ompi_mtl_mx.mx_endpoint, match_bits, mask_bits,
                    &mx_status, &result);
    if (MX_SUCCESS != ret) {
        opal_output(ompi_mtl_base_output, "Error in mx_iprobe (error %s)\n",
                    mx_strerror(ret));
        return OMPI_ERROR;
    }

    if (result) {
        if(MPI_STATUS_IGNORE != status) {
            MX_GET_SRC(mx_status.match_info, status->MPI_SOURCE);
            MX_GET_TAG(mx_status.match_info, status->MPI_TAG);
            status->_ucount = mx_status.msg_length;
        }
        *flag = 1;
    } else {
        *flag = 0;
    }

    return OMPI_SUCCESS;
}
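/* Illustrative only: probing with an explicit 64-bit matching scheme instead
 * of OMPI's MX_SET_RECV_BITS macro.  The layout below (high 32 bits = context
 * id, low 32 bits = tag) is an assumption made for this sketch; it is not the
 * bit layout the MTL actually uses. */
#include <stdint.h>
#include "myriexpress.h"

static int probe_pending( mx_endpoint_t ep, uint32_t context_id, uint32_t tag,
                          mx_status_t* status_out )
{
    uint64_t match = ((uint64_t)context_id << 32) | (uint64_t)tag;
    uint64_t mask  = ~UINT64_C(0);   /* require an exact match on every bit */
    uint32_t found = 0;

    if( MX_SUCCESS != mx_iprobe( ep, match, mask, status_out, &found ) ) {
        return -1;                   /* the probe itself failed */
    }
    return (int)found;               /* 1 if a matching message is queued */
}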
/**
 * Initiate an asynchronous put.
 *
 * @param btl (IN)        BTL module
 * @param endpoint (IN)   BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 */
static int mca_btl_mx_put( struct mca_btl_base_module_t* btl,
                           struct mca_btl_base_endpoint_t* endpoint,
                           struct mca_btl_base_descriptor_t* descriptor )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*)btl;
    mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor;
    mx_segment_t mx_segment[2];
    mx_return_t mx_return;
    uint32_t i = 0;

    if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( MCA_BTL_MX_CONNECTION_PENDING == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERR_OUT_OF_RESOURCE;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }

    frag->endpoint = endpoint;
    frag->type     = MCA_BTL_MX_SEND;
    descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

    do {
        mx_segment[i].segment_ptr    = descriptor->des_src[i].seg_addr.pval;
        mx_segment[i].segment_length = descriptor->des_src[i].seg_len;
    } while (++i < descriptor->des_src_cnt);

    mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt,
                          endpoint->mx_peer_addr,
                          descriptor->des_dst[0].seg_key.key64, frag, &frag->mx_request );
    if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
        opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
int main(int argc, char **argv)
{
    mx_endpoint_t ep;
    uint64_t nic_id;
    uint16_t my_eid;
    uint64_t his_nic_id;
    uint32_t board_id;
    uint32_t filter;
    uint16_t his_eid;
    mx_endpoint_addr_t his_addr;
    char *rem_host;
    int len;
    int iter;
    int c;
    int do_wait;
    int do_bothways;
    extern char *optarg;
    mx_return_t ret;

#if DEBUG
    extern int mx_debug_mask;
    mx_debug_mask = 0xFFF;
#endif

    mx_init();
    MX_MUTEX_INIT(&stream_mutex);

    /* set up defaults */
    rem_host = NULL;
    filter = FILTER;
    my_eid = DFLT_EID;
    his_eid = DFLT_EID;
    board_id = MX_ANY_NIC;
    len = DFLT_LEN;
    iter = DFLT_ITER;
    do_wait = 0;
    do_bothways = 0;
    num_threads = 1;

    while ((c = getopt(argc, argv, "hd:e:f:n:b:r:l:N:Vvwx")) != EOF) {
        switch (c) {
        case 'd':
            rem_host = optarg;
            break;
        case 'e':
            my_eid = atoi(optarg);
            break;
        case 'f':
            filter = atoi(optarg);
            break;
        case 'n':
            sscanf(optarg, "%"SCNx64, &nic_id);
            mx_nic_id_to_board_number(nic_id, &board_id);
            break;
        case 'b':
            board_id = atoi(optarg);
            break;
        case 'r':
            his_eid = atoi(optarg);
            break;
        case 'l':
            len = atoi(optarg);
            if (len > MAX_LEN) {
                fprintf(stderr, "len too large, max is %d\n", MAX_LEN);
                exit(1);
            }
            break;
        case 'N':
            iter = atoi(optarg);
            break;
        case 'V':
            Verify = 1;
            break;
        case 'v':
            do_verbose = 1;
            break;
        case 'w':
            do_wait = 1;
            break;
        case 'x':
#if MX_THREAD_SAFE
            do_bothways = 1;
#else
            fprintf(stderr, "bi-directional mode only supported with threadsafe mx lib\n");
            exit(1);
#endif
            break;
        case 'h':
        default:
            usage();
            exit(1);
        }
    }

    if (rem_host != NULL)
        num_threads += do_bothways;

    ret = mx_open_endpoint(board_id, my_eid, filter, NULL, 0, &ep);
    if (ret != MX_SUCCESS) {
        fprintf(stderr, "Failed to open endpoint %s\n", mx_strerror(ret));
        exit(1);
    }

    /* If no host, we are receiver */
    if (rem_host == NULL) {
        if (do_verbose)
            printf("Starting streaming receiver\n");
        if (Verify) {
            fprintf(stderr, "-V ignored. Verify must be set by sender\n");
            Verify = 0;
        }
        if (do_wait)
            receiver_blocking(ep, MATCH_VAL_MAIN, filter);
        else
            receiver_polling(ep, MATCH_VAL_MAIN, filter);
    } else {
        /* get address of destination */
        mx_hostname_to_nic_id(rem_host, &his_nic_id);
        mx_connect(ep, his_nic_id, his_eid, filter, MX_INFINITE, &his_addr);
        if (do_verbose)
            printf("Starting streaming send to host %s\n", rem_host);
        if (Verify)
            printf("Verifying results\n");

        /* start up the sender */
        if (do_wait)
            sender_blocking(ep, his_addr, iter, len, do_bothways, MATCH_VAL_MAIN);
        else
            sender_polling(ep, his_addr, iter, len, do_bothways, MATCH_VAL_MAIN);
    }

    mx_close_endpoint(ep);
    mx_finalize();
    exit(0);
}
int ompi_mtl_mx_isend(struct mca_mtl_base_module_t* mtl,
                      struct ompi_communicator_t* comm,
                      int dest,
                      int tag,
                      struct opal_convertor_t *convertor,
                      mca_pml_base_send_mode_t mode,
                      bool blocking,
                      mca_mtl_request_t *mtl_request)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t *mtl_mx_request = (mca_mtl_mx_request_t*) mtl_request;
    size_t length;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint =
        (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    char* where;

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request->mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request->free_after);

    mtl_mx_request->mx_segment[0].segment_length = length;
    mtl_mx_request->convertor = convertor;
    mtl_mx_request->type      = OMPI_MTL_MX_ISEND;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "send bits: 0x%016" PRIx64 "\n", match_bits));

    if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request->mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               mtl_mx_request,
                               &mtl_mx_request->mx_request );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request->mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              mtl_mx_request,
                              &mtl_mx_request->mx_request );
        where = "mx_isend";
    }

    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];
        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) {
            sprintf( peer_name, "unknown %lx nic_id", (long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
                     struct mca_btl_base_endpoint_t* endpoint,
                     struct mca_btl_base_descriptor_t* descriptor,
                     mca_btl_base_tag_t tag )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*)btl;
    mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor;
    mx_segment_t mx_segment[2];
    mx_return_t mx_return;
    uint64_t total_length = 0, tag64;
    uint32_t i = 0;
    int btl_ownership = (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

    if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( MCA_BTL_MX_CONNECTION_PENDING == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERR_OUT_OF_RESOURCE;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }

    frag->endpoint = endpoint;
    frag->type     = MCA_BTL_MX_SEND;

    do {
        mx_segment[i].segment_ptr    = descriptor->des_src[i].seg_addr.pval;
        mx_segment[i].segment_length = descriptor->des_src[i].seg_len;
        total_length += descriptor->des_src[i].seg_len;
    } while (++i < descriptor->des_src_cnt);

    tag64 = 0x01ULL | (((uint64_t)tag) << 8);
    mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt,
                          endpoint->mx_peer_addr, tag64, frag, &frag->mx_request );
    if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
        opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
        return OMPI_ERROR;
    }

#ifdef HAVE_MX_FORGET
    {
        uint32_t mx_result;

        mx_return = mx_ibuffered( mx_btl->mx_endpoint, &(frag->mx_request), &mx_result );
        if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
            opal_output( 0, "mx_ibuffered failed with error %d (%s)\n",
                         mx_return, mx_strerror(mx_return) );
            frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
            return OMPI_ERROR;
        }
        if( mx_result ) {
            mx_return = mx_forget( mx_btl->mx_endpoint, &(frag->mx_request) );
            if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
                opal_output( 0, "mx_forget failed with error %d (%s)\n",
                             mx_return, mx_strerror(mx_return) );
                frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
                return OMPI_SUCCESS;
            }

            if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
                frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
                                       &(frag->base), OMPI_SUCCESS);
            }
            if( btl_ownership ) {
                MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
            }
            return 1;
        }
    }
#endif

    if( 2048 > total_length ) {
        mx_status_t mx_status;
        uint32_t mx_result;

        /* let's check for completeness */
        mx_return = mx_test( mx_btl->mx_endpoint, &(frag->mx_request),
                             &mx_status, &mx_result );
        if( OPAL_LIKELY(MX_SUCCESS == mx_return) ) {
            if( mx_result ) {
                if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
                    frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
                                           &(frag->base), OMPI_SUCCESS);
                }
                if( btl_ownership ) {
                    MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
                }
                return 1;
            }
        }
    }

    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    return OMPI_SUCCESS;
}
/**
 * Initiate an inline send to the peer.  If the send cannot be completed
 * inline, allocate and return a descriptor so the caller can fall back to a
 * regular send.
 *
 * @param btl (IN)  BTL module
 * @param peer (IN) BTL peer addressing
 */
static int mca_btl_mx_sendi( struct mca_btl_base_module_t* btl,
                             struct mca_btl_base_endpoint_t* endpoint,
                             struct ompi_convertor_t* convertor,
                             void* header,
                             size_t header_size,
                             size_t payload_size,
                             uint8_t order,
                             uint32_t flags,
                             mca_btl_base_tag_t tag,
                             mca_btl_base_descriptor_t** descriptor )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*) btl;
    size_t max_data;

    if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( MCA_BTL_MX_CONNECTION_PENDING == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERR_OUT_OF_RESOURCE;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }

    if( !ompi_convertor_need_buffers(convertor) ) {
        uint32_t mx_segment_count = 0;
        uint64_t tag64 = 0x01ULL | (((uint64_t)tag) << 8);
        mx_return_t mx_return;
        mx_request_t mx_request;
        mx_segment_t mx_segments[2], *mx_segment = mx_segments;

        if( 0 != header_size ) {
            mx_segment->segment_ptr    = header;
            mx_segment->segment_length = header_size;
            mx_segment++;
            mx_segment_count++;
        }
        if( 0 != payload_size ) {
            struct iovec iov;
            uint32_t iov_count = 1;

            iov.iov_base = NULL;
            iov.iov_len  = payload_size;
            (void)ompi_convertor_pack( convertor, &iov, &iov_count, &max_data );
            assert( max_data == payload_size );

            mx_segment->segment_ptr    = iov.iov_base;
            mx_segment->segment_length = max_data;
            mx_segment_count++;
        }

        mx_return = mx_isend( mx_btl->mx_endpoint, mx_segments, mx_segment_count,
                              endpoint->mx_peer_addr, tag64, NULL, &mx_request );
        if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
            opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
            return OMPI_ERROR;
        }

#ifdef HAVE_MX_FORGET
        {
            uint32_t mx_result;

            mx_return = mx_ibuffered( mx_btl->mx_endpoint, &mx_request, &mx_result );
            if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
                opal_output( 0, "mx_ibuffered failed with error %d (%s)\n",
                             mx_return, mx_strerror(mx_return) );
                return OMPI_SUCCESS;
            }
            if( mx_result ) {
                mx_return = mx_forget( mx_btl->mx_endpoint, &mx_request );
                if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
                    opal_output( 0, "mx_forget failed with error %d (%s)\n",
                                 mx_return, mx_strerror(mx_return) );
                }
            }
            return OMPI_SUCCESS;
        }
#endif
    }

    /* No optimization on this path. Just allocate a descriptor and return it
     * to the user. */
    *descriptor = mca_btl_mx_alloc( btl, endpoint, order,
                                    header_size + payload_size, flags );
    return OMPI_ERR_RESOURCE_BUSY;
}
int ompi_mtl_mx_send(struct mca_mtl_base_module_t* mtl,
                     struct ompi_communicator_t* comm,
                     int dest,
                     int tag,
                     struct opal_convertor_t *convertor,
                     mca_pml_base_send_mode_t mode)
{
    mx_return_t mx_return;
    uint64_t match_bits;
    mca_mtl_mx_request_t mtl_mx_request;
    size_t length;
    mx_status_t mx_status;
    uint32_t result;
    ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest );
    mca_mtl_mx_endpoint_t* mx_endpoint =
        (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
    char* where;

    assert(mtl == &ompi_mtl_mx.super);

    MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag);

    ompi_mtl_datatype_pack(convertor,
                           &mtl_mx_request.mx_segment[0].segment_ptr,
                           &length,
                           &mtl_mx_request.free_after);

    mtl_mx_request.mx_segment[0].segment_length = length;
    mtl_mx_request.convertor = convertor;
    mtl_mx_request.type      = OMPI_MTL_MX_ISEND;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "send bits: 0x%016" PRIx64 "\n", match_bits));

    if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) {
        mx_return = mx_issend( ompi_mtl_mx.mx_endpoint,
                               mtl_mx_request.mx_segment,
                               1,
                               mx_endpoint->mx_peer_addr,
                               match_bits,
                               &mtl_mx_request,
                               &mtl_mx_request.mx_request );
        where = "mx_issend";
    } else {
        mx_return = mx_isend( ompi_mtl_mx.mx_endpoint,
                              mtl_mx_request.mx_segment,
                              1,
                              mx_endpoint->mx_peer_addr,
                              match_bits,
                              &mtl_mx_request,
                              &mtl_mx_request.mx_request );
        where = "mx_isend";
    }

    if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];
        if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) {
            sprintf( peer_name, "unknown %lx nic_id", (long)mx_endpoint->mx_peer->nic_id );
        }
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in %s (error %s) sending to %s\n",
                    where, mx_strerror(mx_return), peer_name);

        /* Free buffer if needed */
        if(mtl_mx_request.free_after) {
            free(mtl_mx_request.mx_segment[0].segment_ptr);
        }
        return OMPI_ERROR;
    }

    do {
        mx_return = mx_test(ompi_mtl_mx.mx_endpoint,
                            &mtl_mx_request.mx_request,
                            &mx_status,
                            &result);
        if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in mx_test (error %s)\n", mx_strerror(mx_return));
            abort();
        }
        if( OPAL_UNLIKELY(result && mx_status.code != MX_STATUS_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in ompi_mtl_mx_send, mx_test returned something other than MX_STATUS_SUCCESS: mx_status.code = %d.\n",
                        mx_status.code);
            abort();
        }
    } while(!result);

    /* Free buffer if needed */
    if(mtl_mx_request.free_after) {
        free(mtl_mx_request.mx_segment[0].segment_ptr);
    }

    return OMPI_SUCCESS;
}
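/* The send above completes the request by spinning on mx_test().  Below is a
 * sketch of the blocking alternative using mx_wait() with an infinite
 * timeout; the helper name is hypothetical, and the MTL may deliberately poll
 * instead so that other progress can be driven between tests. */
static int wait_for_send_completion( mx_endpoint_t ep, mx_request_t* req )
{
    mx_status_t status;
    uint32_t result = 0;
    mx_return_t rc;

    rc = mx_wait( ep, req, MX_INFINITE, &status, &result );
    if( MX_SUCCESS != rc || 0 == result || MX_STATUS_SUCCESS != status.code ) {
        return -1;   /* wait failed, nothing completed, or the send ended in error */
    }
    return 0;
}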
int MPID_nem_mx_cancel_recv(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    mx_request_t *mx_request = NULL;
    mx_return_t ret;
    uint32_t result;
    int mpi_errno = MPI_SUCCESS;
    int handled = FALSE;

    mx_request = &(REQ_FIELD(rreq,mx_request));
    /* FIXME this test is probably not correct with multiple netmods */
    /* We need to know to which netmod a recv request actually "belongs" to */
    if(mx_request != NULL) {
        ret = mx_cancel(MPID_nem_mx_local_endpoint, mx_request, &result);
        MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                             "**mx_cancel", "**mx_cancel %s", mx_strerror(ret));

        if (result) {
            int found;

            rreq->status.cancelled = TRUE;
            found = MPIDI_CH3U_Recvq_DP(rreq);
            MPIU_Assert(found);
            rreq->status.count = 0;
            MPID_REQUEST_SET_COMPLETED(rreq);
            MPID_Request_release(rreq);
        }
        else {
            rreq->status.cancelled = FALSE;
            MPIU_DBG_MSG_P(CH3_OTHER, VERBOSE,
                           "request 0x%08x already matched, unable to cancel", rreq->handle);
        }
        handled = TRUE;
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int ompi_mtl_mx_progress( void )
{
    mx_return_t mx_return;
    mx_request_t mx_request;
    mx_status_t mx_status;
    uint32_t result;
    mca_mtl_mx_request_t* mtl_mx_request;
    int completed = 0;

    while(1) {
        mx_return = mx_ipeek(ompi_mtl_mx.mx_endpoint, &mx_request, &result);
        if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
            opal_output(ompi_mtl_base_framework.framework_output,
                        "Error in mx_ipeek (error %s)\n", mx_strerror(mx_return));
        }

        if(result) {
            completed++;
            mx_return = mx_test(ompi_mtl_mx.mx_endpoint, &mx_request, &mx_status, &result);
            if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) {
                opal_output(ompi_mtl_base_framework.framework_output,
                            "Error in mx_test (error %s)\n", mx_strerror(mx_return));
                abort();
            }
            if( OPAL_UNLIKELY(0 == result) ) {
                opal_output(ompi_mtl_base_framework.framework_output,
                            "Error in ompi_mtl_mx_progress: mx_ipeek returned a request, but mx_test on that request reported no completion.\n");
                abort();
            }

            mtl_mx_request = (mca_mtl_mx_request_t*) mx_status.context;
            if(OMPI_MTL_MX_ISEND == mtl_mx_request->type) {
                if(mtl_mx_request->free_after) {
                    free(mtl_mx_request->mx_segment[0].segment_ptr);
                }
            } else {
                assert( OMPI_MTL_MX_IRECV == mtl_mx_request->type );

                ompi_mtl_datatype_unpack(mtl_mx_request->convertor,
                                         mtl_mx_request->mx_segment[0].segment_ptr,
                                         mx_status.xfer_length);

                /* set the status */
                MX_GET_SRC(mx_status.match_info,
                           mtl_mx_request->super.ompi_req->req_status.MPI_SOURCE);
                MX_GET_TAG(mx_status.match_info,
                           mtl_mx_request->super.ompi_req->req_status.MPI_TAG);
                mtl_mx_request->super.ompi_req->req_status._ucount = mx_status.xfer_length;
            }

            /* suppose everything went just fine ... */
            mtl_mx_request->super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
            if( OPAL_UNLIKELY(MX_STATUS_SUCCESS != mx_status.code) ) {
                if( MX_STATUS_TRUNCATED == mx_status.code ) {
                    mtl_mx_request->super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
                } else {
                    mtl_mx_request->super.ompi_req->req_status.MPI_ERROR = MPI_ERR_INTERN;
                }
                return completed;
            }
            mtl_mx_request->super.completion_callback(&mtl_mx_request->super);
            return completed;
        } else {
            return completed;
        }
    }
}
int MPID_nem_mx_vc_init (MPIDI_VC_t *vc)
{
    uint32_t threshold;
    MPIDI_CH3I_VC *vc_ch = VC_CH(vc);
    int mpi_errno = MPI_SUCCESS;

    /* first make sure that our private fields in the vc fit into the area provided */
    MPIU_Assert(sizeof(MPID_nem_mx_vc_area) <= MPID_NEM_VC_NETMOD_AREA_LEN);

#ifdef ONDEMAND
    VC_FIELD(vc, local_connected)  = 0;
    VC_FIELD(vc, remote_connected) = 0;
#else
    {
        char *business_card;
        int val_max_sz;
        int ret;
#ifdef USE_PMI2_API
        val_max_sz = PMI2_MAX_VALLEN;
#else
        mpi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
#endif
        business_card = (char *)MPIU_Malloc(val_max_sz);

        mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card, val_max_sz, vc->pg);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        mpi_errno = MPID_nem_mx_get_from_bc(business_card,
                                            &VC_FIELD(vc, remote_endpoint_id),
                                            &VC_FIELD(vc, remote_nic_id));
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        MPIU_Free(business_card);

        ret = mx_connect(MPID_nem_mx_local_endpoint,
                         VC_FIELD(vc, remote_nic_id),
                         VC_FIELD(vc, remote_endpoint_id),
                         MPID_NEM_MX_FILTER,
                         MX_INFINITE,
                         &(VC_FIELD(vc, remote_endpoint_addr)));
        MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                             "**mx_connect", "**mx_connect %s", mx_strerror(ret));

        mx_set_endpoint_addr_context(VC_FIELD(vc, remote_endpoint_addr), (void *)vc);
        MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
    }
#endif

    mx_get_info(MPID_nem_mx_local_endpoint, MX_COPY_SEND_MAX,
                NULL, 0, &threshold, sizeof(uint32_t));
    vc->eager_max_msg_sz = threshold;
    vc->rndvSend_fn      = NULL;
    vc->sendNoncontig_fn = MPID_nem_mx_SendNoncontig;
    vc->comm_ops         = &comm_ops;
    vc_ch->iStartContigMsg = MPID_nem_mx_iStartContigMsg;
    vc_ch->iSendContig     = MPID_nem_mx_iSendContig;

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
static int init_mx( MPIDI_PG_t *pg_p )
{
    mx_endpoint_addr_t local_endpoint_addr;
    mx_return_t ret;
    mx_param_t param;
    int mpi_errno = MPI_SUCCESS;
    int r;

    r = MPL_putenv("MX_DISABLE_SHARED=1");
    MPIU_ERR_CHKANDJUMP(r, mpi_errno, MPI_ERR_OTHER, "**putenv");
    r = MPL_putenv("MX_DISABLE_SELF=1");
    MPIU_ERR_CHKANDJUMP(r, mpi_errno, MPI_ERR_OTHER, "**putenv");

    ret = mx_init();
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_init", "**mx_init %s", mx_strerror(ret));

    mx_set_error_handler(MX_ERRORS_RETURN);

    /*
    ret = mx_get_info(NULL, MX_NIC_COUNT, NULL, 0, &nic_count, sizeof(int));
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_get_info", "**mx_get_info %s", mx_strerror(ret));

    count = ++nic_count;
    mx_nics = (uint64_t *)MPIU_Malloc(count*sizeof(uint64_t));
    ret = mx_get_info(NULL, MX_NIC_IDS, NULL, 0, mx_nics, count*sizeof(uint64_t));
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_get_info", "**mx_get_info %s", mx_strerror(ret));

    do {
        ret = mx_nic_id_to_board_number(mx_nics[index], &mx_board_num);
        index++;
    } while(ret != MX_SUCCESS);
    */

#ifndef USE_CTXT_AS_MARK
    param.key = MX_PARAM_CONTEXT_ID;
    param.val.context_id.bits  = NEM_MX_MATCHING_BITS - SHIFT_TYPE;
    param.val.context_id.shift = SHIFT_TYPE;
    ret = mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, MPID_NEM_MX_FILTER,
                           &param, 1, &MPID_nem_mx_local_endpoint);
#else
    ret = mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, MPID_NEM_MX_FILTER,
                           NULL, 0, &MPID_nem_mx_local_endpoint);
#endif
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_open_endpoint", "**mx_open_endpoint %s", mx_strerror(ret));

    ret = mx_get_endpoint_addr(MPID_nem_mx_local_endpoint, &local_endpoint_addr);
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_get_endpoint_addr", "**mx_get_endpoint_addr %s", mx_strerror(ret));

    ret = mx_decompose_endpoint_addr(local_endpoint_addr,
                                     &MPID_nem_mx_local_nic_id,
                                     &MPID_nem_mx_local_endpoint_id);
    MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                         "**mx_decompose_endpoint_addr", "**mx_decompose_endpoint_addr %s", mx_strerror(ret));

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int mca_btl_mx_register( struct mca_btl_base_module_t* btl,
                         mca_btl_base_tag_t tag,
                         mca_btl_base_module_recv_cb_fn_t cbfunc,
                         void* cbdata )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*) btl;

#if 0
    if( (NULL != cbfunc) && ( 0 == mca_btl_mx_component.mx_use_unexpected) ) {
#endif
    if( NULL != cbfunc ) {
        mca_btl_mx_frag_t* frag;
        mx_return_t mx_return;
        mx_segment_t mx_segment;
        int i, rc;

        /* Post the receives if there is no unexpected handler */
        for( i = 0; i < mca_btl_mx_component.mx_max_posted_recv; i++ ) {
            MCA_BTL_MX_FRAG_ALLOC_EAGER( mx_btl, frag, rc );
            if( NULL == frag ) {
                opal_output( 0, "mca_btl_mx_register: unable to allocate more eager fragments\n" );
                if( 0 == i ) {
                    return OMPI_ERROR;
                }
                break;  /* some fragments are already registered. Try to continue... */
            }
            frag->base.des_dst     = frag->segment;
            frag->base.des_dst_cnt = 1;
            frag->base.des_src     = NULL;
            frag->base.des_src_cnt = 0;
            frag->mx_frag_list     = NULL;
            frag->type             = MCA_BTL_MX_RECV;

            mx_segment.segment_ptr    = (void*)(frag+1);
            mx_segment.segment_length = mx_btl->super.btl_eager_limit;

            mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1,
                                  0x01ULL, BTL_MX_RECV_MASK,
                                  frag, &(frag->mx_request) );
            if( MX_SUCCESS != mx_return ) {
                opal_output( 0, "mca_btl_mx_register: mx_irecv failed with status %d (%s)\n",
                             mx_return, mx_strerror(mx_return) );
                MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
                return OMPI_ERROR;
            }
        }
    }

    return OMPI_SUCCESS;
}

/**
 * Allocate a segment.
 *
 * @param btl (IN)  BTL module
 * @param size (IN) Request segment size.
 */
mca_btl_base_descriptor_t* mca_btl_mx_alloc( struct mca_btl_base_module_t* btl,
                                             struct mca_btl_base_endpoint_t* endpoint,
                                             uint8_t order,
                                             size_t size,
                                             uint32_t flags)
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*) btl;
    mca_btl_mx_frag_t* frag;
    int rc;

    MCA_BTL_MX_FRAG_ALLOC_EAGER(mx_btl, frag, rc);
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    frag->segment[0].seg_len = size <= mx_btl->super.btl_eager_limit ?
                               size : mx_btl->super.btl_eager_limit;
    frag->segment[0].seg_addr.pval = (void*)(frag+1);

    frag->base.des_src     = frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_flags   = flags;
    frag->base.order       = MCA_BTL_NO_ORDER;

    return (mca_btl_base_descriptor_t*)frag;
}
int ompi_mtl_mx_module_init()
{
    mx_param_t mx_param;
    mx_return_t mx_return;
    int32_t nic, ep;

    /* setup params */
    mx_param.key                 = MX_PARAM_UNEXP_QUEUE_MAX;
    mx_param.val.unexp_queue_max = ompi_mtl_mx.mx_unexp_queue_max;

    /* get a local endpoint */
    nic = ompi_mtl_mx.mx_board_num;
    if (nic < 0) {
        nic = MX_ANY_NIC;
    }
    ep = ompi_mtl_mx.mx_endpoint_num;
    if (ep < 0) {
        ep = MX_ANY_ENDPOINT;
    }
    mx_return = mx_open_endpoint(nic,
                                 ep,
                                 ompi_mtl_mx.mx_filter,
                                 NULL,
                                 0,
                                 &ompi_mtl_mx.mx_endpoint);
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in mx_open_endpoint (error %s)\n", mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    /* get the endpoint address */
    mx_return = mx_get_endpoint_addr( ompi_mtl_mx.mx_endpoint,
                                      &ompi_mtl_mx.mx_endpoint_addr);
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in mx_get_endpoint_addr (error %s)\n", mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    mx_return = mx_decompose_endpoint_addr( ompi_mtl_mx.mx_endpoint_addr,
                                            &(ompi_mtl_mx.mx_addr.nic_id),
                                            &(ompi_mtl_mx.mx_addr.endpoint_id) );
    if(mx_return != MX_SUCCESS) {
        opal_output(ompi_mtl_base_framework.framework_output,
                    "Error in mx_decompose_endpoint_addr (error %s)\n", mx_strerror(mx_return));
        return OMPI_ERROR;
    }

    opal_output_verbose(10, ompi_mtl_base_framework.framework_output,
                        "mtl:mx: local nic %d, endpoint %d, got nic %d, ep %d\n",
                        nic, ep,
                        (int)ompi_mtl_mx.mx_addr.nic_id,
                        ompi_mtl_mx.mx_addr.endpoint_id);

    ompi_modex_send( &mca_mtl_mx_component.super.mtl_version,
                     &ompi_mtl_mx.mx_addr,
                     sizeof(mca_mtl_mx_addr_t));

    /* register the mtl mx progress function */
    opal_progress_register(ompi_mtl_mx_progress);

    return OMPI_SUCCESS;
}
int MPID_nem_mx_cancel_send(MPIDI_VC_t *vc, MPID_Request *sreq)
{
    mx_request_t *mx_request = NULL;
    mx_return_t ret;
    uint32_t result;
    int mpi_errno = MPI_SUCCESS;
    int handled = FALSE;

    if (!VC_CH(vc)->is_local) {
        mx_request = &(REQ_FIELD(sreq,mx_request));
        ret = mx_cancel(MPID_nem_mx_local_endpoint, mx_request, &result);
        MPIU_ERR_CHKANDJUMP1(ret != MX_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                             "**mx_cancel", "**mx_cancel %s", mx_strerror(ret));

        if (result) {
            sreq->status.cancelled = TRUE;
            sreq->cc = 0;
            MPIU_Object_set_ref(sreq, 1);
            MPID_nem_mx_pending_send_req--;
        }
        else {
            sreq->status.cancelled = FALSE;
        }
        handled = TRUE;
    }

 fn_exit:
    return handled;
 fn_fail:
    goto fn_exit;
}