/**
 * Prepare a send-side descriptor for the Portals BTL.
 *
 * Two paths:
 *  - If header space must be reserved or the convertor's data is
 *    non-contiguous, allocate a send frag and pack the user data into it
 *    behind the reserved header bytes.
 *  - Otherwise set up a zero-copy RDMA source: reserve an event-queue slot,
 *    grab a user frag, and publish the user buffer under a freshly generated
 *    64-bit match key via PtlMEAttach/PtlMDAttach so the peer can put/get
 *    directly into/out of it.
 *
 * @param btl_base     (IN)     BTL module (must be the single portals module)
 * @param peer         (IN)     destination endpoint, used as the ME match peer
 * @param registration (IN)     unused by this BTL
 * @param convertor    (IN/OUT) convertor describing the user buffer
 * @param reserve      (IN)     bytes of header space to leave before payload
 * @param size         (IN/OUT) requested payload size in; actual size out
 * @return descriptor on success, NULL on failure (all resources released)
 */
mca_btl_base_descriptor_t*
mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base,
                            struct mca_btl_base_endpoint_t* peer,
                            mca_mpool_base_registration_t* registration,
                            struct ompi_convertor_t* convertor,
                            size_t reserve,
                            size_t* size)
{
    mca_btl_portals_frag_t* frag;
    size_t max_data = *size;
    struct iovec iov;
    uint32_t iov_count = 1;
    int ret;

    /* there is only ever one portals module instance */
    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    if (0 != reserve || 0 != ompi_convertor_need_buffers(convertor)) {
        /* send path: pack (copy) the data into a pre-allocated frag */
        frag = (mca_btl_portals_frag_t*)
            mca_btl_portals_alloc(btl_base, max_data + reserve);
        if (NULL == frag) {
            return NULL;
        }

        if (max_data + reserve > frag->size) {
            /* clamp payload to the frag's capacity */
            max_data = frag->size - reserve;
        }

        iov.iov_len = max_data;
        iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;

        ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
        *size = max_data;
        if (ret < 0) {
            /* FIX: return the frag to the module instead of leaking it
             * (every other error path in this function releases its frag) */
            mca_btl_portals_free(btl_base, &frag->base);
            return NULL;
        }

        frag->segments[0].seg_len = max_data + reserve;
        frag->base.des_src_cnt = 1;
    } else {
        /* no need to pack - rdma operation out of user's buffer */
        ptl_md_t md;
        ptl_handle_me_t me_h;

        /* reserve space in the event queue for rdma operations immediately */
        while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
               mca_btl_portals_module.portals_max_outstanding_ops) {
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            mca_btl_portals_component_progress();
        }

        OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
        if (NULL == frag) {
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }

        /* a NULL iov_base asks the convertor for a pointer into the
         * user's (contiguous) buffer rather than copying */
        iov.iov_len = max_data;
        iov.iov_base = NULL;
        ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);

        frag->segments[0].seg_len = max_data;
        frag->segments[0].seg_addr.pval = iov.iov_base;
        /* unique match bits so the peer addresses exactly this buffer */
        frag->segments[0].seg_key.key64 =
            OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
        frag->base.des_src_cnt = 1;

        /* either a put or get.  figure out which later */
        OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
                             "rdma src posted for frag 0x%x, callback 0x%x, bits %lld",
                             frag, frag->base.des_cbfunc,
                             frag->segments[0].seg_key.key64));

        /* create a match entry */
        ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
                          OMPI_BTL_PORTALS_RDMA_TABLE_ID,
                          *((mca_btl_base_endpoint_t*) peer),
                          frag->segments[0].seg_key.key64, /* match */
                          0,                               /* ignore */
                          PTL_UNLINK, PTL_INS_AFTER, &me_h);
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src ME: %d", ret);
            OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }

        /* setup the memory descriptor */
        md.start = frag->segments[0].seg_addr.pval;
        md.length = frag->segments[0].seg_len;
        md.threshold = PTL_MD_THRESH_INF;
        md.max_size = 0;
        md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
        md.user_ptr = frag;  /* keep a pointer to ourselves */
        md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];

        ret = PtlMDAttach(me_h, md, PTL_UNLINK, &(frag->md_h));
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src MD: %d", ret);
            PtlMEUnlink(me_h);
            OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }
    }

    frag->base.des_src = frag->segments;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;
    frag->base.des_flags = 0;
    return &frag->base;
}
/**
 * Prepare data for send/put over the self (loopback) BTL.
 *
 * Small, non-contiguous, or header-carrying messages are packed into a
 * freshly allocated send frag; large contiguous messages with no header
 * reservation are described in place (the convertor hands back a pointer
 * into the caller's buffer) using an RDMA frag.  The convertor pointer is
 * stashed in the segment key for the receive side of the loopback.
 *
 * @param btl       (IN)     BTL module
 * @param endpoint  (IN)     peer endpoint (unused for self)
 * @param registration (IN)  unused
 * @param convertor (IN/OUT) convertor describing the user data
 * @param reserve   (IN)     header bytes to reserve before the payload
 * @param size      (IN/OUT) requested payload size in; actual size out
 * @return descriptor on success, NULL on allocation/pack failure
 */
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
                          struct mca_btl_base_endpoint_t* endpoint,
                          mca_mpool_base_registration_t* registration,
                          struct ompi_convertor_t* convertor,
                          size_t reserve,
                          size_t* size )
{
    mca_btl_self_frag_t* frag;
    struct iovec payload;
    uint32_t payload_count = 1;
    size_t length = *size;
    int rc;

    const bool must_copy = ompi_convertor_need_buffers(convertor) ||
                           length < mca_btl_self.btl_max_send_size ||
                           reserve != 0;

    if( must_copy ) {
        /* non-contigous data: pack behind the reserved header space */
        MCA_BTL_SELF_FRAG_ALLOC_SEND(frag, rc);
        if( NULL == frag ) {
            return NULL;
        }

        /* never pack more than fits after the header */
        if( reserve + length > frag->size ) {
            length = frag->size - reserve;
        }

        payload.iov_len  = length;
        payload.iov_base = (IOVBASE_TYPE*)((unsigned char*)(frag + 1) + reserve);
        rc = ompi_convertor_pack(convertor, &payload, &payload_count, &length);
        if( rc < 0 ) {
            MCA_BTL_SELF_FRAG_RETURN_SEND(frag);
            return NULL;
        }

        frag->base.des_flags       = 0;
        frag->segment.seg_addr.pval = frag + 1;
        frag->segment.seg_len       = reserve + length;
        *size = length;
    } else {
        /* contiguous, no header: describe the user buffer in place */
        MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag, rc);
        if( NULL == frag ) {
            return NULL;
        }

        /* convertor should return offset into users buffer */
        payload.iov_len  = length;
        payload.iov_base = NULL;
        rc = ompi_convertor_pack(convertor, &payload, &payload_count, &length);
        if( rc < 0 ) {
            MCA_BTL_SELF_FRAG_RETURN_RDMA(frag);
            return NULL;
        }

        frag->segment.seg_addr.pval = payload.iov_base;
        frag->segment.seg_len       = length;
        frag->base.des_flags        = 0;
        *size = length;
    }

    frag->base.des_src     = &frag->segment;
    frag->base.des_src_cnt = 1;
    /* receive side recovers the convertor from the segment key */
    frag->segment.seg_key.key64 = (uint64_t)(intptr_t)convertor;
    return &frag->base;
}
/**
 * Initiate an inline send to the peer.  If the message cannot be sent
 * inline, allocate and return a descriptor for the caller to fill and
 * re-submit.
 *
 * @param btl          (IN)  BTL module
 * @param endpoint     (IN)  BTL peer addressing
 * @param convertor    (IN/OUT) data to send
 * @param header       (IN)  protocol header to prepend
 * @param header_size  (IN)  header length in bytes
 * @param payload_size (IN)  payload length in bytes
 * @param order/flags/tag (IN) BTL send parameters
 * @param descriptor   (OUT) set to a fresh descriptor when the inline
 *                           path is not taken
 * @return OMPI_SUCCESS when the data was handed to MX,
 *         OMPI_ERR_RESOURCE_BUSY when the caller must use *descriptor,
 *         OMPI_ERROR / OMPI_ERR_OUT_OF_RESOURCE on connection problems.
 */
static int mca_btl_mx_sendi( struct mca_btl_base_module_t* btl,
                             struct mca_btl_base_endpoint_t* endpoint,
                             struct ompi_convertor_t* convertor,
                             void* header,
                             size_t header_size,
                             size_t payload_size,
                             uint8_t order,
                             uint32_t flags,
                             mca_btl_base_tag_t tag,
                             mca_btl_base_descriptor_t** descriptor )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*) btl;
    size_t max_data;

    /* make sure the endpoint is connected before touching MX */
    if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( MCA_BTL_MX_CONNECTION_PENDING == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERR_OUT_OF_RESOURCE;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }

    if( !ompi_convertor_need_buffers(convertor) ) {
        /* contiguous user data: send header + payload directly from
         * the user's buffer as (up to) two MX segments */
        uint32_t mx_segment_count = 0;
        uint64_t tag64 = 0x01ULL | (((uint64_t)tag) << 8);
        mx_return_t mx_return;
        mx_request_t mx_request;
        mx_segment_t mx_segments[2], *mx_segment = mx_segments;

        if( 0 != header_size ) {
            mx_segment->segment_ptr    = header;
            mx_segment->segment_length = header_size;
            mx_segment++;
            mx_segment_count++;
        }
        if( 0 != payload_size ) {
            struct iovec iov;
            uint32_t iov_count = 1;

            /* NULL iov_base: convertor returns a pointer into the user buffer */
            iov.iov_base = NULL;
            iov.iov_len  = payload_size;
            (void)ompi_convertor_pack( convertor, &iov, &iov_count, &max_data );
            assert( max_data == payload_size );
            mx_segment->segment_ptr    = iov.iov_base;
            mx_segment->segment_length = max_data;
            mx_segment_count++;
        }

        mx_return = mx_isend( mx_btl->mx_endpoint, mx_segments, mx_segment_count,
                              endpoint->mx_peer_addr, tag64, NULL, &mx_request );
        if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
            opal_output( 0, "mx_isend fails with error %s\n",
                         mx_strerror(mx_return) );
            return OMPI_ERROR;
        }
#ifdef HAVE_MX_FORGET
        {
            uint32_t mx_result;
            mx_return = mx_ibuffered( mx_btl->mx_endpoint, &mx_request,
                                      &mx_result );
            if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
                opal_output( 0, "mx_ibuffered failed with error %d (%s)\n",
                             mx_return, mx_strerror(mx_return) );
                /* the data is already on the wire; we just cannot forget
                 * the request, so still report success */
                return OMPI_SUCCESS;
            }
            if( mx_result ) {
                mx_return = mx_forget( mx_btl->mx_endpoint, &mx_request );
                if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
                    opal_output( 0, "mx_forget failed with error %d (%s)\n",
                                 mx_return, mx_strerror(mx_return) );
                }
            }
        }
#endif
        /* FIX: the message has been handed to MX, so report success here.
         * Previously, when HAVE_MX_FORGET was not defined, control fell
         * through to the descriptor path below and returned
         * OMPI_ERR_RESOURCE_BUSY, causing the PML to send the same data a
         * second time. */
        return OMPI_SUCCESS;
    }

    /* No optimization on this path. Just allocate a descriptor and return it
     * to the user. */
    *descriptor = mca_btl_mx_alloc( btl, endpoint, order,
                                    header_size + payload_size, flags );
    return OMPI_ERR_RESOURCE_BUSY;
}
/**
 * Register user buffer or pack data into a pre-registered buffer and return
 * a descriptor that can be used for send/put.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL peer addressing
 *
 * prepare_src's behavior depends on the following:
 * Has a valid memory registration been passed to prepare_src?
 *   If so we attempt to use the pre-registered user buffer; if the memory
 *   registration is too small (only a portion of the user buffer) then we
 *   must re-register the user buffer.
 * Is the requested size larger than the btl's max send size?
 *   If so, and the user's buffer is contiguous, we register the memory and
 *   send directly from it.
 * Otherwise we choose from two free lists of pre-registered memory (eager
 * and max) in which to pack the data.
 */
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( struct mca_btl_base_module_t* btl,
                                                       struct mca_btl_base_endpoint_t* endpoint,
                                                       mca_mpool_base_registration_t* registration,
                                                       struct ompi_convertor_t* convertor,
                                                       size_t reserve,
                                                       size_t* size )
{
    mca_btl_openib_module_t *openib_btl;
    mca_btl_openib_frag_t *frag = NULL;   /* NULL matters: tested below after eager alloc */
    mca_btl_openib_reg_t *openib_reg;
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = *size;
    int rc;

    openib_btl = (mca_btl_openib_module_t*)btl;

    if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
        /* contiguous user data and no header to prepend: try zero-copy */
        if(registration != NULL || max_data > btl->btl_max_send_size) {
            MCA_BTL_IB_FRAG_ALLOC_SEND_FRAG(btl, frag, rc);
            if(NULL == frag) {
                return NULL;
            }

            /* NULL iov_base: convertor hands back a pointer into the
             * user's buffer instead of copying */
            iov.iov_len = max_data;
            iov.iov_base = NULL;
            ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
            *size = max_data;

            if(NULL == registration) {
                /* FIX: "&registration" had been mangled to "®istration"
                 * (HTML-entity corruption) which does not compile */
                rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                                                    iov.iov_base, max_data, 0,
                                                    &registration);
                if(OMPI_SUCCESS != rc || NULL == registration) {
                    MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
                    return NULL;
                }
                /* keep track of the registration we did */
                frag->registration = (mca_btl_openib_reg_t*)registration;
            }
            openib_reg = (mca_btl_openib_reg_t*)registration;

            frag->base.des_src = &frag->segment;
            frag->base.des_src_cnt = 1;
            frag->base.des_dst = NULL;
            frag->base.des_dst_cnt = 0;
            frag->base.des_flags = 0;   /* was redundantly assigned twice */
            frag->sg_entry.length = max_data;
            frag->sg_entry.lkey = openib_reg->mr->lkey;
            frag->sg_entry.addr = (unsigned long)iov.iov_base;
            frag->segment.seg_len = max_data;
            frag->segment.seg_addr.pval = iov.iov_base;
            frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;

            BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
                         "frag->segment.seg_key.key32[0] = %lu",
                         frag->sg_entry.lkey, frag->sg_entry.addr,
                         frag->segment.seg_key.key32[0]));
            return &frag->base;
        }
    }

    if(max_data + reserve <= btl->btl_eager_limit) {
        /* the data is small enough to fit in the eager frag and
         * memory is not prepinned */
        MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
    }

    if(NULL == frag) {
        /* the data doesn't fit into eager frag or eager frag is
         * not available */
        MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
        if(NULL == frag) {
            return NULL;
        }
        if(max_data + reserve > btl->btl_max_send_size) {
            max_data = btl->btl_max_send_size - reserve;
        }
    }

    /* copy (pack) the user data behind the reserved header bytes */
    iov.iov_len = max_data;
    iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
    rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
    if(rc < 0) {
        MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
        return NULL;
    }
    *size = max_data;

    frag->segment.seg_len = max_data + reserve;
    frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
    frag->base.des_src = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;
    frag->base.des_flags = 0;

    return &frag->base;
}
/** * Pack data and return a descriptor that can be * used for send/put. * * @param btl (IN) BTL module * @param peer (IN) BTL peer addressing */ mca_btl_base_descriptor_t* mca_btl_mx_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, struct mca_mpool_base_registration_t* registration, struct ompi_convertor_t* convertor, uint8_t order, size_t reserve, size_t* size, uint32_t flags) { mca_btl_mx_frag_t* frag; struct iovec iov; uint32_t iov_count = 1; size_t max_data; int rc; max_data = btl->btl_eager_limit - reserve; if( (*size) < max_data ) { max_data = *size; } /* If the data is contiguous we can use directly the pointer * to the user memory. */ if( 0 == ompi_convertor_need_buffers(convertor) ) { /** * let the convertor figure out the correct pointer depending * on the data layout */ iov.iov_base = NULL; if( 0 == reserve ) { MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc); if( OPAL_UNLIKELY(NULL == frag) ) { return NULL; } max_data = *size; frag->base.des_src_cnt = 1; } else { MCA_BTL_MX_FRAG_ALLOC_EAGER( mx_btl, frag, rc ); if( OPAL_UNLIKELY(NULL == frag) ) { return NULL; } frag->base.des_src_cnt = 2; } } else { MCA_BTL_MX_FRAG_ALLOC_EAGER( mx_btl, frag, rc ); if( OPAL_UNLIKELY(NULL == frag) ) { return NULL; } frag->base.des_src_cnt = 1; iov.iov_base = (void*)((unsigned char*)frag->segment[0].seg_addr.pval + reserve); } iov.iov_len = max_data; (void)ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); *size = max_data; if( 1 == frag->base.des_src_cnt ) { frag->segment[0].seg_len = reserve + max_data; if( 0 == reserve ) frag->segment[0].seg_addr.pval = iov.iov_base; } else { frag->segment[0].seg_len = reserve; frag->segment[1].seg_len = max_data; frag->segment[1].seg_addr.pval = iov.iov_base; } frag->base.des_src = frag->segment; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; return &frag->base; }