/*
 * Send the eager RDMA connect control message to the remote endpoint.
 *
 * Allocates an eager fragment, fills in an eager-RDMA control header
 * describing the local RDMA region (rkey and base address), and posts
 * it as a high-priority send on the endpoint's connection.
 *
 * @param endpoint (IN) peer endpoint to notify
 * @return 0 on success, -1 on allocation or send failure
 */
static int mca_btl_mvapi_endpoint_send_eager_rdma(
        mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
    mca_btl_mvapi_eager_rdma_header_t *rdma_hdr;
    mca_btl_mvapi_frag_t* frag;
    int rc;

    MCA_BTL_IB_FRAG_ALLOC_EAGER(mvapi_btl, frag, rc);
    if(NULL == frag) {
        BTL_ERROR(("error allocating fragment"));
        return -1;
    }

    frag->base.des_cbfunc = mca_btl_mvapi_endpoint_eager_rdma;
    frag->base.des_cbdata = NULL;
    frag->endpoint = endpoint;
    frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    frag->hdr->tag = MCA_BTL_TAG_BTL;

    /* describe the local eager RDMA region so the peer can RDMA into it */
    rdma_hdr = (mca_btl_mvapi_eager_rdma_header_t*)frag->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_MVAPI_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->r_key;
    rdma_hdr->rdma_start.lval =
        ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    frag->segment.seg_len = sizeof(mca_btl_mvapi_eager_rdma_header_t);

    if (mca_btl_mvapi_endpoint_post_send(mvapi_btl, endpoint, frag) !=
            OMPI_SUCCESS) {
        MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
        /* fixed: the format string was missing the %s conversion for the
         * strerror() argument (undefined behavior in printf-style macros) */
        BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
        return -1;
    }
    return 0;
}
/* local callback function for completion of a failover control message */
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
        struct mca_btl_base_endpoint_t* endpoint,
        struct mca_btl_base_descriptor_t* descriptor,
        int status)
{
    /* Nothing to unwind here: simply hand the control fragment back to
     * its free list.  btl, endpoint and status are intentionally unused. */
    MCA_BTL_IB_FRAG_RETURN(descriptor);
}
/**
 * Prepare the dst buffer for use as an RDMA target.
 *
 * @param btl          (IN)     BTL module
 * @param endpoint     (IN)     BTL peer addressing
 * @param registration (IN)     pre-existing registration of the user
 *                              buffer, may be NULL
 * @param convertor    (IN)     datatype convertor describing the buffer
 * @param reserve      (IN)     unused for dst preparation
 * @param size         (IN/OUT) requested/actual length of the region
 *
 * If no registration was passed in, the user buffer is registered here
 * and the registration is recorded on the fragment so that
 * mca_btl_openib_free() can release it later.
 */
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    size_t reserve,
    size_t* size)
{
    mca_btl_openib_module_t *openib_btl;
    mca_btl_openib_frag_t *frag;
    mca_btl_openib_reg_t *openib_reg;
    int rc;
    ptrdiff_t lb;

    openib_btl = (mca_btl_openib_module_t*)btl;

    MCA_BTL_IB_FRAG_ALLOC_RECV_FRAG(btl, frag, rc);
    if(NULL == frag) {
        return NULL;
    }

    /* point the segment at the (possibly partially converted) user buffer */
    ompi_ddt_type_lb(convertor->pDesc, &lb);
    frag->segment.seg_addr.pval =
        convertor->pBaseBuf + lb + convertor->bConverted;

    if(NULL == registration) {
        /* we didn't get a memory registration passed in, so we have to
         * register the region ourselves.
         * fixed: "&registration" had been corrupted to the HTML entity
         * "&reg;" in the original, which does not compile */
        rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                frag->segment.seg_addr.pval, *size, 0, &registration);
        if(OMPI_SUCCESS != rc || NULL == registration) {
            MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
            return NULL;
        }
        /* keep track of the registration we did */
        frag->registration = (mca_btl_openib_reg_t*)registration;
    }
    openib_reg = (mca_btl_openib_reg_t*)registration;

    /* scatter-gather entry for the local HCA */
    frag->sg_entry.length = *size;
    frag->sg_entry.lkey = openib_reg->mr->lkey;
    frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval;

    /* remote key travels in the segment so the peer can RDMA here */
    frag->segment.seg_len = *size;
    frag->segment.seg_key.key32[0] = openib_reg->mr->rkey;

    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;
    frag->base.des_flags = 0;

    BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
                 "frag->segment.seg_key.key32[0] = %lu",
                 frag->sg_entry.lkey, frag->sg_entry.addr,
                 frag->segment.seg_key.key32[0]));
    return &frag->base;
}
/* local callback: the eager RDMA connect message completed, return the
 * fragment to the mvapi BTL's free list.  endpoint and status are unused. */
static void mca_btl_mvapi_endpoint_eager_rdma(
        mca_btl_base_module_t* btl,
        struct mca_btl_base_endpoint_t* endpoint,
        struct mca_btl_base_descriptor_t* descriptor,
        int status)
{
    MCA_BTL_IB_FRAG_RETURN(((mca_btl_mvapi_module_t*)btl),
            ((mca_btl_mvapi_frag_t*)descriptor));
}
/* send the eager rdma connect message to the remote endpoint */
static int mca_btl_openib_endpoint_send_eager_rdma(
        mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    int rc;

    frag = alloc_control_frag(openib_btl);
    if(NULL == frag) {
        return -1;
    }

    /* completion callback frees the fragment and updates the device's
     * count of endpoints still lacking eager RDMA */
    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |=
        MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;
    frag->hdr->tag = MCA_BTL_TAG_IB;

    /* describe the local eager RDMA region (rkey + base address) */
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval =
        opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 rdma_hdr->rkey, rdma_hdr->rdma_start.lval,
                 rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival,
                 rdma_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t) ));
    /* convert the header to network byte order for heterogeneous peers */
    if(endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     rdma_hdr->rkey, rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival ));
    }
    rc = mca_btl_openib_endpoint_send(endpoint, frag);
    /* BUSY means the fragment was queued for later delivery - the send
     * path now owns it, so treat this as success */
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
        return OPAL_SUCCESS;

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
    return rc;
}
/* local callback function for completion of eager rdma connect */
static void mca_btl_openib_endpoint_eager_rdma_connect_cb(
        mca_btl_base_module_t* btl,
        struct mca_btl_base_endpoint_t* endpoint,
        struct mca_btl_base_descriptor_t* descriptor,
        int status)
{
    mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
    /* one fewer endpoint on this device is still waiting for its eager
     * RDMA path to come up; the counter must never go negative */
    OPAL_THREAD_ADD32(&device->non_eager_rdma_endpoints, -1);
    assert(device->non_eager_rdma_endpoints >= 0);
    MCA_BTL_IB_FRAG_RETURN(descriptor);
}
/*
 * Return receive credits for the high-priority queue pair to the peer.
 *
 * Allocates an eager fragment carrying a NOOP control header whose BTL
 * header advertises the receive credits (rd_credits_hp) and eager RDMA
 * credits being returned, then posts it (inline when small enough) on
 * the high-priority QP.
 *
 * @param endpoint (IN) peer endpoint to return credits to
 */
void mca_btl_mvapi_endpoint_send_credits_hp(
        mca_btl_mvapi_endpoint_t* endpoint)
{
    mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
    mca_btl_mvapi_frag_t* frag;
    int ret;

    MCA_BTL_IB_FRAG_ALLOC_EAGER(mvapi_btl, frag, ret);
    if(NULL == frag) {
        BTL_ERROR(("error allocating fragment"));
        return;
    }

    frag->base.des_cbfunc = mca_btl_mvapi_endpoint_credits_hp;
    frag->base.des_cbdata = NULL;
    frag->endpoint = endpoint;
    frag->hdr->tag = MCA_BTL_TAG_BTL;

    /* claim the credits being returned; a non-positive balance stays local */
    frag->hdr->credits =
        (endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
    OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
    frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            -frag->hdr->rdma_credits);

    ((mca_btl_mvapi_control_header_t *)frag->segment.seg_addr.pval)->type =
        MCA_BTL_MVAPI_CONTROL_NOOP;

    frag->desc.sr_desc.opcode = VAPI_SEND;
    frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
    frag->sg_entry.len = sizeof(mca_btl_mvapi_header_t) +
        sizeof(mca_btl_mvapi_control_header_t);

    if(sizeof(mca_btl_mvapi_header_t) <= mvapi_btl->ib_inline_max) {
        ret = EVAPI_post_inline_sr(mvapi_btl->nic,
                endpoint->lcl_qp_hndl_hp, &frag->desc.sr_desc);
    } else {
        ret = VAPI_post_sr(mvapi_btl->nic,
                endpoint->lcl_qp_hndl_hp, &frag->desc.sr_desc);
    }

    if(ret != VAPI_SUCCESS) {
        /* fixed: the rollback adjusted the *low* priority counters
         * (sd_credits_lp / rd_credits_lp) in this high-priority path,
         * and never restored the eager RDMA credits claimed above */
        OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, -1);
        OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, frag->hdr->credits);
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                frag->hdr->rdma_credits);
        MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
        /* fixed: the format string had %d and %s conversions but only
         * strerror() was supplied (undefined behavior) */
        BTL_ERROR(("error posting send request errno %d says %s",
                errno, strerror(errno)));
        return;
    }
}
/**
 * Return a descriptor to its free list.
 *
 * If the fragment carries a registration created on its behalf by
 * prepare_src/prepare_dst (send/recv "user" fragments), that
 * registration is released before the fragment is returned.
 */
int mca_btl_openib_free(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_descriptor_t* des)
{
    mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des;
    int is_user_frag = (MCA_BTL_OPENIB_SEND_FRAG_FRAG == frag->type) ||
                       (MCA_BTL_OPENIB_RECV_FRAG_FRAG == frag->type);

    if(is_user_frag && frag->registration != NULL) {
        btl->btl_mpool->mpool_deregister(btl->btl_mpool,
                (mca_mpool_base_registration_t*)frag->registration);
        frag->registration = NULL;
    }

    MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*) btl), frag);
    return OMPI_SUCCESS;
}
/* local callback: a low-priority credit message completed.
 *
 * Undoes the resources the credit message consumed and, if more credits
 * accumulated while this message was in flight, may trigger another
 * credit send. */
static void mca_btl_mvapi_endpoint_credits_lp(
        mca_btl_base_module_t* btl,
        struct mca_btl_base_endpoint_t* endpoint,
        struct mca_btl_base_descriptor_t* descriptor,
        int status)
{
    int32_t credits;

    /* we don't acquire a wqe or token for credit message - so decrement */
    OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1);
    /* check to see if there are additional credits to return */
    if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-1)) > 0) {
        /* drop the outstanding count back to zero, then re-acquire the
         * single credit-send token iff the window threshold is reached */
        OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-credits);
        if (endpoint->rd_credits_lp >= mca_btl_mvapi_component.rd_win &&
                OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,1) == 1) {
            mca_btl_mvapi_endpoint_send_credits_lp(endpoint);
        }
    }
    MCA_BTL_IB_FRAG_RETURN(((mca_btl_mvapi_module_t*)btl),
            ((mca_btl_mvapi_frag_t*)descriptor));
}
/* Initiate an RDMA read of `size` bytes from remote_address into
 * local_address.  On failure due to resource exhaustion the fragment is
 * queued on the endpoint for later retry; hard failures return an error
 * after freeing the fragment.  `order` selects the QP (MCA_BTL_NO_ORDER
 * falls back to the component's dedicated RDMA QP). */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
        void *local_address, uint64_t remote_address,
        mca_btl_base_registration_handle_t *local_handle,
        mca_btl_base_registration_handle_t *remote_handle, size_t size,
        int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
        void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.rdma.remote_addr = remote_address;

    /* the opcode may have been changed by an atomic operation */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* byte-swap the rkey when the peer's endianness differs from ours */
    if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
            != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    /* XRC QPs additionally need the remote shared receive queue number */
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        /* endpoint not yet connected: either queue the fragment on the
         * pending list (BUSY -> success, it will be sent later) or fail */
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base,
                &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            return OPAL_SUCCESS;
        }
        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            /* out of send resources: queue for retry, report success */
            rc = OPAL_SUCCESS;

            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
/**
 * Register a user buffer or pack data into a pre-registered buffer and
 * return a descriptor that can be used for send/put.
 *
 * @param btl          (IN)     BTL module
 * @param endpoint     (IN)     BTL peer addressing
 * @param registration (IN)     existing registration of the user buffer,
 *                              may be NULL
 * @param convertor    (IN)     datatype convertor describing the data
 * @param reserve      (IN)     header space to reserve in the segment
 * @param size         (IN/OUT) requested/actual payload length
 *
 * prepare source's behavior depends on the following:
 *  - If the convertor needs no buffering, no header space is requested,
 *    and either a registration was supplied or the data exceeds the max
 *    send size, the user buffer is sent in place (registered here when
 *    necessary; the registration is recorded on the fragment so that
 *    mca_btl_openib_free() can release it later).
 *  - Otherwise the data is packed into an eager fragment when it fits
 *    within the eager limit, or into a max-size fragment (truncating
 *    *size to what fits after the reserve) when it does not.
 */
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    size_t reserve,
    size_t* size
)
{
    mca_btl_openib_module_t *openib_btl;
    mca_btl_openib_frag_t *frag = NULL;
    mca_btl_openib_reg_t *openib_reg;
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = *size;
    int rc;

    openib_btl = (mca_btl_openib_module_t*)btl;

    if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
        if(registration != NULL || max_data > btl->btl_max_send_size) {
            /* send the user buffer in place, no copy */
            MCA_BTL_IB_FRAG_ALLOC_SEND_FRAG(btl, frag, rc);
            if(NULL == frag) {
                return NULL;
            }

            /* contiguous data: pack only reports the base address */
            iov.iov_len = max_data;
            iov.iov_base = NULL;
            ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
            *size = max_data;

            if(NULL == registration) {
                /* fixed: "&registration" had been corrupted to the HTML
                 * entity "&reg;" in the original, which does not compile */
                rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                        iov.iov_base, max_data, 0, &registration);
                if(OMPI_SUCCESS != rc || NULL == registration) {
                    MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
                    return NULL;
                }
                /* keep track of the registration we did */
                frag->registration = (mca_btl_openib_reg_t*)registration;
            }
            openib_reg = (mca_btl_openib_reg_t*)registration;

            /* note: des_flags was redundantly assigned twice in the
             * original; assign it once */
            frag->base.des_flags = 0;
            frag->base.des_src = &frag->segment;
            frag->base.des_src_cnt = 1;
            frag->base.des_dst = NULL;
            frag->base.des_dst_cnt = 0;

            frag->sg_entry.length = max_data;
            frag->sg_entry.lkey = openib_reg->mr->lkey;
            frag->sg_entry.addr = (unsigned long)iov.iov_base;

            frag->segment.seg_len = max_data;
            frag->segment.seg_addr.pval = iov.iov_base;
            frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;

            BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
                         "frag->segment.seg_key.key32[0] = %lu",
                         frag->sg_entry.lkey, frag->sg_entry.addr,
                         frag->segment.seg_key.key32[0]));
            return &frag->base;
        }
    }

    if(max_data + reserve <= btl->btl_eager_limit) {
        /* the data is small enough to fit in the eager frag and
         * memory is not prepinned */
        MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
    }
    if(NULL == frag) {
        /* the data doesn't fit into eager frag or eager frag is
         * not available */
        MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
        if(NULL == frag) {
            return NULL;
        }
        if(max_data + reserve > btl->btl_max_send_size) {
            max_data = btl->btl_max_send_size - reserve;
        }
    }

    /* pack the data into the fragment's buffer after the reserved header */
    iov.iov_len = max_data;
    iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;

    rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
    if(rc < 0) {
        MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
        return NULL;
    }
    *size = max_data;

    frag->segment.seg_len = max_data + reserve;
    frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
    frag->base.des_src = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;
    frag->base.des_flags = 0;

    return &frag->base;
}
/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * bring its endpoint down as well.  This is needed because there are
 * cases where only one side of the connection determines that
 * there was a problem.
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint,
        uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the
     * endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        /* match on the same remote process as the broken endpoint */
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |=
        MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    /* identify the broken connection by lid/subnet/vpid so the peer can
     * locate its side of it */
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)
        to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if(newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    /* BUSY means the fragment was queued for later delivery */
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)",
            strerror(errno)));
    return;
}