static int progress_pending_collfrags(mca_bcol_iboffload_module_t *iboffload) { mca_bcol_iboffload_collfrag_t *pending_collfrag; int rc, size = opal_list_get_size(&iboffload->collfrag_pending); IBOFFLOAD_VERBOSE(10, ("Calling progress_pending_collfrags")); do { pending_collfrag = (mca_bcol_iboffload_collfrag_t *) opal_list_remove_first(&iboffload->collfrag_pending); IBOFFLOAD_VERBOSE(10, ("Get pending_collfrag - %p, iboffload - %p, " "pending list size - %d.", pending_collfrag, iboffload, opal_list_get_size(&iboffload->collfrag_pending))); /* Return the coll frag to its coll request's work_requests list */ opal_list_append(&pending_collfrag->coll_full_req->work_requests, (opal_list_item_t *) pending_collfrag); rc = pending_collfrag->coll_full_req->progress_fn (iboffload, pending_collfrag->coll_full_req); if (OPAL_UNLIKELY(BCOL_FN_STARTED != rc && OMPI_SUCCESS != rc)) { return OMPI_ERROR; } } while (--size > 0); return OMPI_SUCCESS; }
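/*
 * The drain loop above snapshots the pending-list size before iterating, so
 * collfrags that get re-appended to collfrag_pending while being progressed
 * are not re-processed in the same pass. A minimal stand-alone sketch of that
 * pattern (the queue/frag types are hypothetical stand-ins for opal_list_t
 * and mca_bcol_iboffload_collfrag_t; compile as its own program):
 */
#include <stdio.h>
#include <stddef.h>

struct frag { struct frag *next; int id; };
struct queue { struct frag *head, *tail; size_t size; };

static struct frag *queue_remove_first(struct queue *q)
{
    struct frag *f = q->head;
    if (NULL != f) {
        q->head = f->next;
        if (NULL == q->head) { q->tail = NULL; }
        q->size--;
    }
    return f;
}

static void queue_append(struct queue *q, struct frag *f)
{
    f->next = NULL;
    if (NULL != q->tail) { q->tail->next = f; } else { q->head = f; }
    q->tail = f;
    q->size++;
}

/* Progress one frag; re-append it to model a frag that is still pending. */
static int progress_one(struct queue *q, struct frag *f)
{
    printf("progressing frag %d\n", f->id);
    if (0 == f->id) { queue_append(q, f); } /* still pending: requeue */
    return 0;
}

int main(void)
{
    struct queue q = { NULL, NULL, 0 };
    struct frag a = { NULL, 0 }, b = { NULL, 1 };
    size_t size;

    queue_append(&q, &a);
    queue_append(&q, &b);

    /* Drain exactly the elements present at entry, as above. */
    size = q.size;
    do {
        if (0 != progress_one(&q, queue_remove_first(&q))) { return 1; }
    } while (--size > 0);

    printf("left pending: %zu\n", q.size); /* frag 0 was requeued */
    return 0;
}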
/* Large message scatter-allgather with zero copy */ int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments, struct mca_bcol_base_function_t *const_args) { int i; mca_bcol_iboffload_collreq_t *coll_request = (mca_bcol_iboffload_collreq_t *)fn_arguments->bcol_opaque_data; /* IBOFFLOAD_VERBOSE(10, ("Run general progress. %d == %d * %d == %d", coll_request->n_frag_mpi_complete, coll_request->n_fragments, coll_request->n_frag_net_complete, coll_request->n_fragments)); */ /* Complete the bcast - progress releases full request descriptors */ for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { if (coll_request->n_frag_mpi_complete == coll_request->n_fragments && coll_request->n_frag_net_complete == coll_request->n_fragments) { IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n")); coll_request->module->device->mpool->mpool_deregister( coll_request->module->device->mpool, (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg); coll_request->buffer_info[SBUF].iboffload_reg = NULL; RELEASE_COLLREQ(coll_request); IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); return BCOL_FN_COMPLETE; } } /* IBOFFLOAD_VERBOSE(10, ("Bcast general progress done")); */ /* done */ return BCOL_FN_STARTED; }
static int mca_bcol_iboffload_fanin_init( bcol_function_args_t *input_args, mca_bcol_iboffload_module_t *iboffload, struct mca_bcol_iboffload_collreq_t **coll_request) { ompi_free_list_item_t *item = NULL; mca_bcol_iboffload_collfrag_t *coll_fragment = NULL; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_fanin_init")); OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); if(OPAL_UNLIKELY(NULL == item)) { IBOFFLOAD_VERBOSE(10, ("Failed to get an item from the coll request free list.\n")); return OMPI_ERR_OUT_OF_RESOURCE; } (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; (*coll_request)->progress_fn = iboffload->fanin_algth; (*coll_request)->completion_cb_fn = NULL; (*coll_request)->order_info = &input_args->order_info; (*coll_request)->module = iboffload; (*coll_request)->ml_buffer_index = input_args->buffer_index; (*coll_request)->buffer_info[SBUF].offset = 0; (*coll_request)->buffer_info[RBUF].offset = 0; (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; input_args->bcol_opaque_data = (void *) (*coll_request); /* finish initializing the full message descriptor */ (*coll_request)->n_fragments = 1; (*coll_request)->n_frags_sent = 1; (*coll_request)->n_frag_mpi_complete = 0; (*coll_request)->n_frag_net_complete = 0; (*coll_request)->user_handle_freed = false; /* * setup collective work request */ /* get collective frag */ coll_fragment = &(*coll_request)->first_collfrag; mca_bcol_iboffload_collfrag_init(coll_fragment); coll_fragment->alg = FANIN_ALG; coll_fragment->mq_index = COLL_MQ; /* Set mq credits */ coll_fragment->mq_credits = iboffload->alg_task_consump[FANIN_ALG]; /* set pointers for (coll frag) <-> (coll full request) */ MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment); return OMPI_SUCCESS; }
/* Unload devices */ static int iboffload_release_devices(void) { int i; mca_bcol_iboffload_device_t *device = NULL; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; opal_pointer_array_t *devs = &cm->devices; IBOFFLOAD_VERBOSE(10, ("Destroy all devices.\n")); for (i = 0; i < cm->num_devs; i++) { device = opal_pointer_array_get_item(devs, i); if (NULL != device) { IBOFFLOAD_VERBOSE(10, ("Device %s with index %d will be destroyed.\n", ibv_get_device_name(device->dev.ib_dev), i)); OBJ_RELEASE(device); } } IBOFFLOAD_VERBOSE(10, ("All devices were destroyed.\n")); opal_pointer_array_remove_all(devs); OBJ_DESTRUCT(devs); /* release the device list */ /*ibv_free_device_list_compat(cm->ib_devs);*/ ompi_ibv_free_device_list(cm->ib_devs); cm->ib_devs = NULL; IBOFFLOAD_VERBOSE(10, ("All devices destroyed.\n")); return OMPI_SUCCESS; }
int mca_bcol_iboffload_small_msg_bcast_progress( bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_iboffload_collreq_t *coll_request = (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data; IBOFFLOAD_VERBOSE(10, ("Run progress.\n")); /* The data must be sent to our children in the tree before the upper layer starts recycling buffers */ if (BCOL_AND_NET_ARE_COMPLETED(coll_request)) { coll_request->user_handle_freed = true; if (COLLREQ_IS_DONE(coll_request)) { IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); RELEASE_COLLREQ(coll_request); } IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); return BCOL_FN_COMPLETE; } return BCOL_FN_STARTED; }
/* query to see if the component is available for use, and can * satisfy the thread and progress requirements */ int mca_bcol_iboffload_init_query(bool enable_progress_threads, bool enable_mpi_threads) { int rc; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Init Iboffload component.\n")); /* Get the list of HCAs and ports */ rc = iboffload_load_devices(); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_ERROR(("Load devices error.\n")); goto unload_devices; } /* Setup the BSRQ QPs based on the final value of mca_bcol_iboffload_component.receive_queues. */ rc = setup_qps(); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_ERROR(("QPs setup error.\n")); goto unload_devices; } cm->super.collm_init_query = mca_bcol_iboffload_dummy_init_query; return OMPI_SUCCESS; /* done */ unload_devices: IBOFFLOAD_ERROR(("Release devices: an error occurred.\n")); iboffload_release_devices(); return rc; }
static int mca_bcol_iboffload_new_style_fanin_intra( bcol_function_args_t *input_args, struct coll_ml_function_t *const_args) { int rc = OMPI_SUCCESS; struct mca_bcol_iboffload_collreq_t *coll_request = NULL; mca_bcol_iboffload_module_t *iboffload = (mca_bcol_iboffload_module_t *) const_args->bcol_module; assert(NULL != iboffload); MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args); /* Init the Fan-In collective request */ rc = mca_bcol_iboffload_fanin_init(input_args, iboffload, &coll_request); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n")); return BCOL_FN_NOT_STARTED; } rc = iboffload->fanin_algth(iboffload, coll_request); if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { return BCOL_FN_NOT_STARTED; } return BCOL_FN_STARTED; }
int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super) { mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n")); comm_attribs.bcoll_type = BCOL_FANIN; comm_attribs.comm_size_min = 0; comm_attribs.comm_size_max = 1024 * 1024; comm_attribs.waiting_semantics = NON_BLOCKING; inv_attribs.bcol_msg_min = 0; inv_attribs.bcol_msg_max = 20000; /* range 1 */ inv_attribs.datatype_bitmap = 0xffffffff; inv_attribs.op_types_bitmap = 0xffffffff; comm_attribs.data_src = DATA_SRC_KNOWN; mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_new_style_fanin_intra, mca_bcol_iboffload_new_style_fanin_progress); return OMPI_SUCCESS; }
int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments, struct mca_bcol_base_function_t *const_args) { mca_bcol_iboffload_module_t *iboffload_module = (mca_bcol_iboffload_module_t *) const_args->bcol_module; int rc; int mq_credits = iboffload_module->power_of_2 * 3 + 4; bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); mca_bcol_iboffload_collreq_t *coll_request; MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module, &coll_request, if_bcol_last, mq_credits, mca_bcol_iboffload_bcast_scatter_allgather_extra_exec); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { return rc; } rc = coll_request->progress_fn(iboffload_module, coll_request); IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_bcast_scatter_allgather_extra_intra was started [%d]\n", rc)); return rc; }
static inline __opal_attribute_always_inline__ int binomial_scatter_smsg( mca_bcol_iboffload_module_t *iboffload_module, mca_bcol_iboffload_collfrag_t *coll_fragment, struct mqe_task **last_send, int radix_mask_pow, uint32_t my_group_index, size_t send_size ) { int rc, dst; int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0; while(radix_mask > 0) { /* For each level of tree, do sends */ dst = my_group_index ^ radix_mask; rc = mca_bcol_iboffload_send_small_buff_setup( last_send, send_size, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); return rc; } radix_mask >>= 1; } return OMPI_SUCCESS; }
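/*
 * A stand-alone illustration of the XOR addressing used by
 * binomial_scatter_smsg() above: starting from the highest tree level, each
 * iteration sends to the peer whose rank differs in exactly one bit, then
 * halves the mask. Compile as its own program.
 */
#include <stdio.h>

static void print_scatter_peers(int my_rank, int radix_mask_pow)
{
    int radix_mask = (radix_mask_pow >= 0) ? 1 << radix_mask_pow : 0;

    while (radix_mask > 0) {
        /* For each level of the tree, one send. */
        printf("rank %d -> rank %d (mask 0x%x)\n",
               my_rank, my_rank ^ radix_mask, radix_mask);
        radix_mask >>= 1;
    }
}

int main(void)
{
    /* In an 8-rank (2^3) group the root covers levels 4, 2, 1. */
    print_scatter_peers(0, 2); /* 0 -> 4, 0 -> 2, 0 -> 1 */
    return 0;
}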
static int mca_bcol_iboffload_alloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device) { int length; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; ompi_free_list_t *frags_free = &device->frags_free[qp_index]; OBJ_CONSTRUCT(frags_free, ompi_free_list_t); length = cm->qp_infos[qp_index].size; IBOFFLOAD_VERBOSE(10, ("free list len %d\n", length)); if (OMPI_SUCCESS != ompi_free_list_init_ex_new(frags_free, sizeof(mca_bcol_iboffload_frag_t), MCA_IBOFFLOAD_CACHE_LINE_SIZE, OBJ_CLASS(mca_bcol_iboffload_frag_t), length, cm->buffer_alignment, cm->free_list_num, cm->free_list_max, cm->free_list_inc, device->mpool, mca_bcol_iboffload_frag_init, (void *) &cm->qp_infos[qp_index].qp_index)) { IBOFFLOAD_ERROR(("Failed to allocate frags_free")); return OMPI_ERROR; } return OMPI_SUCCESS; }
static inline __opal_attribute_always_inline__ int handle_collfrag_done(mca_bcol_iboffload_collfrag_t *coll_frag, mca_bcol_iboffload_collreq_t *coll_request, mca_bcol_iboffload_device_t *device) { int rc; if (COLLFRAG_IS_DONE(coll_frag)) { IBOFFLOAD_VERBOSE(10, ("Coll frag - %p already done.\n", coll_frag)); coll_request->n_frag_net_complete++; IBOFFLOAD_VERBOSE(10, ("Free task resources.\n")); /* Check if we are done with this coll_frag and release resources if so. */ rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_frag, device->frags_free); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_ERROR(("mca_bcol_iboffload_free_tasks_frags_resources FAILED")); fatal_error("Failed to mca_bcol_iboffload_free_tasks_frags_resources\n"); return -1; } BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(coll_request->module, coll_frag->mq_index, coll_frag->mq_credits); RELEASE_COLLFRAG(coll_frag); PROGRESS_PENDING_COLLFRAG(coll_frag); IBOFFLOAD_VERBOSE(10, ("Alg %d: user_handle_freed - %d, n_frag_mpi_complete - %d, " "n_fragments - %d, n_frag_net_complete - %d, n_fragments - %d.\n", coll_frag->alg, coll_request->user_handle_freed, coll_request->n_frag_mpi_complete, coll_request->n_fragments, coll_request->n_frag_net_complete, coll_request->n_fragments)); /* check for full message completion */ if (COLLREQ_IS_DONE(coll_request)) { IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); RELEASE_COLLREQ(coll_request); } } IBOFFLOAD_VERBOSE(10, ("Exit with success.\n")); return 0; }
int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super) { mca_bcol_iboffload_module_t *iboffload_module = (mca_bcol_iboffload_module_t *) super; int my_group_index = iboffload_module->ibnet->super.my_index; mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; IBOFFLOAD_VERBOSE(10, ("Register iboffload Bcast.\n")); comm_attribs.bcoll_type = BCOL_BCAST; comm_attribs.comm_size_min = 0; comm_attribs.comm_size_max = 1024 * 1024; comm_attribs.waiting_semantics = NON_BLOCKING; inv_attribs.bcol_msg_min = 0; inv_attribs.bcol_msg_max = 20000; /* range 1 */ inv_attribs.datatype_bitmap = 0xffffffff; inv_attribs.op_types_bitmap = 0xffffffff; comm_attribs.data_src = DATA_SRC_KNOWN; if (my_group_index < iboffload_module->power_of_2_ranks) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_small_msg_bcast_intra, mca_bcol_iboffload_small_msg_bcast_progress); inv_attribs.bcol_msg_min = 10000000; inv_attribs.bcol_msg_max = 10485760; /* range 4 */ mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_bcast_scatter_allgather_intra, mca_bcol_iboffload_zero_copy_progress); } else { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_small_msg_bcast_extra_intra, mca_bcol_iboffload_small_msg_bcast_progress); inv_attribs.bcol_msg_min = 10000000; inv_attribs.bcol_msg_max = 10485760; /* range 4 */ mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, mca_bcol_iboffload_bcast_scatter_allgather_extra_intra, mca_bcol_iboffload_zero_copy_progress); } return OMPI_SUCCESS; }
/************************************************************************ ************************ New style Fan-In ****************************** ***********************************************************************/ static int mca_bcol_iboffload_new_style_fanin_progress( bcol_function_args_t *input_args, struct coll_ml_function_t *const_args) { mca_bcol_iboffload_collreq_t *coll_request = (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data; if (BCOL_IS_COMPLETED(coll_request)) { coll_request->user_handle_freed = true; if (COLLREQ_IS_DONE(coll_request)) { IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); RELEASE_COLLREQ(coll_request); } IBOFFLOAD_VERBOSE(10, ("Fan-In already done.\n")); return BCOL_FN_COMPLETE; } return BCOL_FN_STARTED; }
/* Create the list of IB HCAs that have an active port */ static int iboffload_load_devices(void) { int num_devs = 0, i; mca_bcol_iboffload_device_t *device = NULL; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Entering iboffload_load_devices")); /* Get the list of devices */ /*cm->ib_devs = ibv_get_device_list_compat(&num_devs);*/ cm->ib_devs = ompi_ibv_get_device_list(&num_devs); if (0 == num_devs || NULL == cm->ib_devs) { IBOFFLOAD_ERROR(("No IB devices found")); /* No HCA error */ orte_show_help("help-mpi-btl-openib.txt", "no-nics", true); return OMPI_ERROR; } cm->num_devs = num_devs; for (i = 0; i < num_devs; i++) { device = OBJ_NEW(mca_bcol_iboffload_device_t); if (NULL != device) { opal_pointer_array_set_item(&cm->devices, i, (void *) device); device->dev.ib_dev = cm->ib_devs[i]; IBOFFLOAD_VERBOSE(10, ("Device %s with index %d was appended.\n", ibv_get_device_name(device->dev.ib_dev), i)); } } if (0 == opal_pointer_array_get_size(&cm->devices)) { /* No relevant devices were found, return an error */ IBOFFLOAD_ERROR(("No active devices found.\n")); return OMPI_ERROR; } return OMPI_SUCCESS; }
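/*
 * Stand-alone sketch of the enumeration iboffload_load_devices() performs;
 * ompi_ibv_get_device_list() is a thin portability wrapper around
 * ibv_get_device_list(). Compile with -libverbs.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    int i, num_devs = 0;
    struct ibv_device **devs = ibv_get_device_list(&num_devs);

    if (NULL == devs || 0 == num_devs) {
        fprintf(stderr, "No IB devices found\n");
        return 1;
    }
    for (i = 0; i < num_devs; i++) {
        printf("device %d: %s\n", i, ibv_get_device_name(devs[i]));
    }
    ibv_free_device_list(devs);
    return 0;
}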
/* * Close the component */ static int iboffload_close(void) { int rc; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Destroy component free lists.\n")); if (true == cm->init_done) { OBJ_DESTRUCT(&cm->tasks_free); OBJ_DESTRUCT(&cm->collreqs_free); OBJ_DESTRUCT(&cm->collfrags_free); OBJ_DESTRUCT(&cm->calc_tasks_free); } /* Unregister the progress function */ rc = opal_progress_unregister(mca_bcol_iboffload_component_progress); if (OMPI_SUCCESS != rc) { IBOFFLOAD_ERROR(("Failed to unregister the progress function" " for iboffload component.\n")); } rc = iboffload_release_devices(); if (OMPI_SUCCESS != rc) { return rc; } if (NULL != cm->receive_queues) { free(cm->receive_queues); } OBJ_DESTRUCT(&cm->recv_wrs.lock); IBOFFLOAD_VERBOSE(10, ("The component closed.\n")); return OMPI_SUCCESS; }
static int progress_one_device(mca_bcol_iboffload_device_t *device) { int ne, rc, count = 0; mca_bcol_iboffload_collfrag_t *coll_frag; mca_bcol_iboffload_collreq_t *coll_request; struct ibv_wc wc; memset(&wc, 0, sizeof(struct ibv_wc)); /* * poll for collective completion - does not mean resources can * be freed, as incomplete network level sends may still be pending */ /* Poll for completions on wait MQEs */ if(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc))) { do { if (OPAL_UNLIKELY(0 > ne)) { IBOFFLOAD_ERROR(("Device %s: " "failed to poll MQ completion queue\n", ibv_get_device_name(device->dev.ib_dev))); fatal_error("failed to poll MQ completion queue\n"); return count; } if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) { IBOFFLOAD_ERROR(("Device %s: " "got a completion with error on wait, status %d, opcode %d, " "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev), wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id)); fatal_error("wc.status \n"); return count; } IBOFFLOAD_VERBOSE(10, ("The MQ completion was polled.\n")); ++count; /* get pointer to mca_bcol_iboffload_collfrag_t */ coll_frag = (mca_bcol_iboffload_collfrag_t*) (uint64_t) (uintptr_t) wc.wr_id; /* Only the last MQ task of a collective frag sends the completion signal, so if we got it => all MQEs were done. */ coll_frag->complete = true; IBOFFLOAD_VERBOSE(10, ("MQ completion for algorithm %d coll_frag_addr %p ml buffer index %d", coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index)); /* full request descriptor */ coll_request = coll_frag->coll_full_req; coll_request->n_frag_mpi_complete++; /* * at this stage all receives have completed, so unpack the data to the * user buffer; the resources will be released once we are done with all * elements in the task list */ if (NULL != coll_request->completion_cb_fn) { if (OMPI_SUCCESS != coll_request->completion_cb_fn(coll_frag)) { fatal_error("coll_request->completion_cb_fn\n"); return count; } } if (coll_request->n_frag_mpi_complete == coll_request->n_fragments) { coll_request->super.req_complete = true; opal_condition_broadcast(&ompi_request_cond); IBOFFLOAD_VERBOSE(10, ("After opal_condition_broadcast.\n")); } rc = handle_collfrag_done(coll_frag, coll_request, device); if (0 != rc) { return count; } } while(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc))); return count; } /* poll the send completion queue */ do { ne = ibv_poll_cq(device->ib_cq, 1, &wc); if (0 < ne) { if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) { IBOFFLOAD_ERROR(("Device %s, " "got a completion with error on send, status %d, opcode %d, " "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev), wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id)); #if OPAL_ENABLE_DEBUG { mca_bcol_iboffload_module_t *iboffload; int i, qp_index, num_qps = mca_bcol_iboffload_component.num_qps; coll_frag = (mca_bcol_iboffload_collfrag_t*) (uint64_t) (uintptr_t) wc.wr_id; iboffload = coll_frag->coll_full_req->module; for (i = 0; i < iboffload->num_endpoints; ++i) { mca_bcol_iboffload_endpoint_t *ep = iboffload->endpoints[i]; for (qp_index = 0; qp_index < num_qps; ++qp_index) { if (NULL != ep->qps[qp_index].qp->lcl_qp && wc.qp_num == ep->qps[qp_index].qp->lcl_qp->qp_num) { IBOFFLOAD_ERROR(("Module - %p, coll_frag - %p, " "destination %d, qp index - %d.", iboffload, coll_frag, i, qp_index)); } } } } #endif fatal_error("Failed to ibv_poll_cq\n"); return count; } ++count; /* get pointer to mca_bcol_iboffload_collfrag_t */ coll_frag = (mca_bcol_iboffload_collfrag_t*) (uint64_t) (uintptr_t) wc.wr_id; /* update the number of completed sends */ coll_frag->n_sends_completed++; IBOFFLOAD_VERBOSE(10, ("Send CQ completion for algorithm %d coll_frag_addr %p ml buffer index %d", coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index)); IBOFFLOAD_VERBOSE(10, ("Alg %d coll_frag_addr %p: n_sends_completed - %d, n_sends - %d.\n", coll_frag->alg, (void *)coll_frag, coll_frag->n_sends_completed, coll_frag->n_sends)); assert(coll_frag->n_sends_completed <= coll_frag->n_sends); /* full message descriptor */ coll_request = coll_frag->coll_full_req; /* check to see if all sends are complete from the network * perspective */ rc = handle_collfrag_done(coll_frag, coll_request, device); if (0 != rc) { return count; } } else if (OPAL_UNLIKELY(0 > ne)) { IBOFFLOAD_ERROR(("Device %s: " "failed to poll send completion queue\n", ibv_get_device_name(device->dev.ib_dev))); fatal_error("failed to poll send completion queue\n"); return count; } } while (0 != ne); return count; }
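/*
 * The polling structure above, reduced to a sketch: drain a CQ one entry at
 * a time with ibv_poll_cq(), stop on error or when the queue is empty, and
 * return the number of completions consumed. handle() stands in for the
 * per-completion work and is not part of the component's API. Compile with
 * -libverbs.
 */
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

static int drain_cq(struct ibv_cq *cq, int (*handle)(struct ibv_wc *))
{
    struct ibv_wc wc;
    int ne, count = 0;

    memset(&wc, 0, sizeof(wc));
    while (0 != (ne = ibv_poll_cq(cq, 1, &wc))) {
        if (ne < 0) {
            fprintf(stderr, "ibv_poll_cq failed\n");
            return -1;
        }
        if (IBV_WC_SUCCESS != wc.status) {
            fprintf(stderr, "completion error: status %d, wr_id 0x%llx\n",
                    wc.status, (unsigned long long) wc.wr_id);
            return -1;
        }
        if (0 != handle(&wc)) {
            return -1;
        }
        ++count;
    }
    return count;
}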
static int setup_qps(void) { int ret = OMPI_SUCCESS, qp = 0; int rd_num = 0, rd_low = 0, size = 0, rd_win = 0, rd_rsv = 0, sd_max = 0; mca_bcol_iboffload_qp_type_t type = 0; char **queues = NULL, **params = NULL; queues = opal_argv_split(mca_bcol_iboffload_component.receive_queues, ':'); if (0 == opal_argv_count(queues)) { orte_show_help("help-mpi-btl-openib.txt", "no qps in receive_queues", true, orte_process_info.nodename, mca_bcol_iboffload_component.receive_queues); ret = OMPI_ERROR; goto exit; } while (queues[qp] != NULL) { if (0 == strncmp("P,", queues[qp], 2)) { type = MCA_BCOL_IBOFFLOAD_PP_QP; } else if (0 == strncmp("S,", queues[qp], 2)) { type = MCA_BCOL_IBOFFLOAD_SRQ_QP; } else if (0 == strncmp("X,", queues[qp], 2)) { #if HAVE_XRC type = MCA_BCOL_IBOFFLOAD_XRC_QP; #else orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, orte_process_info.nodename, mca_bcol_iboffload_component.receive_queues); ret = OMPI_ERR_NOT_AVAILABLE; goto exit; #endif } else { orte_show_help("help-mpi-btl-openib.txt", "invalid qp type in receive_queues", true, orte_process_info.nodename, mca_bcol_iboffload_component.receive_queues, queues[qp]); ret = OMPI_ERR_BAD_PARAM; goto exit; } ++qp; } mca_bcol_iboffload_component.num_qps = MCA_BCOL_IBOFFLOAD_QP_LAST; qp = 0; #define P(N) (((N) > count) ? NULL : params[(N)]) while (NULL != queues[qp]) { int count; params = opal_argv_split_with_empty(queues[qp], ','); count = opal_argv_count(params); if ('P' == params[0][0]) { if (count < 3 || count > 6) { orte_show_help("help-mpi-btl-openib.txt", "invalid pp qp specification", true, orte_process_info.nodename, queues[qp]); ret = OMPI_ERR_BAD_PARAM; goto exit; } size = atoi_param(P(1), 0); rd_num = atoi_param(P(2), 256); /* by default set rd_low to be 3/4 of rd_num */ rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); if ((rd_num - rd_low) > rd_win) { orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", true, rd_win, rd_num - rd_low); } } else { if (count < 3 || count > 5) { orte_show_help("help-mpi-btl-openib.txt", "invalid srq specification", true, orte_process_info.nodename, queues[qp]); ret = OMPI_ERR_BAD_PARAM; goto exit; } size = atoi_param(P(1), 0); rd_num = atoi_param(P(2), 256); /* by default set rd_low to be 3/4 of rd_num */ rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); sd_max = atoi_param(P(4), rd_low / 4); IBOFFLOAD_VERBOSE(10, ("srq: rd_num is %d rd_low is %d sd_max is %d", rd_num, rd_low, sd_max)); } if (rd_num <= rd_low) { orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", true, orte_process_info.nodename, queues[qp]); ret = OMPI_ERR_BAD_PARAM; goto exit; } opal_argv_free(params); ++qp; } params = NULL; for (qp = 0; qp < MCA_BCOL_IBOFFLOAD_QP_LAST; ++qp) { mca_bcol_iboffload_component.qp_infos[qp].qp_index = qp; mca_bcol_iboffload_component.qp_infos[qp].type = type; mca_bcol_iboffload_component.qp_infos[qp].size = size; mca_bcol_iboffload_component.qp_infos[qp].rd_num = rd_num; mca_bcol_iboffload_component.qp_infos[qp].rd_low = rd_low; mca_bcol_iboffload_component.qp_infos[qp].rd_pp_win = rd_num - rd_low; if (MCA_BCOL_IBOFFLOAD_PP_QP == type) { mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; } else { mca_bcol_iboffload_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; } if (NULL != setup_qps_fn[qp]) { setup_qps_fn[qp](&mca_bcol_iboffload_component.qp_infos[qp]); } } exit: if 
(NULL != params) { opal_argv_free(params); } if (NULL != queues) { opal_argv_free(queues); } return ret; }
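/*
 * A stand-alone sketch of the per-QP spec parsing done by setup_qps(): one
 * "P,size,rd_num[,rd_low[,rd_win[,rd_rsv]]]" entry, with the same defaults
 * as above (rd_low defaults to 3/4 of rd_num). strtok_r() stands in for
 * opal_argv_split()/atoi_param().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char spec[] = "P,65536,256";
    char *save = NULL;
    char *tok = strtok_r(spec, ",", &save);
    char type = tok[0];
    int size   = (NULL != (tok = strtok_r(NULL, ",", &save))) ? atoi(tok) : 0;
    int rd_num = (NULL != (tok = strtok_r(NULL, ",", &save))) ? atoi(tok) : 256;
    int rd_low = (NULL != (tok = strtok_r(NULL, ",", &save))) ? atoi(tok)
                                                              : rd_num - rd_num / 4;

    printf("type %c: size %d, rd_num %d, rd_low %d\n", type, size, rd_num, rd_low);
    return 0; /* prints: type P: size 65536, rd_num 256, rd_low 192 */
}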
static int mca_bcol_iboffload_dummy_frag_qp_prepost( mca_bcol_iboffload_endpoint_t *endpoint, int qp_index, int num_to_prepost) { struct ibv_recv_wr *recv_wr, *recv_bad; int ret, num_preposted = 0, start_wr_index; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d", (void *) endpoint, num_to_prepost)); if (OPAL_UNLIKELY(0 == num_to_prepost)) { IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, returning immediately")); return OMPI_SUCCESS; } /* make sure that we do not overrun the number of rd_wqe */ if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) { IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d", num_to_prepost, endpoint->qps[qp_index].rd_wqe)); num_to_prepost = endpoint->qps[qp_index].rd_wqe; } OPAL_THREAD_LOCK(&recv_wrs->lock); /* calculate start index in array * of pre-allocated work requests */ start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost; recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_prepost %d, " "start index of WRs - %d, rd_wqe - %d", (void *) endpoint, qp_index, num_to_prepost, start_wr_index, endpoint->qps[qp_index].rd_wqe)); while (num_preposted < num_to_prepost) { /* prepost the special barrier frag to the recv queue */ struct ibv_sge *dummy_sg_entry = &endpoint->iboffload_module->device->dummy_frags[qp_index].sg_entry; recv_wr[num_preposted].sg_list = dummy_sg_entry; ++num_preposted; } if (OPAL_LIKELY(num_preposted > 0)) { /* Set the tail */ recv_wr[num_preposted - 1].next = NULL; /* post the list of recvs */ ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); if (OPAL_UNLIKELY(0 != ret)) { IBOFFLOAD_ERROR(("ibv_post_recv failed, error: %s [%d], " "qp_index - %d.\n", strerror(errno), ret, qp_index)); OPAL_THREAD_UNLOCK(&recv_wrs->lock); return OMPI_ERROR; } /* recover the last recv_wr if needed */ if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) { recv_wr[num_preposted - 1].next = &recv_wr[num_preposted]; } /* decrease the number of free recv WQEs */ endpoint->qps[qp_index].rd_wqe -= num_preposted; } OPAL_THREAD_UNLOCK(&recv_wrs->lock); IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_prepost %d, num preposted - %d, qp_index - %d", (void *) endpoint, num_to_prepost, num_preposted, qp_index)); return OMPI_SUCCESS; }
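/*
 * Both prepost functions rely on the recv WRs being a pre-chained array:
 * posting the first n entries only requires cutting the chain at n-1 and
 * splicing it back afterwards. A stand-alone model of that, with a
 * simplified stand-in for struct ibv_recv_wr:
 */
#include <stdio.h>

struct recv_wr { struct recv_wr *next; int idx; };

static void post_first_n(struct recv_wr *wr, int n, int total)
{
    struct recv_wr *p;

    wr[n - 1].next = NULL;                 /* set the tail for this post */
    for (p = wr; NULL != p; p = p->next) {
        printf("posting wr %d\n", p->idx); /* ibv_post_recv() walks ->next */
    }
    if (n != total) {
        wr[n - 1].next = &wr[n];           /* recover the chain, as above */
    }
}

int main(void)
{
    struct recv_wr wr[4];
    int i;

    for (i = 0; i < 4; i++) {
        wr[i].idx = i;
        wr[i].next = (i < 3) ? &wr[i + 1] : NULL;
    }
    post_first_n(wr, 2, 4);                /* posts wr 0 and wr 1 only */
    return 0;
}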
static int mca_bcol_iboffload_fanin_leader_progress( mca_bcol_iboffload_module_t *iboffload, struct mca_bcol_iboffload_collreq_t *coll_request) { int rc = OMPI_SUCCESS, leader_rank = 0, rank, sbgp_size = iboffload->ibnet->super.group_size; struct mqe_task *last_wait = NULL; mca_bcol_iboffload_task_t *wait_task = NULL; mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL; struct mqe_task **mqe_ptr_to_set; mca_bcol_iboffload_collfrag_t *coll_fragment; coll_fragment = (mca_bcol_iboffload_collfrag_t *) opal_list_get_last(&coll_request->work_requests); mqe_ptr_to_set = &coll_fragment->to_post; if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } for (rank = leader_rank + 1; rank < sbgp_size; ++rank) { /* post wait */ preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( iboffload, rank, coll_request->qp_index); if(NULL == preposted_recv_frag) { IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n")); goto out_of_resources; } wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1, preposted_recv_frag, coll_request->qp_index, NULL); if(NULL == wait_task) { IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); goto out_of_resources; } APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); } /* end of list */ *mqe_ptr_to_set = NULL; last_wait->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_wait->wr_id; last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); if(OMPI_SUCCESS != rc) { IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); return OMPI_SUCCESS; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); }
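/*
 * The APPEND_TO_TASKLIST/mqe_ptr_to_set pattern above, sketched with a
 * hypothetical task type: keep a pointer to the slot that should receive the
 * next element, so appends are O(1) and the final *tail = NULL terminates
 * the chain before it is posted.
 */
#include <stdio.h>
#include <stddef.h>

struct task { struct task *next; const char *name; };

int main(void)
{
    struct task a = { NULL, "wait(rank 1)" };
    struct task b = { NULL, "wait(rank 2)" };
    struct task *head = NULL, **tail = &head, *t;

    *tail = &a; tail = &a.next;  /* append a */
    *tail = &b; tail = &b.next;  /* append b */
    *tail = NULL;                /* end of list */

    for (t = head; NULL != t; t = t->next) {
        printf("%s\n", t->name);
    }
    return 0;
}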
static void mca_bcol_iboffload_fillin_qp_attr(int qp_index, mca_bcol_iboffload_endpoint_t *ep, ompi_common_ofacm_base_qp_config_t *qp_config) { uint32_t max_sge, *init_attr_mask = &qp_config->init_attr_mask[qp_index]; struct ibv_qp_attr *attr = &qp_config->attr[qp_index]; struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index]; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; /* Set special init attributes mask */ *init_attr_mask = IBV_M_QP_EXT_CLASS_1 | IBV_M_QP_EXT_CLASS_2 | IBV_M_QP_EXT_IGNORE_RQ_OVERFLOW; /* Set init attributes */ init_attr->qp_type = IBV_QPT_RC; /* Vasily: ?????? init_attr->cap.max_inline_data = max_inline_size(qp, iboffload_module->device); */ /* Pasha: we can not leave max_inline empty ! Todo: copy max_inline_size() from ofacm to common area. */ init_attr->cap.max_inline_data = (int32_t) cm->max_inline_data; /* We allocate an SG list for some algorithms (Bruck's alltoall) */ max_sge = ep->iboffload_module->group_size / 2 + ep->iboffload_module->group_size % 2; /* max send sge should be less than the device maximum */ if (max_sge > (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge) { max_sge = (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge; } init_attr->cap.max_send_sge = max_sge; init_attr->cap.max_recv_sge = max_sge; /* Vasily: the value will be changed later */ /* TODO Pasha: this is real crap */ init_attr->cap.max_recv_wr = (uint32_t) cm->cq_size; init_attr->cap.max_send_wr = (uint32_t) cm->cq_size; /* Set attributes */ /* attr->pkey_index = 0; */ /* Vasily: ????? */ attr->port_num = ep->iboffload_module->port; /* Vasily: the value will be changed later */ attr->path_mtu = (uint32_t)cm->mtu; attr->max_dest_rd_atomic = cm->max_rdma_dst_ops; attr->min_rnr_timer = (uint32_t)cm->min_rnr_timer; attr->ah_attr.is_global = 0; attr->ah_attr.sl = (uint32_t)cm->service_level; /* Vasily: from struct mca_bcol_iboffload_port_t ????? */ /* attr->ah_attr.src_path_bits = iboffload_module->src_path_bits; */ attr->ah_attr.port_num = ep->iboffload_module->port; /* JMS to be filled in later dynamically */ attr->ah_attr.static_rate = 0; /* RTS params */ attr->timeout = (uint32_t)cm->timeout; attr->retry_cnt = (uint32_t)cm->retry_count; attr->rnr_retry = (uint32_t)cm->rnr_retry; attr->max_rd_atomic = (uint32_t)cm->max_rdma_dst_ops; /* Init the local mca_bcol_iboffload_endpoint_qp_t qps structure * that caches the qp information on the endpoint */ OBJ_CONSTRUCT(&ep->qps[qp_index].preposted_frags, opal_list_t); /* Pasha: Need to add function that will */ ep->qps[qp_index].ib_inline_max = cm->max_inline_data; /* TODO Pasha - this is crap too... we do not have info for service qps. Fix it later */ ep->qps[qp_index].sd_wqe = cm->qp_infos[qp_index].rd_num; ep->qps[qp_index].rd_wqe = cm->qp_infos[qp_index].rd_num; IBOFFLOAD_VERBOSE(10, ("ep - %p, qp index - %d, num of rd_wqe - %d.", ep, qp_index, ep->qps[qp_index].rd_wqe)); }
/* * Receive prepost: * return values: * OMPI_SUCCESS - the prepost succeeded (including the case where zero * elements were preposted) * OMPI_ERROR - fatal error during prepost */ static int mca_bcol_iboffload_frag_reg_qp_prepost( mca_bcol_iboffload_endpoint_t *endpoint, int qp_index, int num_to_prepost) { ompi_free_list_item_t *item; mca_bcol_iboffload_frag_t *frag; struct ibv_recv_wr *recv_wr, *recv_bad; int i, ret, num_preposted = 0, start_wr_index; mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device; opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags); mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d", (void *) endpoint, num_to_prepost)); if (OPAL_UNLIKELY(0 == num_to_prepost)) { IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, returning immediately")); return OMPI_SUCCESS; } /* make sure that we do not overrun the number of rd_wqe */ if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) { IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d", num_to_prepost, endpoint->qps[qp_index].rd_wqe)); num_to_prepost = endpoint->qps[qp_index].rd_wqe; } OPAL_THREAD_LOCK(&recv_wrs->lock); /* calculate start index in array * of pre-allocated work requests */ start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost; recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_prepost %d, " "start index of WRs - %d, rd_wqe - %d", (void *) endpoint, qp_index, num_to_prepost, start_wr_index, endpoint->qps[qp_index].rd_wqe)); while (num_preposted < num_to_prepost) { /* put the item on the list of preposted */ OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item); if (OPAL_UNLIKELY(NULL == item)) { break; } frag = (mca_bcol_iboffload_frag_t *) item; opal_list_append(preposted, (opal_list_item_t *) item); recv_wr[num_preposted].sg_list = &frag->sg_entry; /* TODO Pasha - fix it later */ /* Vasily: Is it right place to take a size value ???? */ frag->sg_entry.length = cm->qp_infos[qp_index].size; ++num_preposted; } if (OPAL_LIKELY(num_preposted > 0)) { /* Set the tail */ recv_wr[num_preposted - 1].next = NULL; /* post the list of recvs */ ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); if (OPAL_UNLIKELY(0 != ret)) { IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], " "qp_index - %d.\n", ibv_get_device_name(device->dev.ib_dev), strerror(errno), ret, qp_index)); /* Return the allocated frags */ for (i = 0; i < num_preposted; i++) { OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], (ompi_free_list_item_t *) opal_list_remove_last(preposted)); } OPAL_THREAD_UNLOCK(&recv_wrs->lock); return OMPI_ERROR; } /* recover the last recv_wr if needed */ if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) { recv_wr[num_preposted - 1].next = &recv_wr[num_preposted]; } /* decrease the number of free recv WQEs */ endpoint->qps[qp_index].rd_wqe -= num_preposted; } OPAL_THREAD_UNLOCK(&recv_wrs->lock); IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_prepost %d, num preposted - %d", (void *) endpoint, num_to_prepost, num_preposted)); return OMPI_SUCCESS; }
static void mca_bcol_iboffload_device_destructor (mca_bcol_iboffload_device_t *device) { int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; IBOFFLOAD_VERBOSE(10, ("Device %s will be destroyed.\n", ibv_get_device_name(device->dev.ib_dev))); if (NULL != device->frags_free) { for (qp_index = 0; qp_index < num_qps; ++qp_index) { mca_bcol_iboffload_dealloc_qps_resource_fn_t dealloc_resource = mca_bcol_iboffload_component.qp_infos[qp_index].dealloc_resource; if (NULL != dealloc_resource) { dealloc_resource(qp_index, device); } } free(device->frags_free); } if (NULL != device->mpool) { IBOFFLOAD_VERBOSE(10, ("Mpool destroy - %p.\n", device->mpool)); if (OMPI_SUCCESS != mca_mpool_base_module_destroy(device->mpool)) { IBOFFLOAD_ERROR(("Device %s, failed to destroy mpool", ibv_get_device_name(device->dev.ib_dev))); } } if (NULL != device->dummy_reg.mr) { IBOFFLOAD_VERBOSE(10, ("Dummy memory MR unregister - %p.\n", device->dummy_reg.mr)); if (OMPI_SUCCESS != mca_bcol_iboffload_deregister_mr((void *) device, &device->dummy_reg.base)) { IBOFFLOAD_ERROR(("Device %s: failed to unregister dummy memory MR.", ibv_get_device_name(device->dev.ib_dev))); } } if (NULL != device->ib_cq) { if (ibv_destroy_cq(device->ib_cq)) { IBOFFLOAD_ERROR(("Device %s, failed to destroy CQ, errno says %s", ibv_get_device_name(device->dev.ib_dev), strerror(errno))); } } if (NULL != device->ib_mq_cq) { if (ibv_destroy_cq(device->ib_mq_cq)) { IBOFFLOAD_ERROR(("Device %s, failed to destroy mq CQ, errno says %s", ibv_get_device_name(device->dev.ib_dev), strerror(errno))); } } /* Release IB PD if we have one */ if (NULL != device->ib_pd) { if(ibv_dealloc_pd(device->ib_pd)) { IBOFFLOAD_ERROR(("Device %s, failed to release PD, errno says %s", ibv_get_device_name(device->dev.ib_dev), strerror(errno))); } } /* close the device */ if (NULL != device->dev.ib_dev_context) { if (ibv_close_device(device->dev.ib_dev_context)) { IBOFFLOAD_ERROR(("Device %s " ", failed to close the device, errno says %s", ibv_get_device_name(device->dev.ib_dev), strerror(errno))); } } /* release memory */ if (NULL != device->ports) { free(device->ports); } }
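/*
 * Teardown order matters: the destructor above releases CQs before the PD
 * and the PD before the device context, mirroring creation in reverse. A
 * reduced sketch of that ordering (compile with -libverbs); each verbs call
 * returns non-zero while dependent resources are still alive.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static void release_device(struct ibv_cq *cq, struct ibv_pd *pd,
                           struct ibv_context *ctx)
{
    if (NULL != cq && ibv_destroy_cq(cq)) {
        fprintf(stderr, "failed to destroy CQ\n");
    }
    if (NULL != pd && ibv_dealloc_pd(pd)) {
        fprintf(stderr, "failed to release PD\n");
    }
    if (NULL != ctx && ibv_close_device(ctx)) {
        fprintf(stderr, "failed to close device\n");
    }
}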
static int mca_bcol_iboffload_bcast_scatter_allgather_extra_exec(mca_bcol_iboffload_module_t *iboffload_module, mca_bcol_iboffload_collreq_t *coll_request) { netpatterns_pair_exchange_node_t *recursive_doubling_tree = &iboffload_module->recursive_doubling_tree; int rc, dst; int count = coll_request->count * coll_request->dtype->super.size; int my_group_index = iboffload_module->ibnet->super.my_index; struct mqe_task *last_send = NULL, *last_wait = NULL; mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { bcol_iboffload_setup_binomial_connection(iboffload_module); } /* register memory in mpool/rcache */ rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count, &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_ERROR(("Cannot register memory: " "addr - %p, %d bytes.\n", coll_request->buffer_info[SBUF].buf, count)); return OMPI_ERROR; } coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey; /* it is estimated mq consumption... */ if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) || false == opal_list_is_empty(&iboffload_module->collfrag_pending))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } coll_fragment->tail_next = &coll_fragment->to_post; /* send or recv the data */ if (coll_request->root == my_group_index) { IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2)); /* send all the data to your extra peer */ dst = recursive_doubling_tree->rank_extra_source; rc = mca_bcol_iboffload_recv_rtr_setup( &last_wait, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_recv_rtr_setup")); return OMPI_ERROR; } rc = mca_bcol_iboffload_send_large_buff_setup( &last_send, SBUF, 0, count, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_large_buff_setup")); return OMPI_ERROR; } } else { /* Not root case */ dst = recursive_doubling_tree->rank_extra_source; rc = mca_bcol_iboffload_send_rtr_setup(&last_send, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); return OMPI_ERROR; } rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, SBUF, 0, count, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); return OMPI_ERROR; } } IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n")); /* end of list */ *coll_fragment->tail_next = NULL; /* finish initializing the full message descriptor */ coll_request->n_fragments += 1; coll_request->n_frags_sent += 1; if (NULL != last_wait) { last_wait->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_wait->wr_id; last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; } else { last_send->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_send->wr_id; last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; } /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("MQE task posting 
failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); IBOFFLOAD_VERBOSE(10, ("Return success.\n")); return BCOL_FN_STARTED; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; }
static int mca_bcol_iboffload_bcast_scatter_allgather_exec(mca_bcol_iboffload_module_t *iboffload_module, mca_bcol_iboffload_collreq_t *coll_request) { netpatterns_pair_exchange_node_t *recursive_doubling_tree = &iboffload_module->recursive_doubling_tree; int rc, dst, group_src, power_of_2_distance, recv_count; size_t offset; int count = coll_request->count * coll_request->dtype->super.size; int my_group_index = iboffload_module->ibnet->super.my_index; size_t base_block_size = (count + iboffload_module->power_of_2_ranks - 1) / iboffload_module->power_of_2_ranks; struct mqe_task *last_send = NULL, *last_wait = NULL; mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { bcol_iboffload_setup_binomial_connection(iboffload_module); } /* register memory in mpool/rcache */ rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count, &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_ERROR(("Cannot register memory: " "addr - %p, %d bytes.\n", coll_request->buffer_info[SBUF].buf, count)); return OMPI_ERROR; } coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey; /* it is estimated mq consumption... */ if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) || false == opal_list_is_empty(&iboffload_module->collfrag_pending))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } coll_fragment->tail_next = &coll_fragment->to_post; if (coll_request->root == my_group_index) { IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d %d", iboffload_module->power_of_2, recursive_doubling_tree->n_extra_sources )); /* for proxy we have little bit more work to do */ if (recursive_doubling_tree->n_extra_sources > 0) { /* send the all data to your extra peer */ dst = recursive_doubling_tree->rank_extra_source; rc = mca_bcol_iboffload_recv_rtr_setup( &last_wait, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_recv_rtr_setup")); return OMPI_ERROR; } rc = mca_bcol_iboffload_send_large_buff_setup( &last_send, SBUF, 0, count, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_large_buff_setup")); return OMPI_ERROR; } } power_of_2_distance = iboffload_module->power_of_2; BINOMIAL_SCATTER(iboffload_module, coll_fragment, last_wait, last_send, power_of_2_distance - 1, my_group_index, base_block_size, count ); /* EXIT OR GO TO Gather */ goto GATHER; } /* prepare and post recv operation */ group_src = bcol_iboffload_binomial_root_to_src(coll_request->root, my_group_index, iboffload_module->power_of_2_ranks, iboffload_module->group_size, &power_of_2_distance); IBOFFLOAD_VERBOSE(10, ("SRC %d DIST %d ranks %d gsize %d root %d my rank %d", group_src, power_of_2_distance, iboffload_module->power_of_2_ranks, iboffload_module->group_size, coll_request->root, my_group_index)); assert(group_src >= 0); if (0 > power_of_2_distance) { /* the rank is virtual root for this group, receive the data and scatter gather as root */ power_of_2_distance = iboffload_module->power_of_2; offset = 0; recv_count = count; IBOFFLOAD_VERBOSE(10, ("Virtual root %d , set mask to %d", my_group_index, power_of_2_distance)); } else { int 
my_left_boundary_rank; int delta; recv_count = base_block_size * (1 << power_of_2_distance); /* we may receive larger data */ my_left_boundary_rank = my_group_index & ((~(int)0) << power_of_2_distance); offset = (size_t) (base_block_size * my_left_boundary_rank); delta = count - (int) offset; if (OPAL_UNLIKELY(delta <= 0)) { /* no data to recv */ goto GATHER; } else { recv_count = (delta < recv_count) ? delta : recv_count; } IBOFFLOAD_VERBOSE(10, ("Recv data set mask to %d", power_of_2_distance)); } IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d, count %d, offset %lu", group_src, recv_count, (unsigned long) offset)); /* Receive data into the user buffer */ rc = mca_bcol_iboffload_send_rtr_setup(&last_send, group_src, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); return OMPI_ERROR; } rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, SBUF, offset, recv_count, group_src, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); return OMPI_ERROR; } BINOMIAL_SCATTER(iboffload_module, coll_fragment, last_wait, last_send, power_of_2_distance - 1, my_group_index, base_block_size, count); GATHER: rc = bcol_iboffload_bcast_binomial_gather(iboffload_module, &last_send, &last_wait, coll_fragment, count, base_block_size, power_of_2_distance); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup gather. Return %d", rc)); return rc; } if (recursive_doubling_tree->n_extra_sources > 0 && iboffload_module->power_of_2 != power_of_2_distance) { dst = recursive_doubling_tree->rank_extra_source; rc = mca_bcol_iboffload_recv_rtr_setup( &last_wait, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_recv_rtr_setup")); return OMPI_ERROR; } rc = mca_bcol_iboffload_send_large_buff_setup( &last_send, SBUF, 0, count, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_large_buff_setup")); return OMPI_ERROR; } } IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n")); /* end of list */ *coll_fragment->tail_next = NULL; /* finish initializing the full message descriptor */ coll_request->n_fragments += 1; coll_request->n_frags_sent += 1; if (NULL != last_wait) { last_wait->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_wait->wr_id; last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; } else { last_send->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_send->wr_id; last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; } /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); IBOFFLOAD_VERBOSE(10, ("Return success.\n")); return BCOL_FN_STARTED; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; }
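/*
 * Stand-alone sketch of the offset arithmetic used by the scatter phase
 * above: base_block_size is ceil(count / power_of_2_ranks), a rank's block
 * starts at base_block_size times the left boundary of its subtree, and the
 * receive size is clamped so the trailing ranks do not run past count.
 */
#include <stdio.h>

int main(void)
{
    int count = 1000, ranks = 8;                       /* 2^3 ranks */
    int base_block_size = (count + ranks - 1) / ranks; /* ceil -> 125 */
    int pow_dist = 1;                                  /* example level */
    int rank;

    for (rank = 0; rank < ranks; rank++) {
        int left   = rank & ~((1 << pow_dist) - 1);    /* subtree boundary */
        int offset = base_block_size * left;
        int delta  = count - offset;                   /* bytes remaining */
        int recv   = base_block_size * (1 << pow_dist);

        if (delta < recv) { recv = delta; }            /* clamp last blocks */
        printf("rank %d: offset %d, recv %d\n", rank, offset, recv);
    }
    return 0;
}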
static int mca_bcol_iboffload_small_msg_bcast_exec(mca_bcol_iboffload_module_t *iboffload_module, mca_bcol_iboffload_collreq_t *coll_request) { netpatterns_pair_exchange_node_t *recursive_doubling_tree = &iboffload_module->recursive_doubling_tree; int rc, distance_mask_pow, dst, group_src, power_of_2_distance; uint32_t pack_len; int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; struct mqe_task *last_send = NULL, *last_wait = NULL; mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast")); if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); bcol_iboffload_setup_binomial_connection(iboffload_module); } pack_len = coll_request->count * coll_request->dtype->super.size; IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", pack_len, coll_request->count, coll_request->dtype->super.size)); /* it is estimated mq consumption... */ if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } coll_fragment->tail_next = &coll_fragment->to_post; coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; if (coll_request->root == my_group_index) { IBOFFLOAD_VERBOSE(10, ("I'm root of the data")); /* Send data to the extra peer */ if (recursive_doubling_tree->n_extra_sources > 0) { /* send all the data to your extra peer */ dst = recursive_doubling_tree->rank_extra_source; IBOFFLOAD_VERBOSE(10,("Sending the data to dst %d", dst)); rc = mca_bcol_iboffload_send_small_buff_setup( &last_send, pack_len, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_small_buff_setup")); goto out_of_resources; } } distance_mask_pow = iboffload_module->power_of_2 - 1; rc = binomial_scatter_smsg(iboffload_module, coll_fragment, &last_send, distance_mask_pow, my_group_index, pack_len); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg")); goto out_of_resources; } goto finalize; } /* prepare and post recv operation */ group_src = bcol_iboffload_binomial_root_to_src(coll_request->root, my_group_index, iboffload_module->power_of_2_ranks, iboffload_module->group_size, &power_of_2_distance); assert(group_src >= 0); if (0 > power_of_2_distance) { /* the rank is virtual root for this group, receive the data and scatter gather as root */ IBOFFLOAD_VERBOSE(10,("Virtual root distance_mask_pow %d ", iboffload_module->power_of_2)); distance_mask_pow = iboffload_module->power_of_2 - 1; } else { distance_mask_pow = power_of_2_distance - 1; } IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d", group_src)); rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, pack_len, group_src, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); goto out_of_resources; } rc = binomial_scatter_smsg(iboffload_module, coll_fragment, &last_send, distance_mask_pow, my_group_index, pack_len); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg")); goto out_of_resources; } if (recursive_doubling_tree->n_extra_sources > 0 && iboffload_module->power_of_2 - 1 != 
distance_mask_pow) { /* if ((recursive_doubling_tree->n_extra_sources > 0) && ((my_group_index + iboffload_module->power_of_2_ranks ) < iboffload_module->group_size) ) { */ dst = recursive_doubling_tree->rank_extra_source; /* dst = my_group_index + iboffload_module->power_of_2_ranks; */ rc = mca_bcol_iboffload_send_small_buff_setup( &last_send, pack_len, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_small_buff_setup")); goto out_of_resources; } } finalize: /* end of list */ *coll_fragment->tail_next = NULL; /* finish initializing full message descriptor */ (coll_request)->n_fragments += 1; (coll_request)->n_frags_sent += 1; if (NULL != last_wait) { last_wait->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_wait->wr_id; last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; } else { last_send->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_send->wr_id; last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; } /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); IBOFFLOAD_VERBOSE(10, ("Return success.\n")); return BCOL_FN_STARTED; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; }
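/*
 * The extra-rank handling above follows the usual recursive-doubling proxy
 * scheme (stated here as an assumption about the netpatterns mapping): the
 * largest power-of-two subgroup of 2^k ranks runs the exchange, and each
 * extra rank r >= 2^k pairs with proxy rank r - 2^k (its rank_extra_source).
 */
#include <stdio.h>

int main(void)
{
    int group_size = 11, pow2 = 1, k = 0, r;

    while (pow2 * 2 <= group_size) { pow2 *= 2; k++; }
    printf("group of %d: 2^%d = %d ranks in the exchange\n",
           group_size, k, pow2);
    for (r = pow2; r < group_size; r++) {
        printf("extra rank %d <-> proxy rank %d\n", r, r - pow2);
    }
    return 0;
}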
static int mca_bcol_iboffload_small_msg_bcast_extra_exec(mca_bcol_iboffload_module_t *iboffload_module, mca_bcol_iboffload_collreq_t *coll_request) { netpatterns_pair_exchange_node_t *recursive_doubling_tree = &iboffload_module->recursive_doubling_tree; int rc, dst; int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; uint32_t pack_len; struct mqe_task *last_send = NULL, *last_wait = NULL; mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; IBOFFLOAD_VERBOSE(10,("Entering small msg extra iboffload bcast")); if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); bcol_iboffload_setup_binomial_connection(iboffload_module); } pack_len = coll_request->count * coll_request->dtype->super.size; coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", pack_len, coll_request->count, coll_request->dtype->super.size)); /* it is estimated mq consumption... */ if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } coll_fragment->tail_next = &coll_fragment->to_post; if (coll_request->root == my_group_index) { IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2)); /* send the all data to your extra peer */ dst = recursive_doubling_tree->rank_extra_source; IBOFFLOAD_VERBOSE(10,("Im extra root sending data to %d \n",dst)); rc = mca_bcol_iboffload_send_small_buff_setup( &last_send, pack_len, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to" " mca_bcol_iboffload_send_small_buff_setup")); goto out_of_resources; } } else { /* Not root case */ dst = recursive_doubling_tree->rank_extra_source; rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, pack_len, dst, iboffload_module, coll_fragment); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); return OMPI_ERROR; } } /* end of list */ *coll_fragment->tail_next = NULL; /* finish initializing full message descriptor */ (coll_request)->n_fragments = 1; (coll_request)->n_frags_sent = 1; if (NULL != last_wait) { last_wait->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_wait->wr_id; last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; } else { last_send->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_send->wr_id; last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; } /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); IBOFFLOAD_VERBOSE(10, ("Return success.\n")); return BCOL_FN_STARTED; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; }
static int mca_bcol_iboffload_fanin_proxy_progress( mca_bcol_iboffload_module_t *iboffload, struct mca_bcol_iboffload_collreq_t *coll_request) { int rc = OMPI_SUCCESS, leader_rank = 0; struct mqe_task *last_send = NULL; mca_bcol_iboffload_task_t *send_task = NULL; mca_bcol_iboffload_frag_t *send_fragment = NULL; struct mqe_task **mqe_ptr_to_set; mca_bcol_iboffload_collfrag_t *coll_fragment; coll_fragment = (mca_bcol_iboffload_collfrag_t *) opal_list_get_last(&coll_request->work_requests); mqe_ptr_to_set = &coll_fragment->to_post; if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); goto out_of_resources; } /* post send */ send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, leader_rank, coll_request->qp_index, 0, 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); if(NULL == send_fragment) { IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); goto out_of_resources; } send_task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER, send_fragment, coll_fragment, INLINE); if(NULL == send_task) { IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); goto out_of_resources; } APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); /* end of list */ *mqe_ptr_to_set = NULL; assert(NULL != last_send); last_send->flags |= MQE_WR_FLAG_SIGNAL; coll_fragment->signal_task_wr_id = last_send->wr_id; last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; /* post the mwr */ rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); if(OMPI_SUCCESS != rc) { IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); /* Note: need to clean up */ return rc; } MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); return OMPI_SUCCESS; out_of_resources: /* Release all resources */ IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); }
/* * Open the component */ static int iboffload_open(void) { int rc; /* local variables */ mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n")); (void) mca_bcol_iboffload_verify_params(); cm->super.priority = 100; cm->super.n_net_contexts = 0; cm->super.network_contexts = NULL; OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t); /* construct lists */ OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t); rc = opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10); if (OMPI_SUCCESS != rc) { goto close_device; } /* Check MCA parameters */ if (0 != (mca_bcol_iboffload_component.exchange_tree_order & (mca_bcol_iboffload_component.exchange_tree_order - 1))) { IBOFFLOAD_ERROR(("Warning: bcol_iboffload_exchange_tree_order is %d, which is not a power of 2; setting it to 2", mca_bcol_iboffload_component.exchange_tree_order)); mca_bcol_iboffload_component.exchange_tree_order = 2; } /* Pasha: Since we do not have a max inline check like in openib, I will put a dummy check here. All mlnx devices support at least 512b */ if (mca_bcol_iboffload_component.max_inline_data > 512) { IBOFFLOAD_ERROR(("Warning: the inline size %d is too big and unsupported", mca_bcol_iboffload_component.max_inline_data)); rc = OMPI_ERROR; goto close_device; } /* Register the progress function */ rc = opal_progress_register(mca_bcol_iboffload_component_progress); if (OMPI_SUCCESS != rc) { IBOFFLOAD_ERROR(("Failed to register the progress function" " for iboffload component.\n")); goto close_device; } map_ompi_to_ib_dtype(); map_ompi_to_ib_op_type(); /* init_done is set to true on first component use */ cm->init_done = false; return OMPI_SUCCESS; close_device: OBJ_DESTRUCT(&cm->devices); OBJ_DESTRUCT(&cm->recv_wrs.lock); return rc; }
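/*
 * The exchange_tree_order sanity check above uses the standard bit trick: a
 * positive n is a power of two exactly when n & (n - 1) is zero, since
 * subtracting one flips the single set bit and everything below it. A
 * stand-alone demonstration:
 */
#include <stdio.h>

static int is_power_of_two(int n)
{
    return n > 0 && 0 == (n & (n - 1));
}

int main(void)
{
    int n;

    for (n = 1; n <= 8; n++) {
        printf("%d: %s\n", n, is_power_of_two(n) ? "power of two" : "not");
    }
    return 0;
}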