/**
 * Fan-in progress step for a non-leader ("proxy") rank.
 *
 * Works on the last collective fragment queued on
 * coll_request->work_requests: builds a task list holding a single send
 * addressed to leader_rank (0), marks that task as the signalled one and
 * posts the list.
 *
 * @param iboffload     module whose MQ credits are checked and on which the
 *                      task list is posted
 * @param coll_request  collective request providing the fragment list,
 *                      qp_index and order_info
 *
 * @return OMPI_SUCCESS once the list has been posted; the error code of
 *         mca_bcol_iboffload_post_mqe_tasks() if posting fails; otherwise
 *         the result of mca_bcol_iboffload_free_resources_and_move_to_pending()
 *         when MQ credits, the send fragment or the send task are unavailable.
 */
static int mca_bcol_iboffload_fanin_proxy_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    /* The fragment being progressed is the most recently queued one. */
    mca_bcol_iboffload_collfrag_t *coll_fragment =
        (mca_bcol_iboffload_collfrag_t *)
            opal_list_get_last(&coll_request->work_requests);
    /* Slot that receives the next task appended to the list. */
    struct mqe_task **mqe_ptr_to_set = &coll_fragment->to_post;

    struct mqe_task *last_send = NULL;
    mca_bcol_iboffload_frag_t *send_fragment = NULL;
    mca_bcol_iboffload_task_t *send_task = NULL;
    int leader_rank = 0;
    int rc = OMPI_SUCCESS;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload, coll_fragment->mq_index,
                    coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Build the send towards the leader: fragment first, then its task. */
    send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
            leader_rank, coll_request->qp_index, 0,
            0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
    if (send_fragment == NULL) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n"));
        goto out_of_resources;
    }

    send_task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank,
            MCA_BCOL_IBOFFLOAD_QP_BARRIER, send_fragment,
            coll_fragment, INLINE);
    if (send_task == NULL) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n"));
        goto out_of_resources;
    }

    APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);

    /* Terminate the task list. */
    *mqe_ptr_to_set = NULL;
    assert(NULL != last_send);

    /* The last task carries the signal flag. Its original wr_id is saved in
     * the fragment and replaced by the fragment's own address. */
    last_send->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_send->wr_id;
    last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over for posting. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (rc != OMPI_SUCCESS) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Give back whatever was acquired and queue the fragment as pending. */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload);
}
/**
 * Fan-in progress step for the leader rank (rank 0 of the subgroup).
 *
 * Works on the last collective fragment queued on
 * coll_request->work_requests: for every rank in
 * [leader_rank + 1, sbgp_size) it takes a preposted receive fragment and a
 * wait task and chains the task into the fragment's list. The last wait task
 * is marked as the signalled one and the list is posted.
 *
 * @param iboffload     module whose MQ credits are checked, whose
 *                      ibnet->super.group_size gives the subgroup size, and
 *                      on which the task list is posted
 * @param coll_request  collective request providing the fragment list,
 *                      qp_index and order_info
 *
 * @return OMPI_SUCCESS once the list has been posted; the error code of
 *         mca_bcol_iboffload_post_mqe_tasks() if posting fails; otherwise
 *         the result of mca_bcol_iboffload_free_resources_and_move_to_pending()
 *         when MQ credits, a receive fragment or a wait task are unavailable.
 */
static int mca_bcol_iboffload_fanin_leader_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0, rank,
        sbgp_size = iboffload->ibnet->super.group_size;

    struct mqe_task *last_wait = NULL;

    mca_bcol_iboffload_task_t *wait_task = NULL;
    mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    /* The fragment being progressed is the most recently queued one. */
    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload, coll_fragment->mq_index,
                    coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* One wait task per non-leader rank of the subgroup. */
    for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
        /* post wait */
        preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                iboffload, rank, coll_request->qp_index);
        if (NULL == preposted_recv_frag) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n"));
            goto out_of_resources;
        }

        /* NOTE(review): the literal 1 is presumably the number of
         * completions to wait for - confirm against the helper. */
        wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1,
                preposted_recv_frag, coll_request->qp_index, NULL);
        if (NULL == wait_task) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
    }

    /* end of list */
    *mqe_ptr_to_set = NULL;

    /* The loop above runs zero times when sbgp_size <= leader_rank + 1, which
     * leaves last_wait NULL. Catch that here, as the proxy variant does for
     * last_send, rather than dereferencing a NULL pointer below. */
    assert(NULL != last_wait);

    /* The last wait task carries the signal flag. Its original wr_id is saved
     * in the fragment and replaced by the fragment's own address. */
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload);
}
/**
 * Large-message broadcast step that exchanges the whole buffer with the
 * single peer recursive_doubling_tree->rank_extra_source.
 *
 * NOTE(review): presumably executed on ranks outside the power-of-two set -
 * confirm against the caller.
 *
 * Sequence:
 *  - set up the binomial connections if
 *    connection_status[RECURSIVE_DOUBLING_TREE_BCAST] is not yet set;
 *  - register the SBUF buffer (count * datatype size bytes) and cache its
 *    lkey;
 *  - bail out to the pending path when the MQ has no credits or other
 *    fragments are already pending;
 *  - root: recv-RTR setup followed by a large-buffer send to the peer;
 *    non-root: send-RTR setup followed by a large-buffer receive from it;
 *  - terminate the list, signal the last wait task (or the last send task
 *    when there is no wait task) and post.
 *
 * @param iboffload_module  module providing connections, MQ and peer info
 * @param coll_request      request whose first_collfrag is filled and posted
 *
 * @return BCOL_FN_STARTED after a successful post; OMPI_ERROR when buffer
 *         registration or any task setup fails; the error code of
 *         mca_bcol_iboffload_post_mqe_tasks() when posting fails. On the
 *         out-of-resources path: BCOL_FN_NOT_STARTED if moving the fragment
 *         to pending fails, BCOL_FN_STARTED otherwise.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
    struct mqe_task *last_send = NULL, *last_wait = NULL;
    struct mqe_task *signal_task;

    /* Payload size in bytes. */
    int count = coll_request->count * coll_request->dtype->super.size;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    int rc, dst;

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Register the user buffer and remember its local key. */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf,
            count,
            &coll_request->buffer_info[SBUF].iboffload_reg,
            iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                         coll_request->buffer_info[SBUF].buf, count));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey =
        coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /* MQ consumption is an estimate; also defer if others are queued. */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module,
                    coll_fragment->mq_index, coll_fragment->mq_credits) ||
                false == opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    /* Both roles talk to the same peer. */
    dst = recursive_doubling_tree->rank_extra_source;

    if (coll_request->root == my_group_index) {
        /* Root: push the complete buffer to the peer. */
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));

        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    } else {
        /* Non-root: pull the complete buffer from the peer. */
        rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* Terminate the task list. */
    *coll_fragment->tail_next = NULL;

    /* Complete the message descriptor. */
    coll_request->n_fragments += 1;
    coll_request->n_frags_sent += 1;

    /* Signal on the last wait task if there is one, else on the last send.
     * The task's wr_id is saved in the fragment and replaced by the
     * fragment's own address. */
    signal_task = (NULL != last_wait) ? last_wait : last_send;
    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over for posting. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Give back whatever was acquired and queue the fragment as pending. */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
/**
 * Large-message broadcast: builds and posts the task list for one rank.
 *
 * Sequence:
 *  - set up the binomial connections if
 *    connection_status[RECURSIVE_DOUBLING_TREE_BCAST] is not yet set;
 *  - register the SBUF buffer (count * datatype size bytes) and cache its
 *    lkey;
 *  - bail out to the pending path when the MQ has no credits or other
 *    fragments are already pending;
 *  - root: if n_extra_sources > 0, send the whole buffer to
 *    rank_extra_source; then BINOMIAL_SCATTER and jump to the gather phase;
 *  - non-root: ask bcol_iboffload_binomial_root_to_src() for the source rank
 *    and distance, receive either the whole buffer (negative distance, the
 *    "virtual root" case) or this rank's block range, then BINOMIAL_SCATTER;
 *  - gather phase via bcol_iboffload_bcast_binomial_gather();
 *  - ranks whose power_of_2_distance differs from the module's power_of_2
 *    (i.e. neither root nor virtual root) forward the whole buffer to
 *    rank_extra_source when n_extra_sources > 0;
 *  - terminate the list, signal the last wait task (or the last send task
 *    when there is no wait task) and post.
 *
 * @param iboffload_module  module providing connections, MQ and tree info
 * @param coll_request      request whose first_collfrag is filled and posted
 *
 * @return BCOL_FN_STARTED after a successful post; OMPI_ERROR when buffer
 *         registration or an RTR/large-buffer setup fails; the error code of
 *         bcol_iboffload_bcast_binomial_gather() or
 *         mca_bcol_iboffload_post_mqe_tasks() when those fail. On the
 *         out-of-resources path: BCOL_FN_NOT_STARTED if moving the fragment
 *         to pending fails, BCOL_FN_STARTED otherwise.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        dst,
        group_src, power_of_2_distance,
        recv_count;
    size_t offset;
    /* Payload size in bytes. */
    int count = coll_request->count * coll_request->dtype->super.size;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    /* Per-rank block size, rounded up so power_of_2_ranks blocks cover count. */
    size_t base_block_size = (count + iboffload_module->power_of_2_ranks - 1) /
        iboffload_module->power_of_2_ranks;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* register memory in mpool/rcache */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf,
            count,
            &coll_request->buffer_info[SBUF].iboffload_reg,
            iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                         coll_request->buffer_info[SBUF].buf, count));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey =
        coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module,
                    coll_fragment->mq_index, coll_fragment->mq_credits) ||
                false == opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d %d",
                    iboffload_module->power_of_2, recursive_doubling_tree->n_extra_sources ));
        /* for proxy we have little bit more work to do */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            rc = mca_bcol_iboffload_recv_rtr_setup(
                    &last_wait, dst, iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_recv_rtr_setup"));
                return OMPI_ERROR;
            }
            rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, SBUF, 0, count, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_large_buff_setup"));
                return OMPI_ERROR;
            }
        }
        power_of_2_distance = iboffload_module->power_of_2;

        BINOMIAL_SCATTER(iboffload_module, coll_fragment,
                last_wait, last_send, power_of_2_distance - 1,
                my_group_index, base_block_size, count
                );
        /* EXIT OR GO TO Gather */
        goto GATHER;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);

    IBOFFLOAD_VERBOSE(10, ("SRC %d DIST %d ranks %d gsize %d root %d my rank %d",
                group_src, power_of_2_distance, iboffload_module->power_of_2_ranks,
                iboffload_module->group_size,
                coll_request->root, my_group_index));
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        power_of_2_distance = iboffload_module->power_of_2;
        offset = 0;
        recv_count = count;
        IBOFFLOAD_VERBOSE(10, ("Virtual root %d , set mask to %d",
                    my_group_index, power_of_2_distance));
    } else {
        int my_left_boundary_rank;
        int delta;

        recv_count = base_block_size * (1 << power_of_2_distance);
        /* we may receive larger data */
        /* Clear the low power_of_2_distance bits of this rank. The mask is
         * built from a positive value because left-shifting the negative
         * value ~(int)0 is undefined behaviour in C. */
        my_left_boundary_rank = my_group_index &
            ~((1 << power_of_2_distance) - 1);
        offset = (size_t) (base_block_size * my_left_boundary_rank);
        delta = count - offset;
        if (OPAL_UNLIKELY(delta <= 0)) {
            /* no data to recv */
            goto GATHER;
        } else {
            /* Clamp to what is left of the buffer past offset. */
            recv_count = (delta < recv_count) ? delta : recv_count;
        }
        IBOFFLOAD_VERBOSE(10, ("Recv data set mask to %d",
                    power_of_2_distance));
    }

    /* One conversion per argument; offset is size_t and is printed through
     * an explicit unsigned long cast. */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d, count %d, offset %lu",
                group_src, recv_count, (unsigned long) offset));

    /* Receive data to user buffer */
    rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
            group_src, iboffload_module,
            coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
        return OMPI_ERROR;
    }

    rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
            SBUF, offset, recv_count,
            group_src, iboffload_module,
            coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        return OMPI_ERROR;
    }

    BINOMIAL_SCATTER(iboffload_module, coll_fragment,
            last_wait, last_send, power_of_2_distance - 1,
            my_group_index, base_block_size, count);

GATHER:
    rc = bcol_iboffload_bcast_binomial_gather(iboffload_module,
            &last_send, &last_wait, coll_fragment,
            count, base_block_size, power_of_2_distance);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup gather. Return %d", rc));
        return rc;
    }

    /* Root and virtual root have power_of_2_distance == power_of_2 and are
     * skipped here; every other rank forwards the buffer to its extra peer. */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 != power_of_2_distance) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    coll_request->n_fragments += 1;
    coll_request->n_frags_sent += 1;

    /* Signal on the last wait task if there is one, else on the last send.
     * The task's wr_id is saved in the fragment and replaced by the
     * fragment's own address. */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
/**
 * Small-message broadcast step that exchanges the packed payload with the
 * single peer recursive_doubling_tree->rank_extra_source.
 *
 * NOTE(review): presumably executed on ranks outside the power-of-two set -
 * confirm against the caller.
 *
 * Sequence:
 *  - set up the binomial connections if
 *    connection_status[RECURSIVE_DOUBLING_TREE_BCAST] is not yet set;
 *  - compute pack_len (count * datatype size) and take the SBUF lkey from
 *    the module's rdma_block;
 *  - bail out to the pending path when the MQ has no credits;
 *  - root: small-buffer send to the peer; non-root: small-buffer receive
 *    from it;
 *  - terminate the list, set n_fragments / n_frags_sent to 1, signal the
 *    last wait task (or the last send task when there is no wait task) and
 *    post.
 *
 * NOTE(review): a failed send setup goes to the pending path, whereas a
 * failed receive setup returns OMPI_ERROR directly - confirm this asymmetry
 * is intended.
 *
 * @param iboffload_module  module providing connections, MQ and peer info
 * @param coll_request      request whose first_collfrag is filled and posted
 *
 * @return BCOL_FN_STARTED after a successful post; OMPI_ERROR when the
 *         receive setup fails; the error code of
 *         mca_bcol_iboffload_post_mqe_tasks() when posting fails. On the
 *         out-of-resources path: BCOL_FN_NOT_STARTED if moving the fragment
 *         to pending fails, BCOL_FN_STARTED otherwise.
 */
static int mca_bcol_iboffload_small_msg_bcast_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
    struct mqe_task *last_send = NULL, *last_wait = NULL;
    struct mqe_task *signal_task;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    uint32_t pack_len;
    int rc, dst;

    IBOFFLOAD_VERBOSE(10,("Entering small msg extra iboffload bcast"));

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Payload size in bytes; the local key comes from the module's RDMA block. */
    pack_len = coll_request->count * coll_request->dtype->super.size;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                pack_len,
                coll_request->count,
                coll_request->dtype->super.size));

    /* MQ consumption is an estimate. */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module,
                    coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    /* Both roles talk to the same peer. */
    dst = recursive_doubling_tree->rank_extra_source;

    if (coll_request->root == my_group_index) {
        /* Root: push the payload to the peer. */
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));
        IBOFFLOAD_VERBOSE(10,("Im extra root sending data to %d \n",dst));

        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    } else {
        /* Non-root: pull the payload from the peer. */
        rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            return OMPI_ERROR;
        }
    }

    /* Terminate the task list. */
    *coll_fragment->tail_next = NULL;

    /* Complete the message descriptor: exactly one fragment. */
    coll_request->n_fragments = 1;
    coll_request->n_frags_sent = 1;

    /* Signal on the last wait task if there is one, else on the last send.
     * The task's wr_id is saved in the fragment and replaced by the
     * fragment's own address. */
    signal_task = (NULL != last_wait) ? last_wait : last_send;
    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over for posting. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Give back whatever was acquired and queue the fragment as pending. */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
/**
 * Small-message broadcast: builds and posts the task list for one rank.
 *
 * Sequence:
 *  - set up the binomial connections if
 *    connection_status[RECURSIVE_DOUBLING_TREE_BCAST] is not yet set;
 *  - compute pack_len (count * datatype size);
 *  - bail out to the pending path when the MQ has no credits;
 *  - take the SBUF lkey from the module's rdma_block;
 *  - root: if n_extra_sources > 0, send the payload to rank_extra_source;
 *    then binomial_scatter_smsg() starting at distance power_of_2 - 1;
 *  - non-root: ask bcol_iboffload_binomial_root_to_src() for the source rank
 *    and distance (a negative distance marks the "virtual root", which
 *    scatters from power_of_2 - 1), receive the payload from that source,
 *    run binomial_scatter_smsg(), and - unless this rank is the virtual
 *    root - forward the payload to rank_extra_source when
 *    n_extra_sources > 0;
 *  - terminate the list, signal the last wait task (or the last send task
 *    when there is no wait task) and post.
 *
 * @param iboffload_module  module providing connections, MQ and tree info
 * @param coll_request      request whose first_collfrag is filled and posted
 *
 * @return BCOL_FN_STARTED after a successful post; the error code of
 *         mca_bcol_iboffload_post_mqe_tasks() when posting fails. When
 *         credits are missing or any task setup fails the fragment is moved
 *         to pending: BCOL_FN_NOT_STARTED if that move fails,
 *         BCOL_FN_STARTED otherwise.
 */
static int mca_bcol_iboffload_small_msg_bcast_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        distance_mask_pow , dst,
        group_src, power_of_2_distance;

    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast"));

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Payload size in bytes. */
    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                pack_len,
                coll_request->count,
                coll_request->dtype->super.size));

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload_module,
                    coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data"));

        /* Send data to the extra peer */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            IBOFFLOAD_VERBOSE(10,("Sending the dat to Dst %d",dst));
            rc = mca_bcol_iboffload_send_small_buff_setup(
                    &last_send, pack_len, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                /* Message names the helper that was actually called. */
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_small_buff_setup"));
                goto out_of_resources;
            }
        }

        distance_mask_pow = iboffload_module->power_of_2 - 1;
        rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
                &last_send, distance_mask_pow,
                my_group_index, pack_len);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
            goto out_of_resources;
        }

        goto finalize;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        IBOFFLOAD_VERBOSE(10,("Virtual root distance_mask_pow %d ",iboffload_module->power_of_2));
        distance_mask_pow = iboffload_module->power_of_2 - 1;
    } else {
        distance_mask_pow = power_of_2_distance - 1;
    }

    /* One conversion per argument: the earlier format string asked for four
     * values while only group_src was supplied. */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d, pack_len %u",
                group_src, (unsigned int) pack_len));

    rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
            pack_len, group_src,
            iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        goto out_of_resources;
    }

    rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
            &last_send, distance_mask_pow,
            my_group_index, pack_len);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
        goto out_of_resources;
    }

    /* The virtual root has distance_mask_pow == power_of_2 - 1 and is
     * skipped here; every other rank forwards the payload to its extra peer. */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 - 1 != distance_mask_pow) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    }

finalize:
    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments += 1;
    (coll_request)->n_frags_sent += 1;

    /* Signal on the last wait task if there is one, else on the last send.
     * The task's wr_id is saved in the fragment and replaced by the
     * fragment's own address. */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc = mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment,
            iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}