static int mca_btl_ugni_component_progress (void) { mca_btl_ugni_module_t *ugni_module; static int64_t call_count = 0; int64_t cur_call_count = OPAL_THREAD_ADD64(&call_count, 1); unsigned int i; int count = 0; for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) { ugni_module = mca_btl_ugni_component.modules + i; if ((cur_call_count & 0x7) == 0) { count += mca_btl_ugni_progress_datagram (ugni_module); } if (ugni_module->connected_peer_count) { mca_btl_ugni_progress_wait_list (ugni_module); count += mca_btl_ugni_progress_local_smsg (ugni_module); count += mca_btl_ugni_progress_remote_smsg (ugni_module); } if (ugni_module->active_rdma_count) { count += mca_btl_ugni_progress_rdma (ugni_module, 0); } if (mca_btl_ugni_component.progress_thread_enabled) { count += mca_btl_ugni_progress_rdma (ugni_module, 1); } /* post pending after progressing rdma */ mca_btl_ugni_post_pending (ugni_module); } return count; }
/*
 * Set up and launch a barrier on the given ML module.
 *
 * ml_module  module whose buffers, descriptor free list, barrier schedule
 *            and sequence counter are used
 * req        out: set to the request embedded in the collective
 *            descriptor (full_message.super), marked active
 *
 * Spins on opal_progress() until an ML buffer can be allocated, then
 * waits on the descriptor free list.  Returns whatever
 * mca_coll_ml_generic_collectives_launcher() returns.
 */
static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module,
                                      ompi_request_t **req)
{
    int rc;
    ompi_free_list_item_t *item;
    mca_coll_ml_collective_operation_progress_t *coll_op;
    ml_payload_buffer_desc_t *signal_buf;
    ompi_request_t *barrier_req;

    /* an ML buffer is needed for signaling; keep progressing until one
     * becomes available */
    while (NULL == (signal_buf = mca_coll_ml_alloc_buffer(ml_module))) {
        opal_progress();
    }

    /* blocking wait for a collective descriptor
     * (could possibly be made non-blocking) */
    OMPI_FREE_LIST_WAIT(&(ml_module->coll_ml_collective_descriptors), item, rc);
    coll_op = (mca_coll_ml_collective_operation_progress_t *) item;
    assert(NULL != coll_op);

    ML_VERBOSE(10, ("Get coll request %p", coll_op));

    MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL,
                               ml_module->coll_ml_barrier_function);

    coll_op->fragment_data.buffer_desc = signal_buf;
    coll_op->dag_description.num_tasks_completed = 0;
    coll_op->variable_fn_params.buffer_index = signal_buf->buffer_index;
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1);

    /* no collective finalize function for the barrier */
    coll_op->process_fn = NULL;

    /* hand the embedded request back to the caller and mark it active */
    barrier_req = &coll_op->full_message.super;
    *req = barrier_req;
    OMPI_REQUEST_INIT(barrier_req, false);
    barrier_req->req_status._cancelled = 0;
    barrier_req->req_state = OMPI_REQUEST_ACTIVE;
    barrier_req->req_status.MPI_ERROR = OMPI_SUCCESS;

    /* record ordering information in case a bcol requires ordering */
    MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1);

    return mca_coll_ml_generic_collectives_launcher(coll_op,
                                                    mca_coll_ml_barrier_task_setup);
}
int ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl, struct opal_convertor_t *convertor, struct ompi_message_t **message, struct mca_mtl_request_t *mtl_request) { ompi_mtl_portals4_recv_request_t *ptl_request = (ompi_mtl_portals4_recv_request_t*) mtl_request; void *start; size_t length; bool free_after; int ret; ompi_mtl_portals4_message_t *ptl_message = (ompi_mtl_portals4_message_t*) (*message)->req_ptr; ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } #if OPAL_ENABLE_DEBUG ptl_request->opcount = OPAL_THREAD_ADD64((int64_t*) &ompi_mtl_portals4.recv_opcount, 1); ptl_request->hdr_data = 0; #endif ptl_request->super.type = portals4_req_recv; ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress; ptl_request->buffer_ptr = (free_after) ? start : NULL; ptl_request->convertor = convertor; ptl_request->delivery_ptr = start; ptl_request->delivery_len = length; ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; ptl_request->pending_reply = 0; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Mrecv %lu of length %ld (0x%lx)\n", ptl_request->opcount, (int64_t)length, (unsigned long) ptl_request)); (*message) = MPI_MESSAGE_NULL; return ompi_mtl_portals4_recv_progress(&(ptl_message->ev), &ptl_request->super); }
/*
 * Post a non-blocking receive by appending a match entry to the
 * priority list of the receive portal table index.
 *
 * mtl          MTL module (used for the endpoint lookup)
 * comm         communicator the receive is posted on
 * src          source rank, or MPI_ANY_SOURCE
 * tag          tag passed to MTL_PORTALS4_SET_RECV_BITS
 * convertor    describes the receive buffer
 * mtl_request  receive request to fill in; also used as the ME user pointer
 *
 * Returns OMPI_SUCCESS, the error from ompi_mtl_datatype_recv_buf(), or
 * ompi_mtl_portals4_get_error(ret) when PtlMEAppend fails (in which case
 * a non-NULL buffer_ptr is freed first).
 *
 * For lengths above short_limit the function spins in
 * ompi_mtl_portals4_progress() until req_started becomes true.
 */
int ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
                            struct ompi_communicator_t *comm,
                            int src,
                            int tag,
                            struct opal_convertor_t *convertor,
                            mca_mtl_request_t *mtl_request)
{
    ompi_mtl_portals4_recv_request_t *recv_req =
        (ompi_mtl_portals4_recv_request_t*) mtl_request;
    ptl_match_bits_t match_bits, ignore_bits;
    ptl_process_t remote_proc;
    ptl_me_t me;
    void *buf;
    size_t nbytes;
    bool owns_buf;
    int rc = OMPI_SUCCESS;

    /* work out which process id the match entry should accept */
    if (MPI_ANY_SOURCE != src) {
        if ((ompi_mtl_portals4.use_logical) && (MPI_COMM_WORLD == comm)) {
            remote_proc.rank = src;
        } else {
            ompi_proc_t* peer = ompi_comm_peer_lookup( comm, src );
            remote_proc = *((ptl_process_t*) ompi_mtl_portals4_get_endpoint (mtl, peer));
        }
    } else if (ompi_mtl_portals4.use_logical) {
        remote_proc.rank = PTL_RANK_ANY;
    } else {
        remote_proc.phys.nid = PTL_NID_ANY;
        remote_proc.phys.pid = PTL_PID_ANY;
    }

    MTL_PORTALS4_SET_RECV_BITS(match_bits, ignore_bits, comm->c_contextid, src, tag);

    rc = ompi_mtl_datatype_recv_buf(convertor, &buf, &nbytes, &owns_buf);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    recv_req->super.type = portals4_req_recv;
    recv_req->super.event_callback = ompi_mtl_portals4_recv_progress;
#if OPAL_ENABLE_DEBUG
    recv_req->opcount = OPAL_THREAD_ADD64((int64_t*) &ompi_mtl_portals4.recv_opcount, 1);
    recv_req->hdr_data = 0;
#endif

    /* buffer_ptr is only set when recv_buf reported free_after */
    if (owns_buf) {
        recv_req->buffer_ptr = buf;
    } else {
        recv_req->buffer_ptr = NULL;
    }
    recv_req->convertor = convertor;
    recv_req->delivery_ptr = buf;
    recv_req->delivery_len = nbytes;
    recv_req->req_started = false;
    recv_req->pending_reply = 0;
    recv_req->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",
                         recv_req->opcount,
                         remote_proc.phys.nid, remote_proc.phys.pid,
                         (int64_t)nbytes, match_bits, ignore_bits,
                         (unsigned long) recv_req));

    /* describe the match entry */
    me.start = buf;
    me.length = nbytes;
    me.match_id = remote_proc;
    me.match_bits = match_bits;
    me.ignore_bits = ignore_bits;
    me.ct_handle = PTL_CT_NONE;
    me.min_free = 0;
    me.uid = ompi_mtl_portals4.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE | PTL_ME_EVENT_UNLINK_DISABLE;
    if (nbytes <= ompi_mtl_portals4.short_limit) {
        /* link events are suppressed for receives within short_limit */
        me.options |= PTL_ME_EVENT_LINK_DISABLE;
    }

    rc = PtlMEAppend(ompi_mtl_portals4.ni_h,
                     ompi_mtl_portals4.recv_idx,
                     &me,
                     PTL_PRIORITY_LIST,
                     recv_req,
                     &recv_req->me_h);
    if (OPAL_UNLIKELY(PTL_OK != rc)) {
        if (NULL != recv_req->buffer_ptr) {
            free(recv_req->buffer_ptr);
        }
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d",
                            __FILE__, __LINE__, rc);
        return ompi_mtl_portals4_get_error(rc);
    }

    /* Long message: keep progressing until a comm event or a link event
     * has been seen, which guarantees progress for long unexpected
     * messages. */
    if (nbytes > ompi_mtl_portals4.short_limit) {
        while (true != recv_req->req_started) {
            ompi_mtl_portals4_progress();
        }
    }

    return OMPI_SUCCESS;
}
/*
 * Prepare a destination descriptor for an RDMA operation directly on the
 * user's buffer.
 *
 * btl_base      must be &mca_btl_portals_module (asserted)
 * peer          endpoint, dereferenced and passed to PtlMEAttach
 * registration  not referenced in this function
 * convertor     supplies the base buffer, datatype and converted offset
 * reserve       not referenced in this function
 * size          in: length recorded in the segment
 *
 * One slot in portals_outstanding_ops is reserved up front (spinning in
 * component progress while the limit is exceeded) and given back on
 * every failure path.  A match entry keyed by a freshly incremented
 * portals_rdma_key is attached, followed by a memory descriptor covering
 * the segment.
 *
 * Returns the fragment's base descriptor, or NULL on failure.
 */
mca_btl_base_descriptor_t* mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base,
                                                       struct mca_btl_base_endpoint_t* peer,
                                                       mca_mpool_base_registration_t* registration,
                                                       struct ompi_convertor_t* convertor,
                                                       size_t reserve,
                                                       size_t* size)
{
    mca_btl_portals_frag_t* frag;
    ptl_md_t md;
    ptl_handle_me_t me_h;
    ptrdiff_t lower_bound;
    int ret;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    /* claim event-queue space for the rdma operation right away; if the
     * limit is exceeded, undo the claim, progress, and retry */
    for (;;) {
        if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) <=
            mca_btl_portals_module.portals_max_outstanding_ops) {
            break;
        }
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
        mca_btl_portals_component_progress();
    }

    OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
    if (NULL == frag) {
        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
        return NULL;
    }

    ompi_ddt_type_lb(convertor->pDesc, &lower_bound);

    /* single destination segment pointing into the user's buffer */
    frag->segments[0].seg_len = *size;
    frag->segments[0].seg_addr.pval = convertor->pBaseBuf + lower_bound + convertor->bConverted;
    frag->segments[0].seg_key.key64 =
        OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);

    frag->base.des_dst = frag->segments;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;
    frag->base.des_flags = 0;

    /* NOTE(review): pointers are printed with 0x%x below; left untouched
     * here, but %p looks like the intended conversion -- confirm */
    OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
                         "rdma dest posted for frag 0x%x, callback 0x%x, bits %lld",
                         frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));

    /* match entry keyed on the segment key */
    ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
                      OMPI_BTL_PORTALS_RDMA_TABLE_ID,
                      *((mca_btl_base_endpoint_t*) peer),
                      frag->segments[0].seg_key.key64, /* match */
                      0, /* ignore */
                      PTL_UNLINK,
                      PTL_INS_AFTER,
                      &me_h);
    if (PTL_OK != ret) {
        opal_output(mca_btl_portals_component.portals_output,
                    "Error creating rdma dest ME: %d", ret);
        goto release;
    }

    /* memory descriptor covering the segment */
    md.start = frag->segments[0].seg_addr.pval;
    md.length = frag->segments[0].seg_len;
    md.threshold = PTL_MD_THRESH_INF;
    md.max_size = 0;
    md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
    md.user_ptr = frag; /* lets the event handler find the fragment */
    md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];

    ret = PtlMDAttach(me_h, md, PTL_UNLINK, &(frag->md_h));
    if (PTL_OK != ret) {
        opal_output(mca_btl_portals_component.portals_output,
                    "Error creating rdma dest MD: %d", ret);
        PtlMEUnlink(me_h);
        goto release;
    }

    return &frag->base;

release:
    /* shared failure exit: give back the reserved slot, then the fragment */
    OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
    OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
    return NULL;
}
/*
 * Prepare a source descriptor.
 *
 * btl_base      must be &mca_btl_portals_module (asserted)
 * peer          endpoint, dereferenced and passed to PtlMEAttach
 * registration  not referenced in this function
 * convertor     source data description
 * reserve       bytes to leave free at the front of the fragment
 * size          in: requested bytes; out (packing path only): bytes packed
 *
 * Two paths:
 *  - packing path (reserve != 0 or the convertor needs buffers): a
 *    fragment is obtained from mca_btl_portals_alloc() and the data is
 *    packed into it after the reserved area;
 *  - rdma path: a user fragment is set up to point at the address the
 *    convertor reports, a slot in portals_outstanding_ops is reserved,
 *    and a match entry plus memory descriptor are attached.
 *
 * Returns the fragment's base descriptor, or NULL on failure.
 */
mca_btl_base_descriptor_t* mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base,
                                                       struct mca_btl_base_endpoint_t* peer,
                                                       mca_mpool_base_registration_t* registration,
                                                       struct ompi_convertor_t* convertor,
                                                       size_t reserve,
                                                       size_t* size)
{
    mca_btl_portals_frag_t* frag;
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = *size;
    int must_pack;
    int ret;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);

    must_pack = (0 != reserve) || (0 != ompi_convertor_need_buffers(convertor));

    if (must_pack) {
        frag = (mca_btl_portals_frag_t*)
            mca_btl_portals_alloc(btl_base, max_data + reserve);
        if (NULL == frag) {
            return NULL;
        }

        /* clamp the payload to what fits behind the reserved area */
        if (max_data + reserve > frag->size) {
            max_data = frag->size - reserve;
        }

        iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve;
        iov.iov_len = max_data;

        ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
        *size = max_data;
        if ( ret < 0 ) {
            /* NOTE(review): frag is not released on this path -- looks
             * like a fragment leak; confirm and release it with the
             * counterpart of mca_btl_portals_alloc() */
            return NULL;
        }

        frag->segments[0].seg_len = max_data + reserve;
        frag->base.des_src_cnt = 1;
    } else {
        /* nothing to pack: rdma straight out of the user's buffer */
        ptl_md_t md;
        ptl_handle_me_t me_h;

        /* claim event-queue space for the rdma operation right away; if
         * the limit is exceeded, undo the claim, progress, and retry */
        for (;;) {
            if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) <=
                mca_btl_portals_module.portals_max_outstanding_ops) {
                break;
            }
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            mca_btl_portals_component_progress();
        }

        OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret);
        if (NULL == frag) {
            OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
            return NULL;
        }

        /* a NULL iov_base is handed to the convertor and the address it
         * leaves in iov_base becomes the segment address.
         * NOTE(review): the return value of the pack call is ignored here */
        iov.iov_base = NULL;
        iov.iov_len = max_data;
        ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );

        frag->segments[0].seg_len = max_data;
        frag->segments[0].seg_addr.pval = iov.iov_base;
        frag->segments[0].seg_key.key64 =
            OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1);
        frag->base.des_src_cnt = 1;

        /* could end up as either a put or a get; decided later */
        OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
                             "rdma src posted for frag 0x%x, callback 0x%x, bits %lld",
                             frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64));

        /* match entry keyed on the segment key */
        ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h,
                          OMPI_BTL_PORTALS_RDMA_TABLE_ID,
                          *((mca_btl_base_endpoint_t*) peer),
                          frag->segments[0].seg_key.key64, /* match */
                          0, /* ignore */
                          PTL_UNLINK,
                          PTL_INS_AFTER,
                          &me_h);
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src ME: %d", ret);
            goto release_user_frag;
        }

        /* memory descriptor covering the segment */
        md.start = frag->segments[0].seg_addr.pval;
        md.length = frag->segments[0].seg_len;
        md.threshold = PTL_MD_THRESH_INF;
        md.max_size = 0;
        md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE;
        md.user_ptr = frag; /* lets the event handler find the fragment */
        md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];

        ret = PtlMDAttach(me_h, md, PTL_UNLINK, &(frag->md_h));
        if (PTL_OK != ret) {
            opal_output(mca_btl_portals_component.portals_output,
                        "Error creating rdma src MD: %d", ret);
            PtlMEUnlink(me_h);
            goto release_user_frag;
        }
    }

    /* common descriptor fields for both paths */
    frag->base.des_src = frag->segments;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;
    frag->base.des_flags = 0;

    return &frag->base;

release_user_frag:
    /* shared rdma-path failure exit: give back the fragment, then the
     * reserved slot */
    OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag);
    OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
    return NULL;
}
/*
 * Register a memory region and hand back a registration handle.
 *
 * btl_base  Portals4 BTL module (cast to mca_btl_portals4_module_t)
 * endpoint  peer whose process id is used as the match id
 * base      start of the region
 * size      length of the region in bytes
 * flags     when equal to MCA_BTL_FLAGS_PUT a match entry is appended;
 *           for any other value only the handle is allocated and keyed
 *
 * The handle is heap-allocated here and its key is taken from the
 * module's portals_rdma_key counter.  On success the caller owns the
 * returned handle.
 *
 * Returns the handle, or NULL if allocation or PtlMEAppend fails.  On
 * PtlMEAppend failure the handle is freed before returning, since the
 * caller has no other way to reach it.
 */
mca_btl_base_registration_handle_t *
mca_btl_portals4_register_mem(mca_btl_base_module_t *btl_base,
                              mca_btl_base_endpoint_t *endpoint,
                              void *base,
                              size_t size,
                              uint32_t flags)
{
    struct mca_btl_portals4_module_t *portals4_btl =
        (struct mca_btl_portals4_module_t*) btl_base;
    mca_btl_base_registration_handle_t *handle = NULL;
    ptl_me_t me;
    int ret;

    handle = malloc(sizeof(*handle));
    if (!handle) {
        return NULL;
    }

    handle->key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);

    OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
                         "mca_btl_portals4_register_mem NI=%d base=%p size=%ld handle=%p key=%ld\n",
                         portals4_btl->interface_num, base, size, (void *)handle, handle->key));

    if (MCA_BTL_FLAGS_PUT == flags) {
        /* create a match entry */
        me.start = base;
        me.length = size;
        me.ct_handle = PTL_CT_NONE;
        me.min_free = 0;
        me.uid = PTL_UID_ANY;
        me.options = PTL_ME_OP_GET |
                     PTL_ME_USE_ONCE |
                     PTL_ME_EVENT_LINK_DISABLE |
                     PTL_ME_EVENT_COMM_DISABLE |
                     PTL_ME_EVENT_UNLINK_DISABLE;

        if (mca_btl_portals4_component.use_logical) {
            me.match_id.rank = endpoint->ptl_proc.rank;
        } else {
            me.match_id.phys.nid = endpoint->ptl_proc.phys.nid;
            me.match_id.phys.pid = endpoint->ptl_proc.phys.pid;
        }

        /* match on the full key: no bits are ignored.  (The original
         * code first stored a protocol/context/source mask here and then
         * immediately overwrote it with 0; only the effective value is
         * kept.) */
        me.match_bits = handle->key;
        me.ignore_bits = 0;

        ret = PtlMEAppend(portals4_btl->portals_ni_h,
                          portals4_btl->recv_idx,
                          &me,
                          PTL_PRIORITY_LIST,
                          handle,
                          &(handle->me_h));
        if (PTL_OK != ret) {
            opal_output_verbose(1, opal_btl_base_framework.framework_output,
                                "%s:%d: PtlMEAppend failed: %d\n",
                                __FILE__, __LINE__, ret);
            /* NOTE(review): no matching increment of
             * portals_outstanding_ops is visible in this function; the
             * decrement is kept as-is -- confirm against the callers */
            OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1);
            /* the handle was never published, so release it here */
            free(handle);
            return NULL;
        }

        OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
                             "PtlMEAppend (mca_btl_portals4_register_mem) handle=%p, me_h=%d start=%p length=%ld rank=%x nid=%x pid=%x match_bits=%lx\n",
                             (void *)handle, handle->me_h, me.start, me.length,
                             me.match_id.rank, me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits));
    }

    return handle;
}
/*
 * Common start-up path for a Portals4 send.
 *
 * mtl          MTL module (used for the endpoint lookup)
 * comm         communicator of the send
 * dest         destination rank
 * tag          message tag
 * convertor    describes the data to send
 * mode         PML send mode
 * ptl_request  send request to fill in
 *
 * Resolves the destination process id, packs the data, and then sends
 * with the short protocol when the length is within eager_limit and the
 * long protocol otherwise.
 *
 * When built with OMPI_MTL_PORTALS4_FLOW_CONTROL the send is first
 * recorded in a pending-request item.  If no send slot is left (the
 * decremented slot counter is negative), if sends are already queued, or
 * if flow control is active, the slot is given back, the item is
 * appended to pending_sends and OMPI_SUCCESS is returned without
 * starting the send here.
 *
 * Returns the pack error, OMPI_ERR_OUT_OF_RESOURCE when no pending item
 * is available, OMPI_SUCCESS when the send was queued, or the result of
 * the short/long isend helper.
 */
static inline int ompi_mtl_portals4_send_start(struct mca_mtl_base_module_t* mtl,
                                               struct ompi_communicator_t* comm,
                                               int dest,
                                               int tag,
                                               struct opal_convertor_t *convertor,
                                               mca_pml_base_send_mode_t mode,
                                               ompi_mtl_portals4_isend_request_t* ptl_request)
{
    int rc = OMPI_SUCCESS;
    void *payload;
    size_t payload_len;
    bool must_free;
    ptl_process_t target;
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    opal_free_list_item_t *fl_item;
    ompi_mtl_portals4_pending_request_t *queued;
#endif

    /* resolve the destination process id */
    if ((ompi_mtl_portals4.use_logical) && (MPI_COMM_WORLD == comm)) {
        target.rank = dest;
    } else {
        ompi_proc_t *peer = ompi_comm_peer_lookup(comm, dest);
        target = *((ptl_process_t*) ompi_mtl_portals4_get_endpoint (mtl, peer));
    }

    rc = ompi_mtl_datatype_pack(convertor, &payload, &payload_len, &must_free);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    ptl_request->opcount = OPAL_THREAD_ADD64((int64_t*)&ompi_mtl_portals4.opcount, 1);
    ptl_request->event_count = 0;
    /* buffer_ptr is only set when the pack helper reported free_after */
    if (must_free) {
        ptl_request->buffer_ptr = payload;
    } else {
        ptl_request->buffer_ptr = NULL;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Send %lu to %x,%x of length %ld\n",
                         ptl_request->opcount,
                         target.phys.nid,
                         target.phys.pid,
                         (int64_t)payload_len));

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    fl_item = opal_free_list_get (&ompi_mtl_portals4.flowctl.pending_fl);
    if (NULL == fl_item) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* record everything needed to start this send later */
    queued = (ompi_mtl_portals4_pending_request_t*) fl_item;
    ptl_request->pending = queued;
    queued->ptl_request = ptl_request;
    queued->ptl_proc = target;
    queued->mode = mode;
    queued->start = payload;
    queued->length = payload_len;
    queued->contextid = comm->c_contextid;
    queued->tag = tag;
    queued->my_rank = comm->c_my_rank;
    queued->fc_notified = 0;

    if (OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, -1) < 0)) {
        /* no slot left: undo the claim and queue the send */
        OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
        opal_list_append(&ompi_mtl_portals4.flowctl.pending_sends,
                         &queued->super.super);
        return OMPI_SUCCESS;
    } else if (OPAL_UNLIKELY(0 != opal_list_get_size(&ompi_mtl_portals4.flowctl.pending_sends))) {
        /* other sends are already queued: go behind them, then try to
         * drain the queue */
        OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
        opal_list_append(&ompi_mtl_portals4.flowctl.pending_sends,
                         &queued->super.super);
        ompi_mtl_portals4_pending_list_progress();
        return OMPI_SUCCESS;
    } else if (OPAL_UNLIKELY(ompi_mtl_portals4.flowctl.flowctl_active)) {
        /* flow control is active: queue the send */
        OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
        opal_list_append(&ompi_mtl_portals4.flowctl.pending_sends,
                         &queued->super.super);
        return OMPI_SUCCESS;
    }
#endif

    if (payload_len > ompi_mtl_portals4.eager_limit) {
        rc = ompi_mtl_portals4_long_isend(payload,
                                          payload_len,
                                          comm->c_contextid,
                                          tag,
                                          comm->c_my_rank,
                                          target,
                                          ptl_request);
    } else {
        rc = ompi_mtl_portals4_short_isend(mode,
                                           payload,
                                           payload_len,
                                           comm->c_contextid,
                                           tag,
                                           comm->c_my_rank,
                                           target,
                                           ptl_request);
    }

    return rc;
}