int ompi_osc_pt2pt_passive_unlock(ompi_osc_pt2pt_module_t *module, int32_t origin, int32_t count) { ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, origin ); ompi_osc_pt2pt_pending_lock_t *new_pending = NULL; assert(module->p2p_lock_status != 0); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: received unlock request from %d with %d requests\n", ompi_comm_rank(module->p2p_comm), origin, count)); new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); new_pending->proc = proc; new_pending->lock_type = 0; OPAL_THREAD_LOCK(&(module->p2p_lock)); module->p2p_num_pending_in += count; opal_list_append(&module->p2p_unlocks_pending, &(new_pending->super)); OPAL_THREAD_UNLOCK(&(module->p2p_lock)); return ompi_osc_pt2pt_passive_unlock_complete(module); }
int ompi_osc_pt2pt_module_lock(int lock_type, int target, int assert, ompi_win_t *win) { ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target ); assert(lock_type != 0); /* set our mode on the window */ ompi_win_remove_mode(win, OMPI_WIN_FENCE); ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: sending lock request to %d", ompi_comm_rank(module->p2p_comm), target)); /* generate a lock request */ ompi_osc_pt2pt_control_send(module, proc, OMPI_OSC_PT2PT_HDR_LOCK_REQ, ompi_comm_rank(module->p2p_comm), lock_type); /* return */ return OMPI_SUCCESS; }
/*ompi_request_test(&ompi_req,completed,MPI_STATUS_IGNORE); */ *completed = ompi_req->req_complete; if (*completed) { ompi_request_free(&ompi_req); request->status = HCOLRTE_REQUEST_DONE; } return HCOLL_SUCCESS; } static int ec_handle_compare( rte_ec_handle_t handle_1 , rte_grp_handle_t group_handle_1 , rte_ec_handle_t handle_2 , rte_grp_handle_t group_handle_2 ) { return handle_1.handle == handle_2.handle; } static int get_ec_handles( int num_ec , int * ec_indexes , rte_grp_handle_t grp_h, rte_ec_handle_t * ec_handles ) { int i; ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; for (i=0; i<num_ec; i++) { ompi_proc_t *proc = ompi_comm_peer_lookup(comm,ec_indexes[i]); ec_handles[i].rank = ec_indexes[i]; ec_handles[i].handle = (void *)proc; } return HCOLL_SUCCESS; } #if 0 /* This callback is not used */ static int get_my_ec ( rte_grp_handle_t grp_h, rte_ec_handle_t *ec_handle) { ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; int my_rank = ompi_comm_rank(comm); ompi_proc_t *my_proc = ompi_comm_peer_lookup(comm,my_rank); ec_handle->handle = (void *)my_proc; ec_handle->rank = my_rank; return HCOLL_SUCCESS; }
int mca_pml_ob1_isend(void *buf, size_t count, ompi_datatype_t * datatype, int dst, int tag, mca_pml_base_send_mode_t sendmode, ompi_communicator_t * comm, ompi_request_t ** request) { mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; mca_pml_ob1_send_request_t *sendreq = NULL; ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; int16_t seqn; int rc; seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, endpoint, comm); if (OPAL_LIKELY(0 <= rc)) { /* NTH: it is legal to return ompi_request_empty since the only valid * field in a send completion status is whether or not the send was * cancelled (which it can't be at this point anyway). */ *request = &ompi_request_empty; return OMPI_SUCCESS; } } MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq); if (NULL == sendreq) return OMPI_ERR_OUT_OF_RESOURCE; MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, buf, count, datatype, dst, tag, comm, sendmode, false); PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, &(sendreq)->req_send.req_base, PERUSE_SEND); MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc); *request = (ompi_request_t *) sendreq; return rc; }
static int get_ec_handles( int num_ec , int * ec_indexes , rte_grp_handle_t grp_h, rte_ec_handle_t * ec_handles ) { int i; ompi_communicator_t *comm = (ompi_communicator_t *)grp_h; for (i=0; i<num_ec; i++) { ompi_proc_t *proc = ompi_comm_peer_lookup(comm,ec_indexes[i]); ec_handles[i].rank = ec_indexes[i]; ec_handles[i].handle = (void *)proc; } return HCOLL_SUCCESS; }
ucp_ep_h mca_pml_ucx_add_proc(ompi_communicator_t *comm, int dst) { ucp_address_t *address; ucs_status_t status; size_t addrlen; ucp_ep_h ep; int ret; ompi_proc_t *proc0 = ompi_comm_peer_lookup(comm, 0); ompi_proc_t *proc_peer = ompi_comm_peer_lookup(comm, dst); /* Note, mca_pml_base_pml_check_selected, doesn't use 3rd argument */ if (OMPI_SUCCESS != (ret = mca_pml_base_pml_check_selected("ucx", &proc0, dst))) { return NULL; } ret = mca_pml_ucx_recv_worker_address(proc_peer, &address, &addrlen); if (ret < 0) { PML_UCX_ERROR("Failed to receive worker address from proc: %d", proc_peer->super.proc_name.vpid); return NULL; } PML_UCX_VERBOSE(2, "connecting to proc. %d", proc_peer->super.proc_name.vpid); status = ucp_ep_create(ompi_pml_ucx.ucp_worker, address, &ep); free(address); if (UCS_OK != status) { PML_UCX_ERROR("Failed to connect to proc: %d, %s", proc_peer->super.proc_name.vpid, ucs_status_string(status)); return NULL; } proc_peer->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = ep; return ep; }
int ompi_mtl_psm_isend(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm, int dest, int tag, struct ompi_convertor_t *convertor, mca_pml_base_send_mode_t mode, bool blocking, mca_mtl_request_t * mtl_request) { psm_error_t psm_error; uint64_t mqtag; uint32_t flags = 0; int ret; mca_mtl_psm_request_t * mtl_psm_request = (mca_mtl_psm_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); mca_mtl_psm_endpoint_t* psm_endpoint = (mca_mtl_psm_endpoint_t*)ompi_proc->proc_pml; assert(mtl == &ompi_mtl_psm.super); mqtag = PSM_MAKE_MQTAG(comm->c_contextid, comm->c_my_rank, tag); ret = ompi_mtl_datatype_pack(convertor, &mtl_psm_request->buf, &length, &mtl_psm_request->free_after); mtl_psm_request->length= length; mtl_psm_request->convertor = convertor; mtl_psm_request->type = OMPI_MTL_PSM_ISEND; if (OMPI_SUCCESS != ret) return ret; if (mode == MCA_PML_BASE_SEND_SYNCHRONOUS) flags |= PSM_MQ_FLAG_SENDSYNC; psm_error = psm_mq_isend(ompi_mtl_psm.mq, psm_endpoint->peer_addr, flags, mqtag, mtl_psm_request->buf, length, mtl_psm_request, &mtl_psm_request->psm_request); return psm_error == PSM_OK ? OMPI_SUCCESS : OMPI_ERROR; }
int ompi_mtl_psm2_send(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm, int dest, int tag, struct opal_convertor_t *convertor, mca_pml_base_send_mode_t mode) { psm_error_t err; mca_mtl_psm2_request_t mtl_psm2_request; psm_mq_tag_t mqtag; uint32_t flags = 0; int ret; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); mca_mtl_psm2_endpoint_t* psm_endpoint = (mca_mtl_psm2_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; assert(mtl == &ompi_mtl_psm2.super); PSM_MAKE_MQTAG(comm->c_contextid, comm->c_my_rank, tag, mqtag); ret = ompi_mtl_datatype_pack(convertor, &mtl_psm2_request.buf, &length, &mtl_psm2_request.free_after); mtl_psm2_request.length = length; mtl_psm2_request.convertor = convertor; mtl_psm2_request.type = OMPI_mtl_psm2_ISEND; if (OMPI_SUCCESS != ret) return ret; if (mode == MCA_PML_BASE_SEND_SYNCHRONOUS) flags |= PSM_MQ_FLAG_SENDSYNC; err = psm_mq_send2(ompi_mtl_psm2.mq, psm_endpoint->peer_addr, flags, &mqtag, mtl_psm2_request.buf, length); if (mtl_psm2_request.free_after) { free(mtl_psm2_request.buf); } return err == PSM_OK ? OMPI_SUCCESS : OMPI_ERROR; }
int ompi_osc_pt2pt_passive_lock(ompi_osc_pt2pt_module_t *module, int32_t origin, int32_t lock_type) { bool send_ack = false; int ret = OMPI_SUCCESS; ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, origin ); ompi_osc_pt2pt_pending_lock_t *new_pending; OPAL_THREAD_LOCK(&(module->p2p_lock)); if (lock_type == MPI_LOCK_EXCLUSIVE) { if (module->p2p_lock_status == 0) { module->p2p_lock_status = MPI_LOCK_EXCLUSIVE; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: setting lock status to EXCLUSIVE (from %d)", ompi_comm_rank(module->p2p_comm), origin)); ompi_win_append_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); send_ack = true; } else { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: queuing lock request from %d (type=%d)", ompi_comm_rank(module->p2p_comm), origin, lock_type)); new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); new_pending->proc = proc; new_pending->lock_type = lock_type; opal_list_append(&(module->p2p_locks_pending), &(new_pending->super)); } } else if (lock_type == MPI_LOCK_SHARED) { if (module->p2p_lock_status != MPI_LOCK_EXCLUSIVE) { module->p2p_lock_status = MPI_LOCK_SHARED; module->p2p_shared_count++; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: setting lock status to SHARED (from %d), count %d", ompi_comm_rank(module->p2p_comm), origin, module->p2p_shared_count)); ompi_win_append_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); send_ack = true; } else { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: queuing lock request from %d (type=%d)", ompi_comm_rank(module->p2p_comm), origin, lock_type)); new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); new_pending->proc = proc; new_pending->lock_type = lock_type; opal_list_append(&(module->p2p_locks_pending), &(new_pending->super)); } } else { ret = OMPI_ERROR; } OPAL_THREAD_UNLOCK(&(module->p2p_lock)); if (send_ack) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: sending lock ack to %d", ompi_comm_rank(module->p2p_comm), origin)); ompi_osc_pt2pt_control_send(module, proc, OMPI_OSC_PT2PT_HDR_LOCK_REQ, ompi_comm_rank(module->p2p_comm), OMPI_SUCCESS); } return OMPI_SUCCESS; }
int ompi_osc_pt2pt_module_unlock(int target, ompi_win_t *win) { int32_t out_count; opal_list_item_t *item; int ret; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target ); OPAL_THREAD_LOCK(&module->p2p_lock); while (0 == module->p2p_lock_received_ack) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } module->p2p_lock_received_ack -= 1; /* start all the requests */ ompi_osc_pt2pt_flip_sendreqs(module); /* try to start all the requests. We've copied everything we need out of pending_sendreqs, so don't need the lock here */ out_count = opal_list_get_size(&(module->p2p_copy_pending_sendreqs)); /* we want to send all the requests, plus we wait for one more completion event for the control message ack from the unlocker saying we're done */ module->p2p_num_pending_out += (out_count + 1); OPAL_THREAD_UNLOCK(&module->p2p_lock); /* send the unlock request */ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: sending unlock request to %d with %d requests", ompi_comm_rank(module->p2p_comm), target, out_count)); ompi_osc_pt2pt_control_send(module, proc, OMPI_OSC_PT2PT_HDR_UNLOCK_REQ, ompi_comm_rank(module->p2p_comm), out_count); while (NULL != (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { ompi_osc_pt2pt_sendreq_t *req = (ompi_osc_pt2pt_sendreq_t*) item; ret = ompi_osc_pt2pt_sendreq_send(module, req); if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret ) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); opal_list_append(&(module->p2p_copy_pending_sendreqs), item); } else if (OMPI_SUCCESS != ret) { return ret; } } /* wait for all the requests */ OPAL_THREAD_LOCK(&module->p2p_lock); while (0 != module->p2p_num_pending_out) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } OPAL_THREAD_UNLOCK(&module->p2p_lock); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: finished unlock to %d", ompi_comm_rank(module->p2p_comm), target)); /* set our mode on the window */ ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); return OMPI_SUCCESS; }
int mca_pml_ob1_send(void *buf, size_t count, ompi_datatype_t * datatype, int dst, int tag, mca_pml_base_send_mode_t sendmode, ompi_communicator_t * comm) { mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; mca_pml_ob1_send_request_t *sendreq = NULL; int16_t seqn; int rc; if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_BUFFERED == sendmode)) { /* large buffered sends *need* a real request so use isend instead */ ompi_request_t *brequest; rc = mca_pml_ob1_isend (buf, count, datatype, dst, tag, sendmode, comm, &brequest); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { return rc; } /* free the request and return. don't care if it completes now */ ompi_request_free (&brequest); return OMPI_SUCCESS; } if (OPAL_UNLIKELY(NULL == endpoint)) { return OMPI_ERR_UNREACH; } seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); /** * The immediate send will not have a request, so they are * intracable from the point of view of any debugger attached to * the parallel application. */ if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, endpoint, comm); if (OPAL_LIKELY(0 <= rc)) { return OMPI_SUCCESS; } } #if !OMPI_ENABLE_THREAD_MULTIPLE sendreq = mca_pml_ob1_sendreq; if( OPAL_UNLIKELY(NULL == sendreq) ) #endif /* !OMPI_ENABLE_THREAD_MULTIPLE */ { MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq); if (NULL == sendreq) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; #if !OMPI_ENABLE_THREAD_MULTIPLE mca_pml_ob1_sendreq = sendreq; #endif /* !OMPI_ENABLE_THREAD_MULTIPLE */ } sendreq->req_send.req_base.req_proc = dst_proc; sendreq->rdma_frag = NULL; MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, buf, count, datatype, dst, tag, comm, sendmode, false); PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, &sendreq->req_send.req_base, PERUSE_SEND); MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc); if (OPAL_LIKELY(rc == OMPI_SUCCESS)) { ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; } #if OMPI_ENABLE_THREAD_MULTIPLE MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq); #else mca_pml_ob1_send_request_fini (sendreq); #endif return rc; }
int MPI_Sendrecv_replace(void * buf, int count, MPI_Datatype datatype, int dest, int sendtag, int source, int recvtag, MPI_Comm comm, MPI_Status *status) { int rc = MPI_SUCCESS; if ( MPI_PARAM_CHECK ) { rc = MPI_SUCCESS; OMPI_ERR_INIT_FINALIZE(FUNC_NAME); OMPI_CHECK_DATATYPE_FOR_RECV(rc, datatype, count); if (ompi_comm_invalid(comm)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); } else if (dest != MPI_PROC_NULL && ompi_comm_peer_invalid(comm, dest)) { rc = MPI_ERR_RANK; } else if (sendtag < 0 || sendtag > mca_pml.pml_max_tag) { rc = MPI_ERR_TAG; } else if (source != MPI_PROC_NULL && source != MPI_ANY_SOURCE && ompi_comm_peer_invalid(comm, source)) { rc = MPI_ERR_RANK; } else if (((recvtag < 0) && (recvtag != MPI_ANY_TAG)) || (recvtag > mca_pml.pml_max_tag)) { rc = MPI_ERR_TAG; } OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME); } /* simple case */ if ( source == MPI_PROC_NULL || dest == MPI_PROC_NULL || count == 0 ) { return MPI_Sendrecv(buf,count,datatype,dest,sendtag,buf,count,datatype,source,recvtag,comm,status); } else { ompi_convertor_t convertor; struct iovec iov; unsigned char recv_data[2048]; size_t packed_size, max_data; uint32_t iov_count; ompi_status_public_t recv_status; ompi_proc_t* proc = ompi_comm_peer_lookup(comm,dest); if(proc == NULL) { rc = MPI_ERR_RANK; OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); } /* initialize convertor to unpack recv buffer */ OBJ_CONSTRUCT(&convertor, ompi_convertor_t); ompi_convertor_copy_and_prepare_for_recv( proc->proc_convertor, datatype, count, buf, 0, &convertor ); /* setup a buffer for recv */ ompi_convertor_get_packed_size( &convertor, &packed_size ); if( packed_size > sizeof(recv_data) ) { rc = MPI_Alloc_mem(packed_size, MPI_INFO_NULL, &iov.iov_base); if(OMPI_SUCCESS != rc) { OMPI_ERRHANDLER_RETURN(OMPI_ERR_OUT_OF_RESOURCE, comm, MPI_ERR_BUFFER, FUNC_NAME); } } else { iov.iov_base = (caddr_t)recv_data; } /* recv into temporary buffer */ rc = MPI_Sendrecv( buf, count, datatype, dest, sendtag, iov.iov_base, packed_size, MPI_BYTE, source, recvtag, comm, &recv_status ); if (rc != MPI_SUCCESS) { if(packed_size > sizeof(recv_data)) MPI_Free_mem(iov.iov_base); OBJ_DESTRUCT(&convertor); OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); } /* unpack into users buffer */ iov.iov_len = recv_status._count; iov_count = 1; max_data = recv_status._count; ompi_convertor_unpack(&convertor, &iov, &iov_count, &max_data ); /* return status to user */ if(status != MPI_STATUS_IGNORE) { *status = recv_status; } /* release resources */ if(packed_size > sizeof(recv_data)) { MPI_Free_mem(iov.iov_base); } OBJ_DESTRUCT(&convertor); return MPI_SUCCESS; } }
int ompi_osc_pt2pt_sendreq_recv_accum(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_send_header_t *header, void *payload) { int ret = OMPI_SUCCESS; struct ompi_op_t *op = ompi_osc_pt2pt_op_create(header->hdr_target_op); ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin ); struct ompi_datatype_t *datatype = ompi_osc_pt2pt_datatype_create(proc, &payload); if (header->hdr_msg_length > 0) { /* lock the window for accumulates */ OPAL_THREAD_LOCK(&module->p2p_acc_lock); /* copy the data from the temporary buffer into the user window */ ret = ompi_osc_pt2pt_process_op(module, header, datatype, op, payload, header->hdr_msg_length); /* unlock the window for accumulates */ OPAL_THREAD_UNLOCK(&module->p2p_acc_lock); /* Release datatype & op */ OBJ_RELEASE(datatype); OBJ_RELEASE(op); OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), -1); opal_output_verbose(50, ompi_osc_base_output, "%d received accum message from %d", module->p2p_comm->c_my_rank, header->hdr_origin); } else { ompi_osc_pt2pt_longreq_t *longreq; ptrdiff_t lb, extent, true_lb, true_extent; size_t buflen; /* figure out how big a buffer we need */ ompi_ddt_get_extent(datatype, &lb, &extent); ompi_ddt_get_true_extent(datatype, &true_lb, &true_extent); buflen = true_extent + (header->hdr_target_count - 1) * extent; /* get a longreq and fill it in */ ompi_osc_pt2pt_longreq_alloc(&longreq); longreq->req_comp_cb = ompi_osc_pt2pt_sendreq_recv_accum_long_cb; longreq->req_datatype = datatype; longreq->req_op = op; longreq->req_module = module; /* allocate a buffer to receive into ... */ longreq->req_comp_cbdata = malloc(buflen + sizeof(ompi_osc_pt2pt_send_header_t)); if (NULL == longreq->req_comp_cbdata) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* fill in tmp header */ memcpy(longreq->req_comp_cbdata, header, sizeof(ompi_osc_pt2pt_send_header_t)); ((ompi_osc_pt2pt_send_header_t*) longreq->req_comp_cbdata)->hdr_msg_length = buflen; ret = mca_pml.pml_irecv(((char*) longreq->req_comp_cbdata) + sizeof(ompi_osc_pt2pt_send_header_t), header->hdr_target_count, datatype, header->hdr_origin, header->hdr_origin_tag, module->p2p_comm, &(longreq->req_pml_req)); opal_output_verbose(50, ompi_osc_base_output, "%d started long recv accum message from %d (%d)", module->p2p_comm->c_my_rank, header->hdr_origin, header->hdr_origin_tag); /* put the send request in the waiting list */ OPAL_THREAD_LOCK(&(module->p2p_lock)); opal_list_append(&(module->p2p_long_msgs), &(longreq->super.super)); OPAL_THREAD_UNLOCK(&(module->p2p_lock)); } return ret; }
int ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int src, int tag, struct opal_convertor_t *convertor, mca_mtl_request_t *mtl_request) { ptl_match_bits_t match_bits, ignore_bits; int ret = OMPI_SUCCESS; ptl_process_t remote_proc; ompi_mtl_portals4_recv_request_t *ptl_request = (ompi_mtl_portals4_recv_request_t*) mtl_request; void *start; size_t length; bool free_after; ptl_me_t me; if (MPI_ANY_SOURCE == src) { if (ompi_mtl_portals4.use_logical) { remote_proc.rank = PTL_RANK_ANY; } else { remote_proc.phys.nid = PTL_NID_ANY; remote_proc.phys.pid = PTL_PID_ANY; } } else if ((ompi_mtl_portals4.use_logical) && (MPI_COMM_WORLD == comm)) { remote_proc.rank = src; } else { ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, src ); remote_proc = *((ptl_process_t*) ompi_mtl_portals4_get_endpoint (mtl, ompi_proc)); } MTL_PORTALS4_SET_RECV_BITS(match_bits, ignore_bits, comm->c_contextid, src, tag); ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } ptl_request->super.type = portals4_req_recv; ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress; #if OPAL_ENABLE_DEBUG ptl_request->opcount = OPAL_THREAD_ADD64((int64_t*) &ompi_mtl_portals4.recv_opcount, 1); ptl_request->hdr_data = 0; #endif ptl_request->buffer_ptr = (free_after) ? start : NULL; ptl_request->convertor = convertor; ptl_request->delivery_ptr = start; ptl_request->delivery_len = length; ptl_request->req_started = false; ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; ptl_request->pending_reply = 0; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n", ptl_request->opcount, remote_proc.phys.nid, remote_proc.phys.pid, (int64_t)length, match_bits, ignore_bits, (unsigned long) ptl_request)); me.start = start; me.length = length; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = ompi_mtl_portals4.uid; me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE | PTL_ME_EVENT_UNLINK_DISABLE; if (length <= ompi_mtl_portals4.short_limit) { me.options |= PTL_ME_EVENT_LINK_DISABLE; } me.match_id = remote_proc; me.match_bits = match_bits; me.ignore_bits = ignore_bits; ret = PtlMEAppend(ompi_mtl_portals4.ni_h, ompi_mtl_portals4.recv_idx, &me, PTL_PRIORITY_LIST, ptl_request, &ptl_request->me_h); if (OPAL_UNLIKELY(PTL_OK != ret)) { if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: PtlMEAppend failed: %d", __FILE__, __LINE__, ret); return ompi_mtl_portals4_get_error(ret); } /* if a long message, spin until we either have a comm event or a link event, guaranteeing progress for long unexpected messages. */ if (length > ompi_mtl_portals4.short_limit) { while (true != ptl_request->req_started) { ompi_mtl_portals4_progress(); } } return OMPI_SUCCESS; }
int ompi_osc_pt2pt_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, struct ompi_datatype_t *dt, int target, OPAL_PTRDIFF_TYPE target_disp, struct ompi_win_t *win) { ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); ompi_osc_pt2pt_frag_t *frag; ompi_osc_pt2pt_header_cswap_t *header; ompi_osc_pt2pt_sync_t *pt2pt_sync; size_t ddt_len, payload_len, frag_len; ompi_osc_pt2pt_request_t *request; const void *packed_ddt; int ret, tag; char *ptr; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s", (unsigned long) origin_addr, (unsigned long) compare_addr, (unsigned long) result_addr, dt->name, target, (int) target_disp, win->w_name)); pt2pt_sync = ompi_osc_pt2pt_module_sync_lookup (module, target, NULL); if (OPAL_UNLIKELY(NULL == pt2pt_sync)) { return OMPI_ERR_RMA_SYNC; } /* optimize self case. TODO: optimize local case */ if (ompi_comm_rank (module->comm) == target) { return ompi_osc_pt2pt_cas_self (pt2pt_sync, origin_addr, compare_addr, result_addr, dt, target_disp, module); } /* compare-and-swaps are always request based, so that we know where to land the data */ OMPI_OSC_PT2PT_REQUEST_ALLOC(win, request); request->type = OMPI_OSC_PT2PT_HDR_TYPE_CSWAP; request->origin_addr = origin_addr; request->internal = true; OMPI_DATATYPE_RETAIN(dt); request->origin_dt = dt; /* Compute datatype and payload lengths. Note that the datatype description * must fit in a single frag. It should be small in this case. */ ddt_len = ompi_datatype_pack_description_length(dt); /* we need to send both the origin and compare buffers */ payload_len = dt->super.size * 2; ret = ompi_datatype_get_pack_description(dt, &packed_ddt); if (OMPI_SUCCESS != ret) { return ret; } frag_len = sizeof(ompi_osc_pt2pt_header_cswap_t) + ddt_len + payload_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, false, false); if (OMPI_SUCCESS != ret) { return OMPI_ERR_OUT_OF_RESOURCE; } tag = get_tag (module); ompi_osc_signal_outgoing (module, target, 1); header = (ompi_osc_pt2pt_header_cswap_t *) ptr; header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_CSWAP; header->base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; header->len = frag_len; header->displacement = target_disp; header->tag = tag; osc_pt2pt_hton(header, proc); ptr += sizeof(ompi_osc_pt2pt_header_cswap_t); memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; /* pack the origin and compare data */ osc_pt2pt_copy_for_send (ptr, dt->super.size, origin_addr, proc, 1, dt); ptr += dt->super.size; osc_pt2pt_copy_for_send (ptr, dt->super.size, compare_addr, proc, 1, dt); request->outstanding_requests = 1; ret = ompi_osc_pt2pt_irecv_w_cb (result_addr, 1, dt, target, tag_to_origin(tag), module->comm, NULL, ompi_osc_pt2pt_req_comm_complete, request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } return ompi_osc_pt2pt_frag_finish (module, frag); }
int ompi_mtl_mx_send(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm, int dest, int tag, struct opal_convertor_t *convertor, mca_pml_base_send_mode_t mode) { mx_return_t mx_return; uint64_t match_bits; mca_mtl_mx_request_t mtl_mx_request; size_t length; mx_status_t mx_status; uint32_t result; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); mca_mtl_mx_endpoint_t* mx_endpoint = (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; char* where; assert(mtl == &ompi_mtl_mx.super); MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag); ompi_mtl_datatype_pack(convertor, &mtl_mx_request.mx_segment[0].segment_ptr, &length, &mtl_mx_request.free_after); mtl_mx_request.mx_segment[0].segment_length = length; mtl_mx_request.convertor = convertor; mtl_mx_request.type = OMPI_MTL_MX_ISEND; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "issend bits: 0x%016" PRIu64 "\n", match_bits)); if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) { mx_return = mx_issend( ompi_mtl_mx.mx_endpoint, mtl_mx_request.mx_segment, 1, mx_endpoint->mx_peer_addr, match_bits, &mtl_mx_request, &mtl_mx_request.mx_request ); where = "mx_issend"; } else { mx_return = mx_isend( ompi_mtl_mx.mx_endpoint, mtl_mx_request.mx_segment, 1, mx_endpoint->mx_peer_addr, match_bits, &mtl_mx_request, &mtl_mx_request.mx_request ); where = "mx_isend"; } if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) { char peer_name[MX_MAX_HOSTNAME_LEN]; if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) { sprintf( peer_name, "unknown %lx nic_id", (long)mx_endpoint->mx_peer->nic_id ); } opal_output(ompi_mtl_base_framework.framework_output, "Error in %s (error %s) sending to %s\n", where, mx_strerror(mx_return), peer_name); /* Free buffer if needed */ if(mtl_mx_request.free_after) { free(mtl_mx_request.mx_segment[0].segment_ptr); } return OMPI_ERROR; } do { mx_return = mx_test(ompi_mtl_mx.mx_endpoint, &mtl_mx_request.mx_request, &mx_status, &result); if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) { opal_output(ompi_mtl_base_framework.framework_output, "Error in mx_wait (error %s)\n", mx_strerror(mx_return)); abort(); } if( OPAL_UNLIKELY(result && mx_status.code != MX_STATUS_SUCCESS) ) { opal_output(ompi_mtl_base_framework.framework_output, "Error in ompi_mtl_mx_send, mx_wait returned something other than MX_STATUS_SUCCESS: mx_status.code = %d.\n", mx_status.code); abort(); } } while(!result); /* Free buffer if needed */ if(mtl_mx_request.free_after) { free(mtl_mx_request.mx_segment[0].segment_ptr); } return OMPI_SUCCESS; }
int ompi_mtl_portals_irecv(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int src, int tag, struct ompi_convertor_t *convertor, mca_mtl_request_t *mtl_request) { ptl_match_bits_t match_bits, ignore_bits; ptl_md_t md; ptl_handle_md_t md_h; ptl_handle_me_t me_h; int ret; ptl_process_id_t remote_proc; mca_mtl_base_endpoint_t *endpoint = NULL; ompi_mtl_portals_request_t *ptl_request = (ompi_mtl_portals_request_t*) mtl_request; ompi_mtl_portals_event_t *recv_event = NULL; size_t buflen; ptl_request->convertor = convertor; if (MPI_ANY_SOURCE == src) { remote_proc.nid = PTL_NID_ANY; remote_proc.pid = PTL_PID_ANY; } else { ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, src ); endpoint = (mca_mtl_base_endpoint_t*) ompi_proc->proc_pml; remote_proc = endpoint->ptl_proc; } PTL_SET_RECV_BITS(match_bits, ignore_bits, comm->c_contextid, src, tag); OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "recv bits: 0x%016llx 0x%016llx\n", match_bits, ignore_bits)); /* first, check the queue of processed unexpected messages */ recv_event = ompi_mtl_portals_search_unex_q(match_bits, ignore_bits); if (NULL != recv_event) { /* found it */ ompi_mtl_portals_get_data(recv_event, convertor, ptl_request); OMPI_FREE_LIST_RETURN(&ompi_mtl_portals.event_fl, (ompi_free_list_item_t*)recv_event); goto cleanup; } else { restart_search: /* check unexpected events */ recv_event = ompi_mtl_portals_search_unex_events(match_bits, ignore_bits); if (NULL != recv_event) { /* found it */ ompi_mtl_portals_get_data(recv_event, convertor, ptl_request); OMPI_FREE_LIST_RETURN(&ompi_mtl_portals.event_fl, (ompi_free_list_item_t*)recv_event); goto cleanup; } } /* didn't find it, now post the receive */ ret = ompi_mtl_datatype_recv_buf(convertor, &md.start, &buflen, &ptl_request->free_after); md.length = buflen; /* create ME entry */ ret = PtlMEInsert(ompi_mtl_portals.ptl_match_ins_me_h, remote_proc, match_bits, ignore_bits, PTL_UNLINK, PTL_INS_BEFORE, &me_h); if( ret !=PTL_OK) { return ompi_common_portals_error_ptl_to_ompi(ret); } /* associate a memory descriptor with the Match list Entry */ md.threshold = 0; md.options = PTL_MD_OP_PUT | PTL_MD_TRUNCATE | PTL_MD_EVENT_START_DISABLE; md.user_ptr = ptl_request; md.eq_handle = ompi_mtl_portals.ptl_eq_h; ret=PtlMDAttach(me_h, md, PTL_UNLINK, &md_h); if( ret !=PTL_OK) { return ompi_common_portals_error_ptl_to_ompi(ret); } /* now try to make active */ md.threshold = 1; /* enable the memory descritor, if the ptl_unexpected_recv_eq_h * queue is empty */ ret = PtlMDUpdate(md_h, NULL, &md, ompi_mtl_portals.ptl_unexpected_recv_eq_h); if (ret == PTL_MD_NO_UPDATE) { /* a message has arrived since we searched - look again */ PtlMDUnlink(md_h); if (ptl_request->free_after) { free(md.start); } goto restart_search; } else if( PTL_OK != ret ) { return ompi_common_portals_error_ptl_to_ompi(ret); } ptl_request->event_callback = ompi_mtl_portals_recv_progress; cleanup: return OMPI_SUCCESS; }
int ompi_mtl_mx_isend(struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm, int dest, int tag, struct opal_convertor_t *convertor, mca_pml_base_send_mode_t mode, bool blocking, mca_mtl_request_t * mtl_request) { mx_return_t mx_return; uint64_t match_bits; mca_mtl_mx_request_t * mtl_mx_request = (mca_mtl_mx_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); mca_mtl_mx_endpoint_t* mx_endpoint = (mca_mtl_mx_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; char* where; assert(mtl == &ompi_mtl_mx.super); MX_SET_SEND_BITS(match_bits, comm->c_contextid, comm->c_my_rank, tag); ompi_mtl_datatype_pack(convertor, &mtl_mx_request->mx_segment[0].segment_ptr, &length, &mtl_mx_request->free_after); mtl_mx_request->mx_segment[0].segment_length = length; mtl_mx_request->convertor = convertor; mtl_mx_request->type = OMPI_MTL_MX_ISEND; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "issend bits: 0x%016" PRIu64 "\n", match_bits)); if(mode == MCA_PML_BASE_SEND_SYNCHRONOUS) { mx_return = mx_issend( ompi_mtl_mx.mx_endpoint, mtl_mx_request->mx_segment, 1, mx_endpoint->mx_peer_addr, match_bits, mtl_mx_request, &mtl_mx_request->mx_request ); where = "mx_issend"; } else { mx_return = mx_isend( ompi_mtl_mx.mx_endpoint, mtl_mx_request->mx_segment, 1, mx_endpoint->mx_peer_addr, match_bits, mtl_mx_request, &mtl_mx_request->mx_request ); where = "mx_isend"; } if( OPAL_UNLIKELY(mx_return != MX_SUCCESS) ) { char peer_name[MX_MAX_HOSTNAME_LEN]; if(MX_SUCCESS != mx_nic_id_to_hostname( mx_endpoint->mx_peer->nic_id, peer_name)) { sprintf( peer_name, "unknown %lx nic_id", (long)mx_endpoint->mx_peer->nic_id ); } opal_output(ompi_mtl_base_framework.framework_output, "Error in %s (error %s) sending to %s\n", where, mx_strerror(mx_return), peer_name); return OMPI_ERROR; } return OMPI_SUCCESS; }
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data ) { /* local variables */ mca_sbgp_basesmsocket_module_t *module; /* opal_buffer_t* sbuffer = OBJ_NEW(opal_buffer_t); opal_buffer_t* rbuffer = OBJ_NEW(opal_buffer_t); */ opal_paffinity_base_cpu_set_t my_cpu_set; bool bound; int ret; int num_processors; int socket_tmp; int my_socket_index; int core_index=-1; int proc, cnt, local, n_local_peers, my_index, my_rank; ompi_proc_t* my_proc; int *local_ranks_in_comm=NULL; int *socket_info=NULL, my_socket_info; int i_cnt, lp_cnt, my_local_index, comm_size=ompi_comm_size(comm); /* initialize data */ output_data=NULL; my_rank=ompi_comm_rank(comm); my_proc=ompi_comm_peer_lookup(comm,my_rank); for( proc=0 ; proc < n_procs_in ; proc++) { if( procs[proc]==my_proc) { my_index=proc; } } /*create a new module*/ module=OBJ_NEW(mca_sbgp_basesmsocket_module_t); if (!module ) { return NULL; } module->super.group_size=0; module->super.group_comm = comm; module->super.group_list = NULL; module->super.group_net = OMPI_SBGP_SOCKET; /* ** get my process affinity information ** */ /* get the number of processors on this node */ ret=opal_paffinity_base_get_processor_info(&num_processors); /* get process affinity mask */ OPAL_PAFFINITY_CPU_ZERO(my_cpu_set); ret=opal_paffinity_base_get(&my_cpu_set); OPAL_PAFFINITY_PROCESS_IS_BOUND(my_cpu_set,&bound); /*debug process affinity*/ /* { ret=opal_paffinity_base_get_socket_info(&num_socket); fprintf(stderr,"Number of sockets %d\n",num_socket); fprintf(stderr,"Test if rank %d is bound %d\n", my_rank, bound); fprintf(stderr,"return from opal_paffinity_base_get: %d\n\n",ret); fprintf(stderr,"bitmask elements: "); unsigned int long jj; for(jj=0; jj < OPAL_PAFFINITY_BITMASK_NUM_ELEMENTS; jj++) fprintf(stderr," %d ",my_cpu_set.bitmask[jj]); fprintf(stderr,"\n"); fflush(stderr); } end debug process affinity*/ if( !bound ) { /* pa affinity not set, so socket index will be set to -1 */ my_socket_index=-1; /*debug print*/ /* */ fprintf(stderr,"[%d]FAILED to set basesmsocket group !!!\n",my_rank); fflush(stderr); /*end debug*/ goto NoLocalPeers; } else { my_socket_index=-1; /* loop over number of processors */ for ( proc=0 ; proc < num_processors ; proc++ ) { if (OPAL_PAFFINITY_CPU_ISSET(proc,my_cpu_set)) { ret=opal_paffinity_base_get_map_to_socket_core(proc,&socket_tmp,&core_index); if( my_socket_index != socket_tmp ) { my_socket_index=socket_tmp; break; } } } /* end of proc loop */ } /* Debug prints */ /* { fprintf(stderr,"Number of processors per node: %d\n",num_processors); fprintf(stderr,"I am rank %d and my socket index is %d\n and my core index is %d\n",my_rank,my_socket_index,core_index); fprintf(stderr,"n_proc_in = %d\n",n_procs_in); fprintf(stderr,"\n"); fflush(stderr); } end debug prints */ /*get my socket index*/ cnt=0; for( proc=0 ; proc < n_procs_in ; proc++) { local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if( local ) { cnt++; } } /*debug print */ /* fprintf(stderr,"Number of local processors %d\n",cnt); end debug print*/ /* if no other local procs found skip to end */ if( 1 >= cnt ) { goto NoLocalPeers; } #if 0 int *local_ranks_in_comm; int32_t *socket_info, *my_socket_info; int my_local_index; #endif /* allocate structure to hold the list of local ranks */ local_ranks_in_comm=(int *)malloc(sizeof(int)*cnt); if(NULL == local_ranks_in_comm ) { goto Error; } /* figure out which ranks from the input communicator - comm - will * particiapte in 
the local socket determination. */ n_local_peers=0; i_cnt=0; for( proc = 0; proc < n_procs_in; proc++) { local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if ( local ) { /* set the rank within the on-host ranks - this will be used for tha * allgather */ if( my_proc == procs[proc] ) { my_local_index=n_local_peers; } /* find the rank of the current proc in comm. We take advantage * of the fact that ranks in a group have the same relative * ordering as they do within the communicator. */ #if 1 /*for( lp_cnt=i_cnt; lp_cnt < comm_size ; lp_cnt++ ) {*/ for( lp_cnt=proc; lp_cnt < comm_size ; lp_cnt++ ) { if(procs[proc] == ompi_comm_peer_lookup(comm,lp_cnt) ) { local_ranks_in_comm[i_cnt]=lp_cnt; /* lp_cnt has alrady been checked */ i_cnt++; /* found the corresponding rank in comm, so don't need * to search any more */ break; } /*i_cnt++;*/ /*fprintf(stderr,"QQQ i_cnt %d \n",i_cnt);*/ } #endif n_local_peers++; } } /*fprintf(stderr,"YYY n_local_peers %d\n",n_local_peers);*/ socket_info=(int *)malloc(sizeof(int)*n_local_peers); /*fprintf(stderr,"XXX got socket info\n");*/ if(NULL == socket_info ) { goto Error; } my_socket_info=my_socket_index; /* Allgather data over the communicator */ ret=comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm); if (OMPI_SUCCESS != ret ) { fprintf(stderr," comm_allgather_pml returned error %d \n", ret); fflush(stderr); return NULL; } /*allocate memory to the group_list probably an overestimation of the necessary resources */ module->super.group_list=(int *)malloc(sizeof(int)*cnt); if(NULL == module->super.group_list) { goto Error; } /* figure out who is sharing the same socket */ cnt=0; for (proc = 0; proc < n_local_peers; proc++) { int rem_rank=local_ranks_in_comm[proc]; int rem_socket_index=socket_info[proc]; /*Populate the list*/ if (rem_socket_index == my_socket_index) { module->super.group_list[cnt]=rem_rank; cnt++; } } module->super.group_size=cnt; /*debug print*/ /* { int ii; fprintf(stderr,"Ranks per socket: %d\n",cnt); fprintf(stderr,"Socket %d owns ranks: ", my_socket_index); for (ii=0; ii < cnt; ii++) fprintf(stderr,"%d ",module->super.group_list[ii]); fprintf(stderr,"\n"); fflush(stderr); } { cpu_set_t set; unsigned int len = sizeof(set); int i; unsigned long mask = 0; CPU_ZERO(&set); if (sched_getaffinity(0, len, &set) < 0) { perror("sched_getaffinity"); return -1; } for (i = 0; i < CPU_SETSIZE; i++) { int cpu = CPU_ISSET(i, &set); if (cpu) { mask |= 1<< i; } } opal_output(0,"%d: my affinity mask is: %08lx\n", my_local_index,mask); } end debug*/ /*Free resources*/ free(local_ranks_in_comm); free(socket_info); /*Return the module*/ return (mca_sbgp_base_module_t *) module; NoLocalPeers: /* nothing to store, so just free the module and return */ /*fprintf(stderr,"No local socket peers\n");*/ /*free(module);*/ if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; Error: /*clean up*/ if( NULL != module->super.group_list) { free(module->super.group_list); module->super.group_list=NULL; } if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; }
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data ) { /* local variables */ mca_sbgp_basesmsocket_module_t *module; int ret; int my_socket_index; int proc, cnt, local, n_local_peers, my_rank; ompi_proc_t* my_proc; int *local_ranks_in_comm=NULL; int *socket_info=NULL, my_socket_info; int i_cnt, lp_cnt, my_local_index = -1, comm_size=ompi_comm_size(comm); /* initialize data */ output_data=NULL; my_rank=ompi_comm_rank(comm); my_proc=ompi_comm_peer_lookup(comm,my_rank); /*create a new module*/ module=OBJ_NEW(mca_sbgp_basesmsocket_module_t); if (!module ) { return NULL; } module->super.group_size=0; module->super.group_comm = comm; module->super.group_list = NULL; module->super.group_net = OMPI_SBGP_SOCKET; /* test to see if process is bound */ if( OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) ) { /* pa affinity not set, so socket index will be set to -1 */ my_socket_index=-1; /*debug print*/ /* */ BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank)); /*end debug*/ goto NoLocalPeers; } else { my_socket_index=-1; /* this should find my logical socket id which is the socket id we want * physical socket ids are not necessarily unique, logical ones, as defined * by the hwloc API are unique. */ if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)) { BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n",my_rank)); goto NoLocalPeers; } } /* Debug prints */ /* { fprintf(stderr,"Number of processors per node: %d\n",num_processors); fprintf(stderr,"I am rank %d and my socket index is %d\n and my core index is %d\n",my_rank,my_socket_index,core_index); fprintf(stderr,"n_proc_in = %d\n",n_procs_in); fprintf(stderr,"\n"); fflush(stderr); } end debug prints */ /*get my socket index*/ cnt=0; for( proc=0 ; proc < n_procs_in ; proc++) { local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if( local ) { cnt++; } } /*debug print */ /* fprintf(stderr,"Number of local processors %d\n",cnt); end debug print*/ /* if no other local procs found skip to end */ if( 1 >= cnt ) { goto NoLocalPeers; } /* allocate structure to hold the list of local ranks */ local_ranks_in_comm=(int *)malloc(sizeof(int)*cnt); if(NULL == local_ranks_in_comm ) { goto Error; } /* figure out which ranks from the input communicator - comm - will * particiapte in the local socket determination. */ n_local_peers=0; i_cnt=0; for( proc = 0; proc < n_procs_in; proc++) { local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if ( local ) { /* set the rank within the on-host ranks - this will be used for tha * allgather */ if( my_proc == procs[proc] ) { my_local_index=n_local_peers; } /* find the rank of the current proc in comm. We take advantage * of the fact that ranks in a group have the same relative * ordering as they do within the communicator. 
*/ for( lp_cnt=proc; lp_cnt < comm_size ; lp_cnt++ ) { if(procs[proc] == ompi_comm_peer_lookup(comm,lp_cnt) ) { local_ranks_in_comm[i_cnt]=lp_cnt; /* lp_cnt has alrady been checked */ i_cnt++; /* found the corresponding rank in comm, so don't need * to search any more */ break; } /*i_cnt++;*/ /*fprintf(stderr,"QQQ i_cnt %d \n",i_cnt);*/ } n_local_peers++; } } /*fprintf(stderr,"YYY n_local_peers %d\n",n_local_peers);*/ socket_info=(int *)malloc(sizeof(int)*n_local_peers); /*fprintf(stderr,"XXX got socket info\n");*/ if(NULL == socket_info ) { goto Error; } my_socket_info=my_socket_index; /* Allgather data over the communicator */ ret=comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm); if (OMPI_SUCCESS != ret ) { BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n",ret)); return NULL; } /*allocate memory to the group_list probably an overestimation of the necessary resources */ module->super.group_list=(int *)malloc(sizeof(int)*cnt); if(NULL == module->super.group_list) { goto Error; } /* figure out who is sharing the same socket */ cnt=0; for (proc = 0; proc < n_local_peers; proc++) { int rem_rank=local_ranks_in_comm[proc]; int rem_socket_index=socket_info[proc]; /*Populate the list*/ if (rem_socket_index == my_socket_index) { module->super.group_list[cnt]=rem_rank; cnt++; } } module->super.group_size=cnt; #if 0 /*debug print*/ { int ii; fprintf(stderr,"Ranks per socket: %d\n",cnt); fprintf(stderr,"Socket %d owns ranks: ", my_socket_index); for (ii=0; ii < cnt; ii++) fprintf(stderr,"%d ",module->super.group_list[ii]); fprintf(stderr,"\n"); fflush(stderr); } #endif /* end debug*/ /*Free resources*/ free(local_ranks_in_comm); free(socket_info); /*Return the module*/ return (mca_sbgp_base_module_t *) module; NoLocalPeers: /* nothing to store, so just free the module and return */ /*fprintf(stderr,"No local socket peers\n");*/ /*free(module);*/ if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; Error: /*clean up*/ if( NULL != module->super.group_list) { free(module->super.group_list); module->super.group_list=NULL; } if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; }
static int ompi_coll_portals4_scatter_intra_linear_bottom(struct ompi_communicator_t *comm, ompi_coll_portals4_request_t *request) { int ret, line; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_bottom enter rank %d", request->u.scatter.my_rank)); ret = cleanup_scatter_handles(request); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } ret = cleanup_sync_handles(request); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } if (NULL != request->u.scatter.unpack_dst_buf) { uint32_t iov_count = 1; struct iovec iov; size_t max_data; ompi_coll_portals4_create_recv_converter (&request->u.scatter.recv_converter, request->u.scatter.unpack_dst_buf, ompi_comm_peer_lookup(comm, request->u.scatter.my_rank), request->u.scatter.unpack_dst_count, request->u.scatter.unpack_dst_dtype); iov.iov_len = request->u.scatter.packed_size; if (request->u.scatter.my_rank == request->u.scatter.root_rank) { /* unpack my data from the location in scatter_buf where is was packed */ uint64_t offset = request->u.scatter.pack_src_extent * request->u.scatter.pack_src_count * request->u.scatter.my_rank; iov.iov_base = (IOVBASE_TYPE *)((char *)request->u.scatter.scatter_buf + offset); } else { iov.iov_base = (IOVBASE_TYPE *)request->u.scatter.scatter_buf; } opal_convertor_unpack(&request->u.scatter.recv_converter, &iov, &iov_count, &max_data); OBJ_DESTRUCT(&request->u.scatter.recv_converter); } if (request->u.scatter.free_after) free(request->u.scatter.scatter_buf); request->super.req_status.MPI_ERROR = OMPI_SUCCESS; OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_bottom exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; err_hdlr: request->super.req_status.MPI_ERROR = ret; if (request->u.scatter.free_after) free(request->u.scatter.scatter_buf); opal_output(ompi_coll_base_framework.framework_output, "%s:%4d:%4d\tError occurred ret=%d, rank %2d", __FILE__, __LINE__, line, ret, request->u.scatter.my_rank); return ret; }
int ompi_osc_pt2pt_sendreq_recv_put(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_send_header_t *header, void *inbuf) { int ret = OMPI_SUCCESS; void *target = (unsigned char*) module->p2p_win->w_baseptr + (header->hdr_target_disp * module->p2p_win->w_disp_unit); ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin ); struct ompi_datatype_t *datatype = ompi_osc_pt2pt_datatype_create(proc, &inbuf); if (header->hdr_msg_length > 0) { ompi_convertor_t convertor; struct iovec iov; uint32_t iov_count = 1; size_t max_data; ompi_proc_t *proc; /* create convertor */ OBJ_CONSTRUCT(&convertor, ompi_convertor_t); /* initialize convertor */ proc = ompi_comm_peer_lookup(module->p2p_comm, header->hdr_origin); ompi_convertor_copy_and_prepare_for_recv(proc->proc_convertor, datatype, header->hdr_target_count, target, 0, &convertor); iov.iov_len = header->hdr_msg_length; iov.iov_base = (IOVBASE_TYPE*)inbuf; max_data = iov.iov_len; ompi_convertor_unpack(&convertor, &iov, &iov_count, &max_data ); OBJ_DESTRUCT(&convertor); OBJ_RELEASE(datatype); OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), -1); } else { ompi_osc_pt2pt_longreq_t *longreq; ompi_osc_pt2pt_longreq_alloc(&longreq); longreq->req_comp_cb = ompi_osc_pt2pt_sendreq_recv_put_long_cb; longreq->req_comp_cbdata = NULL; longreq->req_datatype = datatype; longreq->req_module = module; ret = mca_pml.pml_irecv(target, header->hdr_target_count, datatype, header->hdr_origin, header->hdr_origin_tag, module->p2p_comm, &(longreq->req_pml_req)); /* put the send request in the waiting list */ OPAL_THREAD_LOCK(&(module->p2p_lock)); opal_list_append(&(module->p2p_long_msgs), &(longreq->super.super)); OPAL_THREAD_UNLOCK(&(module->p2p_lock)); } return ret; }
static int ompi_osc_pt2pt_accumulate_w_req (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, ompi_win_t *win, ompi_osc_pt2pt_request_t *request) { int ret; ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); bool is_long_datatype = false; bool is_long_msg = false; ompi_osc_pt2pt_frag_t *frag; ompi_osc_pt2pt_header_acc_t *header; ompi_osc_pt2pt_sync_t *pt2pt_sync; size_t ddt_len, payload_len, frag_len; char *ptr; const void *packed_ddt; int tag = -1; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "acc: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, op->o_name, win->w_name)); pt2pt_sync = ompi_osc_pt2pt_module_sync_lookup (module, target, NULL); if (OPAL_UNLIKELY(NULL == pt2pt_sync)) { return OMPI_ERR_RMA_SYNC; } /* short-circuit case */ if (0 == origin_count || 0 == target_count) { if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; } /* optimize the self case. TODO: optimize the local case */ if (ompi_comm_rank (module->comm) == target) { return ompi_osc_pt2pt_acc_self (pt2pt_sync, origin_addr, origin_count, origin_dt, target_disp, target_count, target_dt, op, module, request); } /* Compute datatype and payload lengths. Note that the datatype description * must fit in a single frag */ ddt_len = ompi_datatype_pack_description_length(target_dt); payload_len = origin_dt->super.size * origin_count; frag_len = sizeof(*header) + ddt_len + payload_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, false, true); if (OMPI_SUCCESS != ret) { frag_len = sizeof(*header) + ddt_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, true, !request); if (OMPI_SUCCESS != ret) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(*header) + 8; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, true, !request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } is_long_msg = true; tag = get_tag (module); } else { /* still need to set the tag for the active/passive logic on the target */ tag = !!(module->passive_target_access_epoch); } if (is_long_msg) { /* wait for synchronization before posting a long message */ if (pt2pt_sync->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK) { OPAL_THREAD_LOCK(&pt2pt_sync->lock); ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, target); while (!(peer->flags & OMPI_OSC_PT2PT_PEER_FLAG_EAGER)) { opal_condition_wait(&pt2pt_sync->cond, &pt2pt_sync->lock); } OPAL_THREAD_UNLOCK(&pt2pt_sync->lock); } else { ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync); } } header = (ompi_osc_pt2pt_header_acc_t*) ptr; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; header->op = op->o_f_to_c_index; header->tag = tag; ptr += sizeof (*header); do { ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. 
send it seperately */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE; OMPI_DATATYPE_RETAIN(target_dt); ret = ompi_osc_pt2pt_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target, tag_to_target(tag), module->comm, ompi_osc_pt2pt_dt_send_complete, target_dt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC; osc_pt2pt_hton(header, proc); osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); /* the user's buffer is no longer needed so mark the request as * complete. */ if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG; osc_pt2pt_hton(header, proc); OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: starting long accumulate with tag %d", tag)); ret = ompi_osc_pt2pt_data_isend (module, origin_addr, origin_count, origin_dt, target, tag_to_target(tag), request); } } while (0); if (OMPI_SUCCESS != ret) { OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: failed with eror %d", ret)); } else { /* mark the fragment as valid */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_VALID; } return ompi_osc_pt2pt_frag_finish(module, frag); }
int bcol_basesmuma_smcm_allgather_connection( mca_bcol_basesmuma_module_t *sm_bcol_module, mca_sbgp_base_module_t *module, opal_list_t *peer_list, bcol_basesmuma_smcm_proc_item_t ***back_files, ompi_communicator_t *comm, bcol_basesmuma_smcm_file_t input, char *base_fname, bool map_all) { /* define local variables */ int rc, i, fd; ptrdiff_t mem_offset; ompi_proc_t *proc_temp, *my_id; bcol_basesmuma_smcm_proc_item_t *temp; bcol_basesmuma_smcm_proc_item_t *item_ptr; bcol_basesmuma_smcm_proc_item_t **backing_files; struct file_info_t local_file; struct file_info_t *all_files=NULL; /* sanity check */ if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) { opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long: %s len :: %d", input.file_name, (int) strlen(input.file_name)); return OMPI_ERR_BAD_PARAM; } backing_files = (bcol_basesmuma_smcm_proc_item_t **) calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *)); if (!backing_files) { return OMPI_ERR_OUT_OF_RESOURCE; } /* FIXME *back_files might have been already allocated * so free it in order to avoid a memory leak */ if (NULL != *back_files) { free (*back_files); } *back_files = backing_files; my_id = ompi_proc_local(); /* Phase One: gather a list of processes that will participate in the allgather - I'm preparing this list from the sbgp-ing module that was passed into the function */ /* fill in local file information */ local_file.vpid = ((orte_process_name_t*)&my_id->super.proc_name)->vpid; local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid; local_file.file_size=input.size; local_file.size_ctl_structure=input.size_ctl_structure; local_file.data_seg_alignment=input.data_seg_alignment; strcpy (local_file.file_name, input.file_name); /* will exchange this data type as a string of characters - * this routine is first called before MPI_init() completes * and before error handling is setup, so can't use the * MPI data types to send this data */ all_files = (struct file_info_t *) calloc(module->group_size, sizeof (struct file_info_t)); if (!all_files) { return OMPI_ERR_OUT_OF_RESOURCE; } /* exchange data */ rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR, sm_bcol_module->super.sbgp_partner_module->my_index, sm_bcol_module->super.sbgp_partner_module->group_size, sm_bcol_module->super.sbgp_partner_module->group_list, sm_bcol_module->super.sbgp_partner_module->group_comm); if( OMPI_SUCCESS != rc ) { opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml. Error code: %d", rc); goto Error; } /* Phase four: loop through the receive buffer, unpack the data recieved from remote peers */ for (i = 0; i < module->group_size; i++) { struct file_info_t *rem_file = all_files + i; /* check if this is my index or if the file is already mapped (set above). ther * is no reason to look through the peer list again because no two members of * the group will have the same vpid/jobid pair. 
ignore this previously found * mapping if map_all was requested (NTH: not sure why exactly since we re-map * and already mapped file) */ if (sm_bcol_module->super.sbgp_partner_module->my_index == i) { continue; } proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]); OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) { /* if the vpid/jobid/filename combination already exists in the list, then do not map this peer's file --- because you already have */ if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name), &item_ptr->peer) && 0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) { ++item_ptr->refcnt; /* record file data */ backing_files[i] = item_ptr; break; } } if (!map_all && backing_files[i]) { continue; } temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t); if (!temp) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } temp->peer.vpid = rem_file->vpid; temp->peer.jobid = rem_file->jobid; temp->sm_file.file_name = strdup (rem_file->file_name); if (!temp->sm_file.file_name) { rc = OMPI_ERR_OUT_OF_RESOURCE; OBJ_RELEASE(temp); goto Error; } temp->sm_file.size = (size_t) rem_file->file_size; temp->sm_file.mpool_size = (size_t) rem_file->file_size; temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure; temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment; temp->refcnt = 1; /* Phase Five: If map_all == true, then we map every peer's file else we check to see if I have already mapped this vpid/jobid/filename combination and if I have, then I do not mmap this peer's file. * */ fd = open(temp->sm_file.file_name, O_RDWR, 0600); if (0 > fd) { opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. errno = %d", temp->sm_file.file_name, errno); rc = OMPI_ERROR; goto Error; } /* map the file */ temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size, temp->sm_file.size_ctl_structure, temp->sm_file.data_seg_alignment, temp->sm_file.file_name); close (fd); if (NULL == temp->sm_mmap) { opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file"); OBJ_RELEASE(temp); rc = OMPI_ERROR; goto Error; } /* compute memory offset */ mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr - (ptrdiff_t) temp->sm_mmap->map_seg; temp->sm_mmap->map_seg->seg_offset = mem_offset; temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset; /* more stuff to follow */ /* append this peer's info, including shared memory map addr, onto the peer_list */ /* record file data */ backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp; opal_list_append(peer_list, (opal_list_item_t*) temp); } rc = OMPI_SUCCESS; Error: /* error clean-up and return */ if (NULL != all_files) { free(all_files); } return rc; }
static inline int ompi_osc_pt2pt_rget_accumulate_internal (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count, struct ompi_datatype_t *result_datatype, int target_rank, MPI_Aint target_disp, int target_count, struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, struct ompi_win_t *win, bool release_req, struct ompi_request_t **request) { int ret; ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target_rank); bool is_long_datatype = false; bool is_long_msg = false; ompi_osc_pt2pt_frag_t *frag; ompi_osc_pt2pt_header_acc_t *header; ompi_osc_pt2pt_sync_t *pt2pt_sync; size_t ddt_len, payload_len, frag_len; char *ptr; const void *packed_ddt; int tag; ompi_osc_pt2pt_request_t *pt2pt_request; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, 0x%x, %d, %d, %s, %s, %s", (unsigned long) origin_addr, origin_count, origin_datatype->name, (unsigned long) result_addr, result_count, result_datatype->name, target_rank, (int) target_disp, target_count, target_datatype->name, op->o_name, win->w_name)); pt2pt_sync = ompi_osc_pt2pt_module_sync_lookup (module, target_rank, NULL); if (OPAL_UNLIKELY(NULL == pt2pt_sync)) { return OMPI_ERR_RMA_SYNC; } /* get_accumulates are always request based, so that we know where to land the data */ OMPI_OSC_PT2PT_REQUEST_ALLOC(win, pt2pt_request); pt2pt_request->internal = release_req; /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */ if (0 == result_count || 0 == target_count) { ompi_osc_pt2pt_request_complete (pt2pt_request, MPI_SUCCESS); *request = &pt2pt_request->super; return OMPI_SUCCESS; } if (!release_req) { /* wait for epoch to begin before starting operation */ ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync); } /* optimize the self case. TODO: optimize the local case */ if (ompi_comm_rank (module->comm) == target_rank) { *request = &pt2pt_request->super; return ompi_osc_pt2pt_gacc_self (pt2pt_sync, origin_addr, origin_count, origin_datatype, result_addr, result_count, result_datatype, target_disp, target_count, target_datatype, op, module, pt2pt_request); } pt2pt_request->type = OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC; pt2pt_request->origin_addr = origin_addr; pt2pt_request->origin_count = origin_count; OMPI_DATATYPE_RETAIN(origin_datatype); pt2pt_request->origin_dt = origin_datatype; /* Compute datatype and payload lengths. 
Note that the datatype description * must fit in a single frag */ ddt_len = ompi_datatype_pack_description_length(target_datatype); if (&ompi_mpi_op_no_op.op != op) { payload_len = origin_datatype->super.size * origin_count; } else { payload_len = 0; } frag_len = sizeof(*header) + ddt_len + payload_len; ret = ompi_osc_pt2pt_frag_alloc(module, target_rank, frag_len, &frag, &ptr, false, release_req); if (OMPI_SUCCESS != ret) { frag_len = sizeof(*header) + ddt_len; ret = ompi_osc_pt2pt_frag_alloc(module, target_rank, frag_len, &frag, &ptr, true, release_req); if (OMPI_SUCCESS != ret) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(*header) + 8; ret = ompi_osc_pt2pt_frag_alloc(module, target_rank, frag_len, &frag, &ptr, true, release_req); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } is_long_msg = true; } tag = get_tag (module); /* If this is a long message then we need two completions before the * request is complete (1 for the send, 1 for the receive) */ pt2pt_request->outstanding_requests = 1 + is_long_msg; /* increment the number of outgoing fragments */ ompi_osc_signal_outgoing (module, target_rank, pt2pt_request->outstanding_requests); header = (ompi_osc_pt2pt_header_acc_t *) ptr; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; header->op = op->o_f_to_c_index; header->tag = tag; ptr = (char *)(header + 1); do { ret = ompi_datatype_get_pack_description(target_datatype, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. send it separately */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE; OMPI_DATATYPE_RETAIN(target_datatype); ret = ompi_osc_pt2pt_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target_rank, tag_to_target(tag), module->comm, ompi_osc_pt2pt_dt_send_complete, target_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } ret = ompi_osc_pt2pt_irecv_w_cb (result_addr, result_count, result_datatype, target_rank, tag_to_origin(tag), module->comm, NULL, ompi_osc_pt2pt_req_comm_complete, pt2pt_request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC; osc_pt2pt_hton(header, proc); if (&ompi_mpi_op_no_op.op != op) { osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_datatype); } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG; osc_pt2pt_hton(header, proc); ret = ompi_osc_pt2pt_isend_w_cb (origin_addr, origin_count, origin_datatype, target_rank, tag_to_target(tag), module->comm, ompi_osc_pt2pt_req_comm_complete, pt2pt_request); } } while (0); if (OMPI_SUCCESS == ret) { header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_VALID; *request = (ompi_request_t *) pt2pt_request; } return ompi_osc_pt2pt_frag_finish(module, frag); }
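The allocation ladder above (full payload inline, then header plus packed datatype, then header plus an 8-byte datatype length) is what selects between the eager, long-message, and large-datatype protocols; the same cascade appears in the accumulate and put paths below. A compilable sketch of that decision, assuming a fixed eager limit in place of ompi_osc_pt2pt_frag_alloc; all names here are hypothetical:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for the fragment allocator: succeeds only while the
 * requested length fits in an (illustrative) eager fragment. */
static const size_t eager_limit = 4096;
static int try_alloc(size_t len) { return len <= eager_limit ? 0 : -1; }

struct plan {
    bool   long_msg;       /* payload travels in a separate send */
    bool   long_datatype;  /* packed datatype travels in a separate send */
    size_t frag_len;       /* bytes reserved in the eager fragment */
};

/* Mirrors the three-step fallback used above; returns 0 on success. */
static int choose_protocol(size_t header_len, size_t ddt_len, size_t payload_len,
                           struct plan *p)
{
    p->long_msg = p->long_datatype = false;

    /* 1. fully eager: header + packed datatype + payload in one fragment */
    p->frag_len = header_len + ddt_len + payload_len;
    if (0 == try_alloc(p->frag_len)) return 0;

    /* 2. long message: inline the datatype, stream the payload separately */
    p->long_msg = true;
    p->frag_len = header_len + ddt_len;
    if (0 == try_alloc(p->frag_len)) return 0;

    /* 3. large datatype: inline only the header plus an 8-byte datatype length,
     *    then stream the packed datatype and the payload separately */
    p->long_datatype = true;
    p->frag_len = header_len + 8;
    return try_alloc(p->frag_len);
}

int main(void)
{
    struct plan p;
    if (0 == choose_protocol(64, 256, 1 << 20, &p)) {
        printf("long_msg=%d long_datatype=%d frag_len=%zu\n",
               p.long_msg, p.long_datatype, p.frag_len);
    }
    return 0;
}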
static int ompi_osc_pt2pt_accumulate_w_req (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, ompi_win_t *win, ompi_osc_pt2pt_request_t *request) { int ret; ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); bool is_long_datatype = false; bool is_long_msg = false; ompi_osc_pt2pt_frag_t *frag; ompi_osc_pt2pt_header_acc_t *header; ompi_osc_pt2pt_sync_t *pt2pt_sync; size_t ddt_len, payload_len, frag_len; char *ptr; const void *packed_ddt; int tag = -1; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "acc: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, op->o_name, win->w_name)); pt2pt_sync = ompi_osc_pt2pt_module_sync_lookup (module, target, NULL); if (OPAL_UNLIKELY(NULL == pt2pt_sync)) { return OMPI_ERR_RMA_SYNC; } /* short-circuit case */ if (0 == origin_count || 0 == target_count) { if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; } /* optimize the self case. TODO: optimize the local case */ if (ompi_comm_rank (module->comm) == target) { return ompi_osc_pt2pt_acc_self (pt2pt_sync, origin_addr, origin_count, origin_dt, target_disp, target_count, target_dt, op, module, request); } /* Compute datatype and payload lengths. Note that the datatype description * must fit in a single frag */ ddt_len = ompi_datatype_pack_description_length(target_dt); payload_len = origin_dt->super.size * origin_count; frag_len = sizeof(*header) + ddt_len + payload_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr); if (OMPI_SUCCESS != ret) { frag_len = sizeof(*header) + ddt_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr); if (OMPI_SUCCESS != ret) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(*header) + 8; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } is_long_msg = true; tag = get_rtag (module); } /* flush will be called at the end of this function. make sure all post messages have * arrived. */ if ((is_long_msg || request) && OMPI_OSC_PT2PT_SYNC_TYPE_PSCW == pt2pt_sync->type) { ompi_osc_pt2pt_sync_wait (pt2pt_sync); } header = (ompi_osc_pt2pt_header_acc_t*) ptr; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; header->op = op->o_f_to_c_index; ptr += sizeof (*header); do { ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. 
send it separately */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE; OBJ_RETAIN(target_dt); ret = ompi_osc_pt2pt_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target, tag, module->comm, ompi_osc_pt2pt_dt_send_complete, target_dt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC; osc_pt2pt_hton(header, proc); osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); /* the user's buffer is no longer needed so mark the request as * complete. */ if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG; header->tag = tag; osc_pt2pt_hton(header, proc); OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: starting long accumulate with tag %d", tag)); /* increment the outgoing send count */ ompi_osc_signal_outgoing (module, target, 1); if (request) { request->outstanding_requests = 1; ret = ompi_osc_pt2pt_isend_w_cb (origin_addr, origin_count, origin_dt, target, tag, module->comm, ompi_osc_pt2pt_req_comm_complete, request); } else { ret = ompi_osc_pt2pt_component_isend (module, origin_addr, origin_count, origin_dt, target, tag, module->comm); } } } while (0); if (OMPI_SUCCESS != ret) { OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: failed with error %d", ret)); } else { /* mark the fragment as valid */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_VALID; } ret = ompi_osc_pt2pt_frag_finish(module, frag); if (is_long_msg || request) { /* need to flush now in case the caller decides to wait on the request */ ompi_osc_pt2pt_frag_flush_target (module, target); } return ret; }
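In both accumulate paths a request can be completed as soon as the payload is copied into the eager fragment, while the long protocol keeps outstanding_requests non-zero until its completion callbacks fire (one per posted send/receive). A small sketch of that counting idea, assuming C11 atomics and a callback-driven decrement; the struct and function names are invented for illustration and are not the component's types:

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical request object: it is "done" only after every outstanding
 * operation (e.g. one send and one receive for a long get_accumulate)
 * has signalled completion. */
struct fake_request {
    atomic_int outstanding;
    int        done;
};

/* What a send/recv completion callback would do with such a request. */
static void operation_complete(struct fake_request *req)
{
    if (1 == atomic_fetch_sub(&req->outstanding, 1)) {
        req->done = 1;   /* the last outstanding operation just finished */
    }
}

int main(void)
{
    struct fake_request req = { .outstanding = 2, .done = 0 };
    operation_complete(&req);                    /* outgoing data sent */
    printf("after send: done=%d\n", req.done);   /* still 0 */
    operation_complete(&req);                    /* reply landed */
    printf("after recv: done=%d\n", req.done);   /* now 1 */
    return 0;
}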
static int setup_scatter_buffers_linear(struct ompi_communicator_t *comm, ompi_coll_portals4_request_t *request, mca_coll_portals4_module_t *portals4_module) { int ret, line; int8_t i_am_root = (request->u.scatter.my_rank == request->u.scatter.root_rank); ompi_coll_portals4_create_send_converter (&request->u.scatter.send_converter, request->u.scatter.pack_src_buf, ompi_comm_peer_lookup(comm, request->u.scatter.my_rank), request->u.scatter.pack_src_count, request->u.scatter.pack_src_dtype); opal_convertor_get_packed_size(&request->u.scatter.send_converter, &request->u.scatter.packed_size); OBJ_DESTRUCT(&request->u.scatter.send_converter); /**********************************/ /* Setup Scatter Buffers */ /**********************************/ if (i_am_root) { /* * calculate the total size of the packed data */ request->u.scatter.scatter_bytes=request->u.scatter.packed_size * (ptrdiff_t)request->u.scatter.size; /* all transfers done using request->u.scatter.sdtype. * allocate temp buffer for recv, copy and/or rotate data at the end */ request->u.scatter.scatter_buf = (char *) malloc(request->u.scatter.scatter_bytes); if (NULL == request->u.scatter.scatter_buf) { ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr; } request->u.scatter.free_after = 1; for (int32_t i=0; i<request->u.scatter.size; i++) { uint32_t iov_count = 1; struct iovec iov; size_t max_data; uint64_t offset = request->u.scatter.pack_src_extent * request->u.scatter.pack_src_count * i; opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): offset(%lu)", __FILE__, __LINE__, request->u.scatter.my_rank, offset); ompi_coll_portals4_create_send_converter (&request->u.scatter.send_converter, request->u.scatter.pack_src_buf + offset, ompi_comm_peer_lookup(comm, request->u.scatter.my_rank), request->u.scatter.pack_src_count, request->u.scatter.pack_src_dtype); iov.iov_len = request->u.scatter.packed_size; iov.iov_base = (IOVBASE_TYPE *) ((char *)request->u.scatter.scatter_buf + (request->u.scatter.packed_size*i)); opal_convertor_pack(&request->u.scatter.send_converter, &iov, &iov_count, &max_data); OBJ_DESTRUCT(&request->u.scatter.send_converter); } opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): root - scatter_buf(%p) - scatter_bytes(%lu)=packed_size(%ld) * size(%d)", __FILE__, __LINE__, request->u.scatter.my_rank, request->u.scatter.scatter_buf, request->u.scatter.scatter_bytes, request->u.scatter.packed_size, request->u.scatter.size); } else { request->u.scatter.scatter_bytes=request->u.scatter.packed_size; request->u.scatter.scatter_buf = (char *) malloc(request->u.scatter.scatter_bytes); if (NULL == request->u.scatter.scatter_buf) { ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr; } request->u.scatter.free_after = 1; opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): leaf - scatter_buf(%p) - scatter_bytes(%lu)=packed_size(%ld)", __FILE__, __LINE__, request->u.scatter.my_rank, request->u.scatter.scatter_buf, request->u.scatter.scatter_bytes, request->u.scatter.packed_size); } return OMPI_SUCCESS; err_hdlr: opal_output(ompi_coll_base_framework.framework_output, "%s:%4d:%4d\tError occurred ret=%d, rank %2d", __FILE__, __LINE__, line, ret, request->u.scatter.my_rank); return ret; }
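In the root branch above, the source offset for rank i advances by extent * count per rank while the packed image lands at packed_size * i in the dense staging buffer. A rough user-level equivalent using MPI_Pack (the standard counterpart of the opal_convertor packing done here); the MPI_INT payload and block size are illustrative, and a real scatter of the staging buffer would follow the loop:

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int count = 4;                 /* elements destined for each rank */
    int packed_size;
    MPI_Pack_size(count, MPI_INT, MPI_COMM_WORLD, &packed_size);

    if (0 == rank) {
        int *src = malloc((size_t) size * count * sizeof(int));
        for (int i = 0; i < size * count; ++i) src[i] = i;

        char *staging = malloc((size_t) size * packed_size);
        for (int i = 0; i < size; ++i) {
            int pos = 0;
            /* source block for rank i starts count*i elements in; its packed
             * image starts packed_size*i bytes into the staging buffer */
            MPI_Pack(src + (size_t) count * i, count, MPI_INT,
                     staging + (size_t) packed_size * i, packed_size, &pos,
                     MPI_COMM_WORLD);
        }
        free(staging);
        free(src);
    }
    MPI_Finalize();
    return 0;
}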
int ompi_osc_pt2pt_module_start(ompi_group_t *group, int assert, ompi_win_t *win) { int i, ret = OMPI_SUCCESS; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); OBJ_RETAIN(group); ompi_group_increment_proc_count(group); OPAL_THREAD_LOCK(&(module->p2p_lock)); if (NULL != module->p2p_sc_group) { OPAL_THREAD_UNLOCK(&module->p2p_lock); ret = MPI_ERR_RMA_SYNC; goto cleanup; } module->p2p_sc_group = group; /* possible we've already received a couple of incoming messages, so add however many we're going to wait for */ module->p2p_num_post_msgs += ompi_group_size(module->p2p_sc_group); OPAL_THREAD_UNLOCK(&(module->p2p_lock)); memset(module->p2p_sc_remote_active_ranks, 0, sizeof(bool) * ompi_comm_size(module->p2p_comm)); /* for each process in the specified group, find its rank in our communicator, store those indexes, and set the true / false in the active ranks table */ for (i = 0 ; i < ompi_group_size(group) ; i++) { int comm_rank = -1, j; /* find the rank in the communicator associated with this window */ for (j = 0 ; j < ompi_comm_size(module->p2p_comm) ; ++j) { if (ompi_group_peer_lookup(module->p2p_sc_group, i) == ompi_comm_peer_lookup(module->p2p_comm, j)) { comm_rank = j; break; } } if (comm_rank == -1) { ret = MPI_ERR_RMA_SYNC; goto cleanup; } module->p2p_sc_remote_active_ranks[comm_rank] = true; module->p2p_sc_remote_ranks[i] = comm_rank; } /* Set our mode to access w/ start */ ompi_win_remove_mode(win, OMPI_WIN_FENCE); ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED); return OMPI_SUCCESS; cleanup: ompi_group_decrement_proc_count(group); OBJ_RELEASE(group); return ret; }
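The nested loop above maps each rank of the start group to its rank in the window's communicator by comparing proc pointers. At the MPI level the same mapping is what MPI_Group_translate_ranks computes; a short sketch, using an illustrative group made of the even world ranks:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Group world_grp, even_grp;
    MPI_Comm_group(MPI_COMM_WORLD, &world_grp);

    /* illustrative "start" group: the even world ranks */
    int n = (size + 1) / 2;
    int *even = malloc(n * sizeof(int));
    for (int i = 0; i < n; ++i) even[i] = 2 * i;
    MPI_Group_incl(world_grp, n, even, &even_grp);

    /* map each group rank back to its communicator rank - the standard-API
     * equivalent of the proc-pointer comparison loop above */
    int *grp_ranks = malloc(n * sizeof(int));
    int *comm_ranks = malloc(n * sizeof(int));
    for (int i = 0; i < n; ++i) grp_ranks[i] = i;
    MPI_Group_translate_ranks(even_grp, n, grp_ranks, world_grp, comm_ranks);

    if (0 == rank) {
        for (int i = 0; i < n; ++i) {
            printf("group rank %d -> comm rank %d\n", i, comm_ranks[i]);
        }
    }

    MPI_Group_free(&even_grp);
    MPI_Group_free(&world_grp);
    free(even); free(grp_ranks); free(comm_ranks);
    MPI_Finalize();
    return 0;
}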
static inline int ompi_osc_rdma_put_w_req (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, ompi_win_t *win, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); ompi_osc_rdma_frag_t *frag; ompi_osc_rdma_header_put_t *header; size_t ddt_len, payload_len, frag_len; bool is_long_datatype = false; bool is_long_msg = false; const void *packed_ddt; int tag = -1, ret; char *ptr; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, win->w_name)); if (!ompi_osc_rdma_check_access_epoch (module, target)) { return OMPI_ERR_RMA_SYNC; } /* short-circuit case */ if (0 == origin_count || 0 == target_count) { if (request) { ompi_osc_rdma_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; } /* optimize self communication. TODO: optimize local communication */ if (ompi_comm_rank (module->comm) == target) { return ompi_osc_rdma_put_self (origin_addr, origin_count, origin_dt, target_disp, target_count, target_dt, module, request); } /* Compute datatype and payload lengths. Note that the datatype description * must fit in a single buffer */ ddt_len = ompi_datatype_pack_description_length(target_dt); payload_len = origin_dt->super.size * origin_count; frag_len = sizeof(ompi_osc_rdma_header_put_t) + ddt_len + payload_len; OPAL_THREAD_LOCK(&module->lock); ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { frag_len = sizeof(ompi_osc_rdma_header_put_t) + ddt_len; ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(ompi_osc_rdma_header_put_t) + 8; ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } is_long_msg = true; tag = get_tag(module); } /* flush will be called at the end of this function. make sure the post message has * arrived. */ if ((is_long_msg || request) && module->sc_group) { while (0 != module->num_post_msgs) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); opal_condition_wait(&module->cond, &module->lock); } } OPAL_THREAD_UNLOCK(&module->lock); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "osc rdma: put long protocol: %d, large datatype: %d", (int) is_long_msg, (int) is_long_datatype)); header = (ompi_osc_rdma_header_put_t *) ptr; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; ptr += sizeof(ompi_osc_rdma_header_put_t); do { ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. 
send it separately */ header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE; OBJ_RETAIN(target_dt); ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target, tag, module->comm, ompi_osc_rdma_dt_send_complete, target_dt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } if (!is_long_msg) { header->base.type = OMPI_OSC_RDMA_HDR_TYPE_PUT; osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); /* the user's buffer is no longer needed so mark the request as * complete. */ if (request) { ompi_osc_rdma_request_complete (request, MPI_SUCCESS); } } else { header->base.type = OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG; header->tag = tag; /* increase the outgoing signal count */ ompi_osc_signal_outgoing (module, target, 1); if (request) { request->outstanding_requests = 1; ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_dt, target, tag, module->comm, ompi_osc_rdma_req_comm_complete, request); } else { ret = ompi_osc_rdma_component_isend (module, origin_addr, origin_count, origin_dt, target, tag, module->comm); } } } while (0); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID; } OPAL_THREAD_LOCK(&module->lock); ret = ompi_osc_rdma_frag_finish(module, frag); if (request || is_long_msg) { /* need to flush now in case the caller decides to wait on the request */ ompi_osc_rdma_frag_flush_target (module, target); } OPAL_THREAD_UNLOCK(&module->lock); return ret; }
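The put path above is what ultimately services a user-level MPI_Put; whether the eager or long protocol is taken is invisible to the application. A minimal usage sketch with a passive-target lock/unlock epoch (illustrative; run with at least two ranks):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    int window_buf = -1;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Win_create(&window_buf, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    if (size > 1 && 0 == rank) {
        int value = 42;
        /* passive-target epoch; the osc component decides internally whether
         * the transfer goes eagerly or via the long protocol */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win);
        MPI_Put(&value, 1, MPI_INT, 1 /* target */, 0 /* disp */, 1, MPI_INT, win);
        MPI_Win_unlock(1, win);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    if (size > 1 && 1 == rank) {
        /* open and close an epoch on the local window to synchronize the
         * public and private copies before reading */
        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
        MPI_Win_unlock(1, win);
        printf("rank 1 window now holds %d\n", window_buf);
    }

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}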
int mca_pml_ob1_send(void *buf, size_t count, ompi_datatype_t * datatype, int dst, int tag, mca_pml_base_send_mode_t sendmode, ompi_communicator_t * comm) { mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; mca_pml_ob1_send_request_t *sendreq = alloca(mca_pml_base_send_requests.fl_frag_size); int16_t seqn; int rc; if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_BUFFERED == sendmode)) { /* large buffered sends *need* a real request so use isend instead */ ompi_request_t *brequest; rc = mca_pml_ob1_isend (buf, count, datatype, dst, tag, sendmode, comm, &brequest); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { return rc; } /* free the request and return. don't care if it completes now */ ompi_request_free (&brequest); return OMPI_SUCCESS; } if (OPAL_UNLIKELY(NULL == endpoint)) { return OMPI_ERR_UNREACH; } seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, endpoint, comm); if (OPAL_LIKELY(0 <= rc)) { return OMPI_SUCCESS; } } OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t); sendreq->req_send.req_base.req_proc = dst_proc; sendreq->src_des = NULL; MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, buf, count, datatype, dst, tag, comm, sendmode, false); PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, &sendreq->req_send.req_base, PERUSE_SEND); MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc); if (rc != OMPI_SUCCESS) { return rc; } ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; MCA_PML_BASE_SEND_REQUEST_FINI(&sendreq->req_send); OBJ_DESTRUCT(sendreq); return rc; }
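mca_pml_ob1_send first tries the inline fast path and otherwise builds a send request on the stack and waits on it, so the blocking call rides on the same machinery as the nonblocking one. At the application level the two spellings below are equivalent; a small sketch (illustrative, needs at least two ranks):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int payload = 7, recvd = 0;
    if (0 == rank) {
        /* blocking send: ob1 attempts the inline path, otherwise it creates
         * a request and waits for it internally, as in the code above */
        MPI_Send(&payload, 1, MPI_INT, 1, 99, MPI_COMM_WORLD);

        /* the same semantics spelled out with the nonblocking interface */
        MPI_Request req;
        MPI_Isend(&payload, 1, MPI_INT, 1, 100, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
        MPI_Recv(&recvd, 1, MPI_INT, 0, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Recv(&recvd, 1, MPI_INT, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("rank 1 received %d\n", recvd);
    }
    MPI_Finalize();
    return 0;
}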