int
ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl,
                         struct opal_convertor_t *convertor,
                         struct ompi_message_t **message,
                         struct mca_mtl_request_t *mtl_request)
{
    ompi_mtl_portals4_recv_request_t *ptl_request =
        (ompi_mtl_portals4_recv_request_t*) mtl_request;
    void *start;
    size_t length;
    bool free_after;
    int ret;
    ompi_mtl_portals4_message_t *ptl_message =
        (ompi_mtl_portals4_message_t*) (*message)->req_ptr;

    ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

#if OPAL_ENABLE_DEBUG
    ptl_request->opcount =
        OPAL_THREAD_ADD64((int64_t*) &ompi_mtl_portals4.recv_opcount, 1);
    ptl_request->hdr_data = 0;
#endif
    ptl_request->super.type = portals4_req_recv;
    ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress;
    ptl_request->buffer_ptr = (free_after) ? start : NULL;
    ptl_request->convertor = convertor;
    ptl_request->delivery_ptr = start;
    ptl_request->delivery_len = length;
    ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
    ptl_request->pending_reply = 0;

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Mrecv %lu of length %ld (0x%lx)\n",
                         ptl_request->opcount,
                         (int64_t)length, (unsigned long) ptl_request));

    (*message) = MPI_MESSAGE_NULL;

    return ompi_mtl_portals4_recv_progress(&(ptl_message->ev),
                                           &ptl_request->super);
}
int mca_coll_hcoll_igatherv(const void* sbuf, int scount,
                            struct ompi_datatype_t *sdtype,
                            void* rbuf, const int *rcounts, const int *displs,
                            struct ompi_datatype_t *rdtype,
                            int root,
                            struct ompi_communicator_t *comm,
                            ompi_request_t ** request,
                            mca_coll_base_module_t *module)
{
    dte_data_representation_t stype;
    dte_data_representation_t rtype;
    int rc;
    void** rt_handle;

    HCOL_VERBOSE(20, "RUNNING HCOL IGATHERV");
    mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
    rt_handle = (void**) request;
    stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED);
    rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED);
    if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) {
        /* If we are here, the datatype is not a simple predefined datatype. */
        /* In the future we need to add a more complex mapping to dte_data_representation_t. */
        /* For now, use the fallback. */
        HCOL_VERBOSE(20, "Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback igatherv;",
                     sdtype->super.name, rdtype->super.name);
        rc = hcoll_module->previous_igatherv(sbuf, scount, sdtype,
                                             rbuf, rcounts, displs, rdtype, root,
                                             comm, request,
                                             hcoll_module->previous_igatherv_module);
        return rc;
    }
    rc = hcoll_collectives.coll_igatherv((void *)sbuf, scount, stype,
                                         rbuf, (int *)rcounts, (int *)displs, rtype,
                                         root, hcoll_module->hcoll_context, rt_handle);
    if (HCOLL_SUCCESS != rc) {
        HCOL_VERBOSE(20, "RUNNING FALLBACK IGATHERV");
        rc = hcoll_module->previous_igatherv(sbuf, scount, sdtype,
                                             rbuf, rcounts, displs, rdtype, root,
                                             comm, request,
                                             hcoll_module->previous_igatherv_module);
    }
    return rc;
}
/* Return the largest data size that can be packed into max_len using the
 * given convertor.  For example, a 1000 byte max_len buffer may only be able
 * to hold 998 bytes if an indivisible convertor element straddles the 1000
 * byte boundary.
 *
 * This routine internally clones the convertor and does not mutate it!
 */
size_t opal_btl_usnic_convertor_pack_peek(
    const opal_convertor_t *conv,
    size_t max_len)
{
    int rc;
    size_t packable_len, position;
    opal_convertor_t temp;

    OBJ_CONSTRUCT(&temp, opal_convertor_t);
    position = conv->bConverted + max_len;
    rc = opal_convertor_clone_with_position(conv, &temp, 1, &position);
    if (OPAL_UNLIKELY(rc < 0)) {
        BTL_ERROR(("unexpected convertor error"));
        abort(); /* XXX */
    }
    assert(position >= conv->bConverted);
    packable_len = position - conv->bConverted;
    OBJ_DESTRUCT(&temp);
    return packable_len;
}
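/* A minimal usage sketch (not from the original source): peeking before
 * packing lets a sender size a fragment to whole convertor elements.  The
 * caller is assumed to hold a convertor already prepared for send; the
 * function name below is illustrative only. */
static size_t usable_fragment_bytes(const opal_convertor_t *conv, size_t frag_space)
{
    /* how many bytes of whole elements fit in frag_space? */
    size_t usable = opal_btl_usnic_convertor_pack_peek(conv, frag_space);

    /* usable <= frag_space; any gap is the partial element that would
     * straddle the fragment boundary */
    return usable;
}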
int mca_btl_ugni_ep_handle_cleanup (mca_btl_ugni_endpoint_handle_t *ep_handle)
{
    int rc;

    if (0 == ep_handle->gni_handle) {
        return OPAL_SUCCESS;
    }

    /* TODO: need to fix, may be outstanding tx's, etc. */
    rc = GNI_EpUnbind (ep_handle->gni_handle);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
        /* should warn */
    } else {
        (void) GNI_EpDestroy (ep_handle->gni_handle);
    }

    ep_handle->gni_handle = 0;

    return OPAL_SUCCESS;
}
static int mca_btl_ugni_smsg_setup (void)
{
    gni_smsg_attr_t tmp_smsg_attrib;
    unsigned int mbox_size;
    size_t nprocs;
    gni_return_t rc;

    (void) ompi_proc_world (&nprocs);

    if (0 == mca_btl_ugni_component.ugni_smsg_limit) {
        /* auto-set the smsg limit based on the number of ranks */
        if (nprocs <= 512) {
            mca_btl_ugni_component.ugni_smsg_limit = 8192;
        } else if (nprocs <= 1024) {
            mca_btl_ugni_component.ugni_smsg_limit = 2048;
        } else if (nprocs <= 8192) {
            mca_btl_ugni_component.ugni_smsg_limit = 1024;
        } else if (nprocs <= 16384) {
            mca_btl_ugni_component.ugni_smsg_limit = 512;
        } else {
            mca_btl_ugni_component.ugni_smsg_limit = 256;
        }
    }

    mca_btl_ugni_component.smsg_max_data = mca_btl_ugni_component.ugni_smsg_limit -
        sizeof (mca_btl_ugni_send_frag_hdr_t);

    /* calculate mailbox size */
    tmp_smsg_attrib.msg_type       = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
    tmp_smsg_attrib.msg_maxsize    = mca_btl_ugni_component.ugni_smsg_limit;
    tmp_smsg_attrib.mbox_maxcredit = mca_btl_ugni_component.smsg_max_credits;

    rc = GNI_SmsgBufferSizeNeeded (&tmp_smsg_attrib, &mbox_size);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
        BTL_ERROR(("error in GNI_SmsgBufferSizeNeeded"));
        return ompi_common_rc_ugni_to_ompi (rc);
    }

    mca_btl_ugni_component.smsg_mbox_size = OPAL_ALIGN(mbox_size, 64, unsigned int);

    return OMPI_SUCCESS;
}
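/* A standalone sketch (not from the original source) of the final rounding
 * step above.  The OPAL_ALIGN definition is reproduced here, as found in
 * opal/include/opal/align.h, so the example compiles on its own; the 1000
 * and 1024 values are illustrative. */
#include <assert.h>

#define OPAL_ALIGN(x, a, t) (((x) + (((t)(a)) - 1)) & ~(((t)(a)) - 1))

int main (void)
{
    /* a 1000-byte mailbox requirement rounds up to the next 64-byte multiple */
    assert (OPAL_ALIGN(1000u, 64, unsigned int) == 1024u);
    /* an already-aligned size is unchanged */
    assert (OPAL_ALIGN(1024u, 64, unsigned int) == 1024u);
    return 0;
}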
int mca_pml_crcpw_enable(bool enable)
{
    int ret;
    ompi_crcp_base_pml_state_t * pml_state = NULL;

    if( OPAL_UNLIKELY(NULL == ompi_crcp.pml_enable) ) {
        return mca_pml_crcpw_module.wrapped_pml_module.pml_enable(enable);
    }

    PML_CRCP_STATE_ALLOC(pml_state, ret);

    pml_state->wrapped_pml_component = &(mca_pml_crcpw_module.wrapped_pml_component);
    pml_state->wrapped_pml_module    = &(mca_pml_crcpw_module.wrapped_pml_module);

    pml_state->state = OMPI_CRCP_PML_PRE;
    pml_state = ompi_crcp.pml_enable(enable, pml_state);
    if( OMPI_SUCCESS != pml_state->error_code) {
        ret = pml_state->error_code;
        PML_CRCP_STATE_RETURN(pml_state);
        return ret;
    }

    if( OMPI_CRCP_PML_SKIP != pml_state->state) {
        if( OMPI_SUCCESS != (ret = mca_pml_crcpw_module.wrapped_pml_module.pml_enable(enable) ) ) {
            PML_CRCP_STATE_RETURN(pml_state);
            return ret;
        }
    }

    pml_state->state = OMPI_CRCP_PML_POST;
    pml_state = ompi_crcp.pml_enable(enable, pml_state);
    if( OMPI_SUCCESS != pml_state->error_code) {
        ret = pml_state->error_code;
        PML_CRCP_STATE_RETURN(pml_state);
        return ret;
    }

    PML_CRCP_STATE_RETURN(pml_state);

    return OMPI_SUCCESS;
}
/**
 * Handle the CUDA buffer.
 */
int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
                                        mca_bml_base_btl_t* bml_btl,
                                        size_t size)
{
    int rc;
#if OPAL_CUDA_SUPPORT_41
    sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
    if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
        unsigned char *base;
        opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
        /* Set flag back */
        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
        if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_cuda_btls(
                                                        sendreq->req_endpoint,
                                                        base,
                                                        sendreq->req_send.req_bytes_packed,
                                                        sendreq->req_rdma))) {
            rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
                                                     sendreq->req_send.req_bytes_packed);
            if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
                mca_pml_bfo_free_rdma_resources(sendreq);
            }
        } else {
            if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
                rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
                                                         MCA_PML_BFO_HDR_FLAGS_CONTIG);
            } else {
                rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
            }
        }
    } else {
        /* Do not send anything with the first rendezvous message, as copying GPU
         * memory into the RNDV message is expensive. */
        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
        rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }
#else
    /* Just do the rendezvous but set initial data to be sent to zero */
    rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
#endif /* OPAL_CUDA_SUPPORT_41 */
    return rc;
}
int mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
{
    mca_btl_ugni_base_frag_t *frag;
    int rc;

    while (NULL !=
           (frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list))) {
        rc = mca_btl_ugni_send_frag (endpoint, frag);
        if (OPAL_UNLIKELY(OMPI_SUCCESS > rc)) {
            if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc)) {
                opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
            } else {
                mca_btl_ugni_frag_complete (frag, rc);
            }

            return rc;
        }
    }

    return OMPI_SUCCESS;
}
/* Get a context to use for communication.
 * If TLS is supported, it will use the cached endpoint.
 * If not, it will invoke the normal round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* With TLS, we cache the context we use. */
    static volatile int64_t cur_num = 0;
    static opal_thread_local mca_btl_ofi_context_t *my_context = NULL;

    if (OPAL_UNLIKELY(my_context == NULL)) {
        OPAL_THREAD_LOCK(&btl->module_lock);

        my_context = &btl->contexts[cur_num];
        cur_num = (cur_num + 1) % btl->num_contexts;

        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert (my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}
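/* A minimal sketch, assuming the shape of the non-TLS fallback referenced
 * above: get_ofi_context_rr() is not shown in this section and its real body
 * may differ.  The idea is a round-robin over btl->contexts driven by an
 * atomic counter; opal_atomic_fetch_add_64() is OPAL's atomic fetch-and-add. */
static mca_btl_ofi_context_t *get_ofi_context_rr_sketch (mca_btl_ofi_module_t *btl)
{
    static opal_atomic_int64_t rr_num = 0;
    int64_t n = opal_atomic_fetch_add_64 (&rr_num, 1);

    /* each caller gets the next context in line, wrapping around */
    return &btl->contexts[n % btl->num_contexts];
}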
int mca_coll_hcoll_allgather(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    dte_data_representation_t stype;
    dte_data_representation_t rtype;
    int rc;

    HCOL_VERBOSE(20, "RUNNING HCOL ALLGATHER");
    mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
    stype = ompi_dtype_2_hcoll_dtype(sdtype, TRY_FIND_DERIVED);
    rtype = ompi_dtype_2_hcoll_dtype(rdtype, TRY_FIND_DERIVED);
    if (sbuf == MPI_IN_PLACE) {
        stype = rtype;
    }
    if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) {
        /* If we are here, the datatype is not a simple predefined datatype. */
        /* In the future we need to add a more complex mapping to dte_data_representation_t. */
        /* For now, use the fallback. */
        HCOL_VERBOSE(20, "Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback allgather;",
                     sdtype->super.name, rdtype->super.name);
        rc = hcoll_module->previous_allgather(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
                                              comm, hcoll_module->previous_allgather_module);
        return rc;
    }
    rc = hcoll_collectives.coll_allgather((void *)sbuf, scount, stype,
                                          rbuf, rcount, rtype,
                                          hcoll_module->hcoll_context);
    if (HCOLL_SUCCESS != rc) {
        HCOL_VERBOSE(20, "RUNNING FALLBACK ALLGATHER");
        rc = hcoll_module->previous_allgather(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
                                              comm, hcoll_module->previous_allgather_module);
    }
    return rc;
}
int
ompi_mtl_portals4_recv_short_init(void)
{
    int i;

    OBJ_CONSTRUCT(&ompi_mtl_portals4.short_block_mutex, opal_mutex_t);
    OBJ_CONSTRUCT(&(ompi_mtl_portals4.recv_short_blocks), opal_list_t);

    /* create the recv blocks */
    for (i = 0 ; i < ompi_mtl_portals4.recv_short_num ; ++i) {
        ompi_mtl_portals4_recv_short_block_t *block =
            ompi_mtl_portals4_recv_short_block_alloc(false);
        if (OPAL_UNLIKELY(NULL == block)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        opal_list_append(&ompi_mtl_portals4.recv_short_blocks, &block->base);
        ompi_mtl_portals4_activate_block(block);
    }

    return OMPI_SUCCESS;
}
int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                             uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                             mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                             int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_rcache_base_registration_t *reg;
    void *rem_ptr;

    reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr);
    if (OPAL_UNLIKELY(NULL == reg)) {
        return OPAL_ERROR;
    }

    vader_memmove (rem_ptr, local_address, size);

    vader_return_registration (reg, endpoint);

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
int NBC_Start(NBC_Handle *handle)
{
    int res;

    /* bozo case */
    if ((ompi_request_t *)handle == &ompi_request_empty) {
        return OMPI_SUCCESS;
    }

    /* kick off first round */
    handle->super.req_state = OMPI_REQUEST_ACTIVE;
    handle->super.req_status.MPI_ERROR = OMPI_SUCCESS;
    res = NBC_Start_round(handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
    }

    OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
    opal_list_append(&mca_coll_libnbc_component.active_requests,
                     &(handle->super.super.super));
    OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);

    return OMPI_SUCCESS;
}
static inline
int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
                                            struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count,
                                            struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer,
                                            int target_rank, MPI_Aint target_disp, int target_count,
                                            struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                                            ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    mca_btl_base_registration_handle_t *target_handle;
    uint64_t target_address;
    int ret;

    /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
    if ((result_addr && 0 == result_count) || 0 == target_count) {
        if (request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    ret = osc_rdma_get_remote_segment (module, peer, target_disp,
                                       target_datatype->super.size * target_count,
                                       &target_address, &target_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    if (ompi_osc_rdma_peer_local_base (peer)) {
        /* local/self optimization */
        return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype,
                                         result_addr, result_count, result_datatype,
                                         peer, target_address, target_handle, target_count,
                                         target_datatype, op, module, request);
    }

    return ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype,
                                      result_addr, result_count, result_datatype, peer,
                                      target_address, target_handle, target_count,
                                      target_datatype, op, request);
}
int
mca_pml_cm_irecv_init(void *addr,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int src,
                      int tag,
                      struct ompi_communicator_t *comm,
                      struct ompi_request_t **request)
{
    int ret;
    mca_pml_cm_hvy_recv_request_t *recvreq;
    ompi_proc_t* ompi_proc;

    MCA_PML_CM_HVY_RECV_REQUEST_ALLOC(recvreq, ret);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != ret) ) return ret;

    MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
                                     datatype, addr, count, true);

    *request = (ompi_request_t*) recvreq;

    return OMPI_SUCCESS;
}
/**
 * Hierarchical non-blocking barrier
 */
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
                               ompi_request_t **req,
                               mca_coll_base_module_t *module)
{
    int rc;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
#if OPAL_ENABLE_DEBUG
    /* only referenced from ML_VERBOSE, which compiles away unless
     * OPAL_ENABLE_DEBUG is set */
    static int barriers_count = 0;
#endif

    ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count));

    rc = mca_coll_ml_barrier_launch(ml_module, req);
    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        ML_ERROR(("Failed to launch a barrier."));
        return rc;
    }

    ML_VERBOSE(10, ("IBarrier num %d was done.", barriers_count));

    return OMPI_SUCCESS;
}
static inline int ompi_osc_pt2pt_get_self (ompi_osc_pt2pt_sync_t *pt2pt_sync, void *target, int target_count,
                                           ompi_datatype_t *target_datatype, OPAL_PTRDIFF_TYPE source_disp,
                                           int source_count, ompi_datatype_t *source_datatype,
                                           ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_request_t *request)
{
    void *source = (unsigned char*) module->baseptr +
        ((unsigned long) source_disp * module->disp_unit);
    int ret;

    /* if we are in active target mode wait until all post messages arrive */
    ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync);

    ret = ompi_datatype_sndrcv (source, source_count, source_datatype,
                                target, target_count, target_datatype);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    if (request) {
        ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS);
    }

    return OMPI_SUCCESS;
}
int mca_coll_ml_allgather_nb(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void* rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             ompi_request_t **req,
                             mca_coll_base_module_t *module)
{
    int ret;

    ML_VERBOSE(10, ("Starting non-blocking allgather"));

    ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
                                       rbuf, rcount, rdtype,
                                       comm, module, req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ML_VERBOSE(10, ("Non-blocking allgather started"));

    return ret;
}
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                              uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                              mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                              int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_vader_frag_t *frag;

    if (size > mca_btl_vader.super.btl_get_limit) {
        return OPAL_ERR_NOT_AVAILABLE;
    }

    frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
                                          local_address, remote_address, cbfunc, cbcontext, cbdata,
                                          mca_btl_vader_sc_emu_get_complete);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* send is always successful */
    (void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

    return OPAL_SUCCESS;
}
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count,
                              struct ompi_datatype_t *origin_datatype, int target_rank,
                              OPAL_PTRDIFF_TYPE target_disp, int target_count,
                              struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                              struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync;

    sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "acc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count,
                         target_datatype->name, op->o_name, win->w_name));

    return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype,
                                                   NULL, 0, NULL, peer, target_rank, target_disp,
                                                   target_count, target_datatype, op, NULL);
}
int mca_rcache_base_vma_find_all (mca_rcache_base_vma_module_t *vma_module, void *addr,
                                  size_t size, mca_rcache_base_registration_t **regs,
                                  int reg_cnt)
{
    int rc;
    unsigned char *bound_addr;

    if (size == 0) {
        return OPAL_ERROR;
    }

    bound_addr = (unsigned char *) ((intptr_t) addr + size - 1);

    /* Check to ensure that the cache is valid */
    if (OPAL_UNLIKELY(opal_memory_changed() &&
                      NULL != opal_memory->memoryc_process &&
                      OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) {
        return rc;
    }

    return mca_rcache_base_vma_tree_find_all (vma_module, (unsigned char *) addr,
                                              bound_addr, regs, reg_cnt);
}
/*
 * These functions can be used in order to create an IDENTICAL copy of one convertor. In this
 * context IDENTICAL means that the datatype and count and all other properties of the basic
 * convertor get replicated on this new convertor. However, the references to the datatype
 * are not increased. This function takes special care of the stack. In all cases the
 * stack is created with the correct number of entries, but if copy_stack is true (!= 0)
 * then the content of the old stack is copied onto the new one. The result will be a convertor
 * ready to use, starting from the old position. If copy_stack is false then the convertor
 * is created with an empty stack (you have to use opal_convertor_set_position before using it).
 */
int opal_convertor_clone( const opal_convertor_t* source,
                          opal_convertor_t* destination,
                          int32_t copy_stack )
{
    destination->remoteArch  = source->remoteArch;
    destination->flags       = source->flags;
    destination->pDesc       = source->pDesc;
    destination->use_desc    = source->use_desc;
    destination->count       = source->count;
    destination->pBaseBuf    = source->pBaseBuf;
    destination->fAdvance    = source->fAdvance;
    destination->master      = source->master;
    destination->local_size  = source->local_size;
    destination->remote_size = source->remote_size;
    /* create the stack */
    if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
        destination->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * source->stack_size );
    } else {
        destination->pStack = destination->static_stack;
    }
    destination->stack_size = source->stack_size;

    /* initialize the stack */
    if( OPAL_LIKELY(0 == copy_stack) ) {
        destination->bConverted = -1;
        destination->stack_pos  = -1;
    } else {
        memcpy( destination->pStack, source->pStack,
                sizeof(dt_stack_t) * (source->stack_pos+1) );
        destination->bConverted = source->bConverted;
        destination->stack_pos  = source->stack_pos;
    }
#if OPAL_CUDA_SUPPORT
    destination->cbmemcpy = source->cbmemcpy;
#endif
    return OPAL_SUCCESS;
}
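/* A minimal usage sketch (not from the original source) of the copy_stack == 0
 * path described in the comment above: the clone starts with an empty stack,
 * so it must be repositioned before any pack/unpack call.  The 'packed_so_far'
 * offset and the function name are illustrative only. */
static void clone_and_reposition (const opal_convertor_t *conv, size_t packed_so_far)
{
    opal_convertor_t copy;
    size_t position = packed_so_far;

    OBJ_CONSTRUCT(&copy, opal_convertor_t);

    /* copy_stack == 0: stack entries are allocated but left uninitialized */
    (void) opal_convertor_clone (conv, &copy, 0);
    /* rebuild the stack at the desired byte offset */
    (void) opal_convertor_set_position (&copy, &position);

    /* ... pack/unpack with 'copy' without perturbing 'conv' ... */

    OBJ_DESTRUCT(&copy);
}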
int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype,
                      int dst, int tag, mca_pml_base_send_mode_t mode,
                      struct ompi_communicator_t* comm,
                      struct ompi_request_t **request)
{
    ompi_request_t *req;
    ucp_ep_h ep;

    PML_UCX_TRACE_SEND("isend request *%p", buf, count, datatype, dst, tag, mode,
                       comm, (void*)request)

    /* TODO special care to sync/buffered send */

    ep = mca_pml_ucx_get_ep(comm, dst);
    if (OPAL_UNLIKELY(NULL == ep)) {
        PML_UCX_ERROR("Failed to get ep for rank %d", dst);
        return OMPI_ERROR;
    }

    req = (ompi_request_t*)ucp_tag_send_nb(ep, buf, count,
                                           mca_pml_ucx_get_datatype(datatype),
                                           PML_UCX_MAKE_SEND_TAG(tag, comm),
                                           mca_pml_ucx_send_completion);
    if (req == NULL) {
        PML_UCX_VERBOSE(8, "returning completed request");
        *request = &ompi_pml_ucx.completed_send_req;
        return OMPI_SUCCESS;
    } else if (!UCS_PTR_IS_ERR(req)) {
        PML_UCX_VERBOSE(8, "got request %p", (void*)req);
        *request = req;
        return OMPI_SUCCESS;
    } else {
        PML_UCX_ERROR("ucx send failed: %s", ucs_status_string(UCS_PTR_STATUS(req)));
        return OMPI_ERROR;
    }
}
/**
 * Allocate a segment.
 *
 * @param btl (IN)      BTL module
 * @param size (IN)     Request segment size.
 */
static mca_btl_base_descriptor_t *mca_btl_self_alloc (struct mca_btl_base_module_t *btl,
                                                      struct mca_btl_base_endpoint_t *endpoint,
                                                      uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_self_frag_t *frag = NULL;

    if (size <= MCA_BTL_SELF_MAX_INLINE_SIZE) {
        MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
    } else if (size <= mca_btl_self.btl_eager_limit) {
        MCA_BTL_SELF_FRAG_ALLOC_EAGER(frag);
    } else if (size <= btl->btl_max_send_size) {
        MCA_BTL_SELF_FRAG_ALLOC_SEND(frag);
    }

    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    frag->segments[0].seg_len = size;
    frag->base.des_segment_count = 1;
    frag->base.des_flags = flags;

    return &frag->base;
}
void mca_spml_yoda_put_completion(mca_btl_base_module_t* btl,
                                  struct mca_btl_base_endpoint_t* ep,
                                  struct mca_btl_base_descriptor_t* des,
                                  int status)
{
    mca_spml_yoda_rdma_frag_t* frag = (mca_spml_yoda_rdma_frag_t*) des->des_cbdata;
    mca_spml_yoda_put_request_t* putreq = (mca_spml_yoda_put_request_t*) frag->rdma_req;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;

    OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, -1);

    /* check completion status */
    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != status)) {
        /* no way to propagate errors. die */
        SPML_ERROR("FATAL put completion error");
        oshmem_shmem_abort(-1);
    }

    putreq->req_put.req_base.req_spml_complete = true;
    oshmem_request_complete(&putreq->req_put.req_base.req_oshmem, 1);
    oshmem_request_free((oshmem_request_t**) &putreq);

    mca_bml_base_free(bml_btl, des);
}
/**
 * Allocate a segment.
 *
 * @param btl (IN)      BTL module
 * @param size (IN)     Request segment size.
 */
mca_btl_base_descriptor_t* mca_btl_self_alloc(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    uint8_t order,
    size_t size,
    uint32_t flags)
{
    mca_btl_self_frag_t* frag = NULL;

    if(size <= mca_btl_self.btl_eager_limit) {
        MCA_BTL_SELF_FRAG_ALLOC_EAGER(frag);
    } else if (size <= btl->btl_max_send_size) {
        MCA_BTL_SELF_FRAG_ALLOC_SEND(frag);
    }
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    frag->segment.seg_len = size;
    frag->base.des_flags = flags;
    frag->base.des_src = &(frag->segment);
    frag->base.des_src_cnt = 1;

    return (mca_btl_base_descriptor_t*)frag;
}
int mca_rcache_vma_find(struct mca_rcache_base_module_t* rcache, void* addr,
                        size_t size, mca_mpool_base_registration_t **reg)
{
    int rc;
    unsigned char* bound_addr;

    if(size == 0) {
        return OMPI_ERROR;
    }

    bound_addr = (unsigned char*) addr + size - 1;

    /* Check to ensure that the cache is valid */
    if (OPAL_UNLIKELY(opal_memory_changed() &&
                      NULL != opal_memory->memoryc_process &&
                      OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) {
        return rc;
    }

    *reg = mca_rcache_vma_tree_find((mca_rcache_vma_module_t*)rcache,
                                    (unsigned char*)addr, bound_addr);

    return OMPI_SUCCESS;
}
/**
 * Initiate a synchronous get.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information
 * @param descriptor (IN)  Description of the data to be transferred
 */
int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
                       struct mca_btl_base_endpoint_t *endpoint,
                       struct mca_btl_base_descriptor_t *des)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des;
    mca_btl_base_segment_t *src = des->des_src;
    mca_btl_base_segment_t *dst = des->des_dst;
    const size_t size = min(dst->seg_len, src->seg_len);
    mca_mpool_base_registration_t *reg;
    void *rem_ptr;

    reg = vader_get_registation (endpoint, src->seg_addr.pval, src->seg_len, 0, &rem_ptr);
    if (OPAL_UNLIKELY(NULL == reg)) {
        return OMPI_ERROR;
    }

    vader_memmove (dst->seg_addr.pval, rem_ptr, size);

    vader_return_registration (reg, endpoint);

    mca_btl_vader_frag_complete (frag);

    return OMPI_SUCCESS;
}
static inline int mca_spml_yoda_put_internal(void *dst_addr,
                                             size_t size,
                                             void *src_addr,
                                             int dst,
                                             int is_nb)
{
    int rc = OSHMEM_SUCCESS;
    mca_spml_yoda_put_request_t *putreq = NULL;
    mca_bml_base_btl_t* bml_btl;
    mca_btl_base_descriptor_t* des = NULL;
    mca_btl_base_segment_t* segment;
    mca_spml_yoda_rdma_frag_t* frag;
    int nfrags;
    int i;
    unsigned ncopied = 0;
    unsigned int frag_size = 0;
    char *p_src, *p_dst;
    void* rva;
    sshmem_mkey_t *r_mkey;
    int btl_id = 0;
    struct yoda_btl *ybtl;
    int put_via_send;
    mca_btl_base_registration_handle_t *local_handle = NULL, *remote_handle = NULL;

    /* If there is nothing to put, it's OK. */
    if (0 >= size) {
        return OSHMEM_SUCCESS;
    }

    /* Find bml_btl and its global btl_id */
    bml_btl = get_next_btl(dst, &btl_id);
    if (!bml_btl) {
        SPML_ERROR("cannot reach %d pe: no appropriate btl found", oshmem_my_proc_id());
        rc = OSHMEM_ERR_FATAL;
        goto exit_fatal;
    }
    /* Check if the btl has a PUT method. If it doesn't, use SEND. */
    put_via_send = !(bml_btl->btl->btl_flags & MCA_BTL_FLAGS_PUT);

    /* Get the rkey of the remote PE (dst proc), which must be on the memheap. */
    r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, btl_id, &rva);
    if (!r_mkey) {
        SPML_ERROR("pe=%d: %p is not an address of a shared variable", dst, dst_addr);
        rc = OSHMEM_ERR_FATAL;
        goto exit_fatal;
    }

#if SPML_YODA_DEBUG == 1
    SPML_VERBOSE(100, "put: pe:%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
                 dst, dst_addr, src_addr, (int)size, (void *)rva,
                 mca_spml_base_mkey2str(r_mkey));
#endif

    ybtl = &mca_spml_yoda.btl_type_map[btl_id];

    if (ybtl->btl->btl_register_mem) {
        assert (r_mkey->len == ybtl->btl->btl_registration_handle_size);
        remote_handle = (mca_btl_base_registration_handle_t *) r_mkey->u.data;
    }

    /* Check if we are doing a put into an shm-attached segment; if so,
     * just do a memcpy. */
    if ((YODA_BTL_SM == ybtl->btl_type || YODA_BTL_VADER == ybtl->btl_type)
            && mca_memheap_base_can_local_copy(r_mkey, dst_addr)) {
        memcpy((void *) (unsigned long) rva, src_addr, size);
        return OSHMEM_SUCCESS;
    }

    /* We support only blocking PUT now => we always need a copy of the src buffer. */
    calc_nfrags_put (bml_btl, size, &frag_size, &nfrags, put_via_send);

    p_src = (char*) src_addr;
    p_dst = (char*) (unsigned long) rva;
    for (i = 0; i < nfrags; i++) {
        /* Allocate a send request from the free list */
        putreq = mca_spml_yoda_putreq_alloc(dst);
        frag = &putreq->put_frag;
        ncopied = i < nfrags - 1 ? frag_size :
            (unsigned) ((char *) src_addr + size - p_src);

        /* Prepare the source buffer */

        /* allocate buffer */
        mca_spml_yoda_bml_alloc(bml_btl,
                                &des,
                                MCA_BTL_NO_ORDER,
                                ncopied,
                                MCA_BTL_DES_SEND_ALWAYS_CALLBACK,
                                put_via_send);
        if (OPAL_UNLIKELY(!des || !des->des_segments)) {
            SPML_ERROR("src=%p nfrags = %d frag_size=%d",
                       src_addr, nfrags, frag_size);
            SPML_ERROR("shmem OOM error need %d bytes", ncopied);
            opal_show_help("help-oshmem-spml-yoda.txt",
                           "internal_oom_error",
                           true,
                           "Put", ncopied, mca_spml_yoda.bml_alloc_threshold);
            rc = OSHMEM_ERR_FATAL;
            goto exit_fatal;
        }

        /* copy data to the allocated buffer */
        segment = des->des_segments;
        spml_yoda_prepare_for_put((void*)segment->seg_addr.pval, ncopied,
                                  (void*)p_src, (void*)p_dst, put_via_send);

        if (!put_via_send && ybtl->btl->btl_register_mem) {
            local_handle = ybtl->btl->btl_register_mem (ybtl->btl, bml_btl->btl_endpoint,
                                                        segment->seg_addr.pval, ncopied, 0);
            if (NULL == local_handle) {
                /* No free resources, block on completion here */
                SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE");
                oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem);
            }
        }

        frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t) p_dst;
        frag->rdma_segs[0].base_seg.seg_len = (put_via_send ?
                                                   ncopied + SPML_YODA_SEND_CONTEXT_SIZE :
                                                   ncopied);
        frag->rdma_req = putreq;

        /* initialize callback data for the put */
        des->des_cbdata = frag;
        des->des_cbfunc = mca_spml_yoda_put_completion;

        OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1);
        /* put the data to the remote side */
        if (!put_via_send) {
            rc = mca_bml_base_put (bml_btl, segment->seg_addr.pval, (uint64_t) (intptr_t) p_dst,
                                   local_handle, remote_handle, ncopied, 0, 0,
                                   mca_spml_yoda_put_completion_rdma, des);
        } else {
            rc = mca_bml_base_send(bml_btl, des, MCA_SPML_YODA_PUT);
            if (1 == rc)
                rc = OSHMEM_SUCCESS;
        }

        if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) {
            if (OSHMEM_ERR_OUT_OF_RESOURCE == rc) {
                /* No free resources, block on completion here */
                SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE");
                oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem);
            } else {
                SPML_ERROR("shmem error");
            }
            /* exit with error */
            SPML_ERROR("shmem error: ret = %i, send_pe = %i, dest_pe = %i",
                       rc, oshmem_my_proc_id(), dst);
            rc = OSHMEM_ERR_FATAL;
            goto exit_fatal;
        }

        p_src += ncopied;
        p_dst += ncopied;
    }

    return rc;

exit_fatal:
    if (OSHMEM_SUCCESS != rc) {
        oshmem_shmem_abort(rc);
    }
    return rc;
}
static void mca_yoda_get_callback(mca_btl_base_module_t* btl,
                                  mca_btl_base_tag_t tag,
                                  mca_btl_base_descriptor_t* des,
                                  void* cbdata )
{
    void** p, ** p_src, **p_dst;
    size_t* size;
    int* dst;
    void** p_getreq;
    mca_btl_base_descriptor_t* des_loc;
    int rc;
    mca_bml_base_btl_t* bml_btl;
    mca_spml_yoda_rdma_frag_t* frag;
    int btl_id;
    mca_spml_yoda_put_request_t *putreq;

    rc = OSHMEM_SUCCESS;
    btl_id = 0;
    putreq = NULL;

    /* Unpack data */
    p = (void **)des->des_segments->seg_addr.pval;
    p_src = (void*) p;

    size = (size_t*)((char*)p_src + sizeof(*p_src) );
    dst = (int*)( (char*)size + sizeof(*size));
    p_dst = (void*) ((char*)dst + sizeof(*dst));
    p_getreq =(void**) ( (char*)p_dst + sizeof(*p_dst));

    /* Prepare put via send */
    bml_btl = get_next_btl(*dst, &btl_id);

    putreq = mca_spml_yoda_putreq_alloc(*dst);
    frag = &putreq->put_frag;

    mca_spml_yoda_bml_alloc(bml_btl,
                            &des_loc,
                            MCA_BTL_NO_ORDER,
                            *size,
                            MCA_BTL_DES_SEND_ALWAYS_CALLBACK,
                            1);

    if (OPAL_UNLIKELY(!des_loc || !des_loc->des_segments)) {
        SPML_ERROR("shmem OOM error need %d bytes", (int)*size);
        oshmem_shmem_abort(-1);
    }
    spml_yoda_prepare_for_get_response((void*)des_loc->des_segments->seg_addr.pval, *size,
                                       (void*)*p_src, (void*) *p_dst, (void*)*p_getreq, 1);

    frag->rdma_req = putreq;

    /* Initialize callback data for the put */
    des_loc->des_cbdata = frag;
    des_loc->des_cbfunc = mca_spml_yoda_put_completion;
    des_loc->des_segment_count = 1;
    OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1);

    /* Put via send */
    rc = mca_bml_base_send(bml_btl, des_loc, MCA_SPML_YODA_GET_RESPONSE);
    if (1 == rc) {
        rc = OSHMEM_SUCCESS;
    }

    if (OPAL_UNLIKELY(OSHMEM_SUCCESS != rc)) {
        if (OSHMEM_ERR_OUT_OF_RESOURCE == rc) {
            /* No free resources, block on completion here */
            SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE");
            oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem);
        } else {
            SPML_ERROR("shmem error");
        }
        /* exit with error */
        SPML_ERROR("shmem error: ret = %i, send_pe = %i, dest_pe = %i",
                   rc, oshmem_my_proc_id(), *dst);
        oshmem_shmem_abort(-1);
        rc = OSHMEM_ERROR;
    }
}