static inline int
a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule,
                  const void *sendbuf, const int *sendcounts, const int *sdispls,
                  struct ompi_datatype_t * const * sendtypes,
                  void *recvbuf, const int *recvcounts, const int *rdispls,
                  struct ompi_datatype_t * const * recvtypes)
{
    int res;

    for (int i = 0; i < p; i++) {
        ptrdiff_t gap, span;
        if (i == rank) {
            continue;
        }

        /* post send */
        span = opal_datatype_span(&sendtypes[i]->super, sendcounts[i], &gap);
        if (OPAL_LIKELY(0 < span)) {
            char *sbuf = (char *) sendbuf + sdispls[i];
            res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtypes[i], i, schedule, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                return res;
            }
        }

        /* post receive */
        span = opal_datatype_span(&recvtypes[i]->super, recvcounts[i], &gap);
        if (OPAL_LIKELY(0 < span)) {
            char *rbuf = (char *) recvbuf + rdispls[i];
            res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtypes[i], i, schedule, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                return res;
            }
        }
    }

    return OMPI_SUCCESS;
}
int mca_pml_ob1_progress(void)
{
    int i, queue_length = opal_list_get_size(&mca_pml_ob1.send_pending);
    int j, completed_requests = 0;
    bool send_succeeded;

#if OPAL_CUDA_SUPPORT
    mca_pml_ob1_process_pending_cuda_async_copies();
#endif /* OPAL_CUDA_SUPPORT */

    if( OPAL_LIKELY(0 == queue_length) )
        return 0;

    for( i = 0; i < queue_length; i++ ) {
        mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
        mca_pml_ob1_send_request_t* sendreq;
        mca_bml_base_endpoint_t* endpoint;

        sendreq = get_request_from_send_pending(&pending_type);
        if(OPAL_UNLIKELY(NULL == sendreq))
            break;

        switch(pending_type) {
        case MCA_PML_OB1_SEND_PENDING_NONE:
            assert(0);
            return 0;
        case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
            if( mca_pml_ob1_send_request_schedule_exclusive(sendreq) ==
                OMPI_ERR_OUT_OF_RESOURCE ) {
                return 0;
            }
            completed_requests++;
            break;
        case MCA_PML_OB1_SEND_PENDING_START:
            MCA_PML_OB1_SEND_REQUEST_RESET(sendreq);
            endpoint = sendreq->req_endpoint;
            send_succeeded = false;
            for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
                mca_bml_base_btl_t* bml_btl;
                int rc;

                /* select a btl */
                bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
                rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
                    send_succeeded = true;
                    completed_requests++;
                    break;
                }
            }
            if( false == send_succeeded ) {
                add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
            }
        }
    }

    return completed_requests;
}
/**
 * Return 0 if everything went OK and there is still room before the complete
 * conversion of the data (additional calls with other input buffers are needed),
 * 1 if everything went fine and the data was completely converted,
 * -1 if something went wrong.
 */
int32_t opal_convertor_pack( opal_convertor_t* pConv,
                             struct iovec* iov, uint32_t* out_size,
                             size_t* max_data )
{
    OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );

    if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
        /**
         * We are doing conversion on a contiguous datatype in a homogeneous
         * environment. The convertor contains minimal information; we only
         * use bConverted to manage the conversion.
         */
        uint32_t i;
        unsigned char* base_pointer;
        size_t pending_length = pConv->local_size - pConv->bConverted;

        *max_data = pending_length;
        opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );

        for( i = 0; i < *out_size; i++ ) {
            if( iov[i].iov_len >= pending_length ) {
                goto complete_contiguous_data_pack;
            }
            if( OPAL_LIKELY(NULL == iov[i].iov_base) )
                iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
            else
#if OPAL_CUDA_SUPPORT
                MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
#else
                MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
#endif
            pending_length -= iov[i].iov_len;
            base_pointer += iov[i].iov_len;
        }
        *max_data -= pending_length;
        pConv->bConverted += (*max_data);
        return 0;

complete_contiguous_data_pack:
        iov[i].iov_len = pending_length;
        if( OPAL_LIKELY(NULL == iov[i].iov_base) )
            iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
        else
#if OPAL_CUDA_SUPPORT
            MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
#else
            MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
#endif
        pConv->bConverted = pConv->local_size;
        *out_size = i + 1;
        pConv->flags |= CONVERTOR_COMPLETED;
        return 1;
    }

    return pConv->fAdvance( pConv, iov, out_size, max_data );
}
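/*
 * A minimal usage sketch (an assumption, not part of this source): a caller
 * typically drains a prepared send convertor through a fixed-size staging
 * buffer until opal_convertor_pack() returns 1. The convertor preparation
 * and the consume() callback are hypothetical placeholders.
 */
extern void consume( const char* buf, size_t len );  /* hypothetical consumer */

static void pack_all_fragments( opal_convertor_t* conv, char* staging, size_t chunk )
{
    struct iovec iov;
    uint32_t iov_count;
    size_t max_data;
    int32_t done = 0;

    while( !done ) {
        iov.iov_base = (IOVBASE_TYPE *) staging;  /* non-NULL: pack copies into it */
        iov.iov_len  = chunk;
        iov_count    = 1;
        max_data     = chunk;
        /* 0: more data remains; 1: conversion complete */
        done = opal_convertor_pack( conv, &iov, &iov_count, &max_data );
        /* max_data now holds the number of bytes actually packed */
        consume( staging, max_data );
    }
}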
int mca_pml_cm_irecv(void *addr,
                     size_t count,
                     ompi_datatype_t * datatype,
                     int src,
                     int tag,
                     struct ompi_communicator_t *comm,
                     struct ompi_request_t **request)
{
    int ret;
    mca_pml_cm_thin_recv_request_t *recvreq;
    ompi_proc_t* ompi_proc;

    MCA_PML_CM_THIN_RECV_REQUEST_ALLOC(recvreq, ret);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != ret) ) return ret;

    MCA_PML_CM_THIN_RECV_REQUEST_INIT(recvreq,
                                      ompi_proc,
                                      comm,
                                      src,
                                      datatype,
                                      addr,
                                      count);

    MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);

    if( OPAL_LIKELY(OMPI_SUCCESS == ret) ) *request = (ompi_request_t*) recvreq;

    return ret;
}
int mca_pml_cm_imrecv(void *buf,
                      size_t count,
                      ompi_datatype_t *datatype,
                      struct ompi_message_t **message,
                      struct ompi_request_t **request)
{
    int ret;
    mca_pml_cm_thin_recv_request_t *recvreq;
    ompi_proc_t* ompi_proc;
    ompi_communicator_t *comm = (*message)->comm;
    int peer = (*message)->peer;

    MCA_PML_CM_THIN_RECV_REQUEST_ALLOC(recvreq, ret);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != ret) ) return ret;

    MCA_PML_CM_THIN_RECV_REQUEST_INIT(recvreq,
                                      ompi_proc,
                                      comm,
                                      peer,
                                      datatype,
                                      buf,
                                      count);

    MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);

    if( OPAL_LIKELY(OMPI_SUCCESS == ret) ) *request = (ompi_request_t*) recvreq;

    return ret;
}
/**
 * Initiate a send to the peer.
 *
 * @param btl (IN)  BTL module
 * @param peer (IN) BTL peer addressing
 */
int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
                        struct mca_btl_base_endpoint_t *endpoint,
                        struct mca_btl_base_descriptor_t *descriptor,
                        mca_btl_base_tag_t tag)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;

    if (OPAL_LIKELY(frag->fbox)) {
        mca_btl_vader_fbox_send (frag->fbox, tag, frag->segments[0].seg_len);
        mca_btl_vader_frag_complete (frag);

        return 1;
    }

    /* header (+ optional inline data) */
    frag->hdr->len = frag->segments[0].seg_len;
    /* type of message, pt-2-pt, one-sided, etc */
    frag->hdr->tag = tag;

    /* post the relative address of the descriptor into the peer's fifo */
    vader_fifo_write_ep (frag->hdr, endpoint);

    if ((frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) ||
        !(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
        frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

        return 0;
    }

    /* data is gone (from the pml's perspective). frag callback/release will
       happen later */
    return 1;
}
static inline int
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
{
    int pending_post_count = opal_list_get_size (&device->pending_post);
    mca_btl_ugni_post_descriptor_t *post_desc;
    int rc;

    /* check if there are any posts pending resources */
    if (OPAL_LIKELY(0 == pending_post_count)) {
        return 0;
    }

    BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", pending_post_count));
    for (int i = 0 ; i < pending_post_count ; ++i) {
        mca_btl_ugni_device_lock (device);
        post_desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
        mca_btl_ugni_device_unlock (device);
        if (NULL == post_desc) {
            break;
        }

        rc = mca_btl_ugni_repost (ugni_module, post_desc);
        if (OPAL_SUCCESS != rc) {
            mca_btl_ugni_device_lock (device);
            opal_list_prepend (&device->pending_post, (opal_list_item_t *) post_desc);
            mca_btl_ugni_device_unlock (device);
            break;
        }
    }

    return 1;
}
/**
 * This function always works in local representation. This means that no
 * representation conversion (i.e. no heterogeneity) has to be taken into
 * account, and that all lengths we're working with are local.
 */
int32_t opal_convertor_raw( opal_convertor_t* pConvertor,
                            struct iovec* iov, uint32_t* iov_count,
                            size_t* length )
{
    const opal_datatype_t *pData = pConvertor->pDesc;
    dt_stack_t* pStack;          /* pointer to the position on the stack */
    uint32_t pos_desc;           /* actual position in the description of the derived datatype */
    uint32_t count_desc;         /* the number of items already done in the actual pos_desc */
    dt_elem_desc_t* description, *pElem;
    unsigned char *source_base;  /* origin of the data */
    size_t raw_data = 0;         /* sum of raw data lengths in the iov_len fields */
    uint32_t index = 0, i;       /* the iov index and a simple counter */

    assert( (*iov_count) > 0 );
    if( OPAL_LIKELY(pConvertor->flags & CONVERTOR_NO_OP) ) {
        /* The convertor contains minimal information; we only use bConverted
         * to manage the conversion. This function works even after the convertor
         * was moved to a specific position. */
        opal_convertor_get_current_pointer( pConvertor, (void**)&iov[0].iov_base );
        iov[0].iov_len = pConvertor->local_size - pConvertor->bConverted;
        *length = iov[0].iov_len;
        pConvertor->bConverted = pConvertor->local_size;
        pConvertor->flags |= CONVERTOR_COMPLETED;
        *iov_count = 1;
        return 1;  /* we're done */
    }

    DO_DEBUG( opal_output( 0, "opal_convertor_raw( %p, {%p, %u}, %lu )\n",
                           (void*)pConvertor, (void*)iov, *iov_count, (unsigned long)*length ); );
    /* ... the general (non-contiguous) path continues beyond this excerpt ... */
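/*
 * A minimal usage sketch (an assumption, not part of this source): unlike
 * opal_convertor_pack(), opal_convertor_raw() performs no copies -- it fills
 * the iovec entries with pointers into the user buffer, which a zero-copy
 * consumer can hand straight to scatter/gather hardware. post_gather() is a
 * hypothetical placeholder.
 */
extern void post_gather( const struct iovec* iov, uint32_t iov_count, size_t length );

static void describe_raw( opal_convertor_t* convertor )
{
    int32_t done = 0;

    while( !done ) {
        struct iovec iov[8];
        uint32_t iov_count = 8;  /* in: capacity, out: entries filled */
        size_t length = 0;       /* out: total bytes described */

        done = opal_convertor_raw( convertor, iov, &iov_count, &length );
        /* iov[0..iov_count-1] point at 'length' bytes of raw, in-place data */
        post_gather( iov, iov_count, length );
    }
}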
/*
 * Given an incoming segment, lookup the endpoint that sent it
 */
static inline ompi_btl_usnic_endpoint_t *
lookup_sender(ompi_btl_usnic_module_t *module, ompi_btl_usnic_segment_t *seg)
{
    int ret;
    ompi_btl_usnic_endpoint_t *sender;

    /* Use the hashed RTE process name in the BTL header to uniquely
       identify the sending process (using the MAC/hardware address
       only identifies the sending server -- not the sending RTE
       process). */
    /* JMS We've experimented with using a handshake before sending
       any data so that instead of looking up a hash on the
       btl_header->sender, echo back the ptr to the sender's
       ompi_proc.  There was limited speedup with this scheme; more
       investigation is required. */
    ret = opal_hash_table_get_value_uint64(&module->senders,
                                           seg->us_btl_header->sender,
                                           (void**) &sender);
    if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
        return sender;
    }

    /* The sender wasn't in the hash table, so do a slow lookup and
       put the result in the hash table */
    sender = ompi_btl_usnic_proc_lookup_endpoint(module,
                                                 seg->us_btl_header->sender);
    if (NULL != sender) {
        opal_hash_table_set_value_uint64(&module->senders,
                                         seg->us_btl_header->sender, sender);
        return sender;
    }

    /* Whoa -- not found at all! */
    return NULL;
}
static inline map_segment_t *__find_va(const void* va)
{
    map_segment_t *s;

    if (OPAL_LIKELY((uintptr_t)va >= (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].seg_base_addr &&
                    (uintptr_t)va < (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) {
        s = &memheap_map->mem_segs[HEAP_SEG_INDEX];
    } else {
        s = bsearch(va,
                    &memheap_map->mem_segs[SYMB_SEG_INDEX],
                    memheap_map->n_segments - 1,
                    sizeof(*s),
                    _seg_cmp);
    }

#if MEMHEAP_BASE_DEBUG == 1
    if (s) {
        MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p",
                        s - memheap_map->mem_segs,
                        (long long)s->seg_base_addr,
                        (long long)s->end,
                        (long long)(s->end - s->seg_base_addr),
                        (void *)va);
    }
#endif
    return s;
}
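/*
 * _seg_cmp is not shown in this excerpt. A minimal sketch of what the
 * bsearch() above requires (an assumption, not the actual implementation),
 * given that mem_segs[] is kept sorted by seg_base_addr:
 */
static int _seg_cmp_sketch(const void *key, const void *elem)
{
    uintptr_t va = (uintptr_t) key;
    const map_segment_t *s = (const map_segment_t *) elem;

    if (va < (uintptr_t) s->seg_base_addr) return -1;
    if (va >= (uintptr_t) s->end)          return  1;
    return 0;  /* va falls inside this segment */
}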
/*
 * These functions can be used to create an IDENTICAL copy of one convertor. In this
 * context IDENTICAL means that the datatype, the count and all other properties of
 * the basic convertor get replicated on this new convertor. However, the references
 * to the datatype are not increased. This function takes special care of the stack.
 * In all cases the stack is created with the correct number of entries, but if
 * copy_stack is true (!= 0) then the content of the old stack is copied onto the new
 * one. The result is a convertor ready to use, starting from the old position. If
 * copy_stack is false then the convertor is created with an empty stack (you have to
 * use opal_convertor_set_position before using it).
 */
int opal_convertor_clone( const opal_convertor_t* source,
                          opal_convertor_t* destination,
                          int32_t copy_stack )
{
    destination->remoteArch  = source->remoteArch;
    destination->flags       = source->flags;
    destination->pDesc       = source->pDesc;
    destination->use_desc    = source->use_desc;
    destination->count       = source->count;
    destination->pBaseBuf    = source->pBaseBuf;
    destination->fAdvance    = source->fAdvance;
    destination->master      = source->master;
    destination->local_size  = source->local_size;
    destination->remote_size = source->remote_size;
    /* create the stack */
    if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
        destination->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * source->stack_size );
    } else {
        destination->pStack = destination->static_stack;
    }
    destination->stack_size = source->stack_size;

    /* initialize the stack */
    if( OPAL_LIKELY(0 == copy_stack) ) {
        destination->bConverted = -1;
        destination->stack_pos  = -1;
    } else {
        memcpy( destination->pStack, source->pStack,
                sizeof(dt_stack_t) * (source->stack_pos+1) );
        destination->bConverted = source->bConverted;
        destination->stack_pos  = source->stack_pos;
    }
    return OPAL_SUCCESS;
}
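/*
 * A minimal usage sketch (an assumption, not part of this source): a clone
 * made with copy_stack == 0 has no valid position, so it must be positioned
 * explicitly before any pack/unpack call, exactly as the comment above warns.
 */
static int clone_and_rewind( const opal_convertor_t* source, opal_convertor_t* copy )
{
    size_t position = 0;
    int rc;

    rc = opal_convertor_clone( source, copy, 0 /* empty stack */ );
    if( OPAL_SUCCESS != rc ) return rc;

    /* required: the clone starts unpositioned (bConverted == -1) */
    return opal_convertor_set_position( copy, &position );
}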
static inline int
mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl)
{
    gni_cq_entry_t event_data;
    unsigned int ep_index;
    int count, rc;

    BTL_VERBOSE(("btl/ugni_component detected SMSG CQ overrun. "
                 "processing message backlog..."));

    /* we don't know which endpoint lost an smsg completion. clear the smsg
       remote cq and check all mailboxes */

    /* clear out remote cq */
    do {
        rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
    } while (GNI_RC_NOT_DONE != rc);

    for (ep_index = 0, count = 0 ; ep_index < btl->endpoint_count ; ++ep_index) {
        mca_btl_base_endpoint_t *ep = btl->endpoints[ep_index];

        if (NULL == ep || MCA_BTL_UGNI_EP_STATE_CONNECTED != ep->state) {
            continue;
        }

        /* clear out smsg mailbox */
        rc = mca_btl_ugni_smsg_process (ep);
        if (OPAL_LIKELY(rc >= 0)) {
            count += rc;
        }
    }

    return count;
}
int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
                                             size_t* position )
{
    int32_t rc;

    /**
     * If we plan to rollback the convertor then first we have to set it
     * at the beginning.
     */
    if( (0 == (*position)) || ((*position) < convertor->bConverted) ) {
        rc = opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
        if( 0 == (*position) ) return rc;
    }

    if( OPAL_LIKELY(convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
        rc = opal_convertor_create_stack_with_pos_contig( convertor, (*position),
                                                          opal_datatype_local_sizes );
    } else {
        rc = opal_convertor_generic_simple_position( convertor, position );
        /**
         * If we have a non-contiguous send convertor, don't allow it to move
         * into the middle of a predefined datatype; it won't be able to copy
         * out the left-overs anyway. Instead, force the position to stay on
         * predefined-datatype boundaries. As we allow partial predefined
         * datatypes in the contiguous case, this will still be accepted by
         * any receiver convertor.
         */
        if( CONVERTOR_SEND & convertor->flags ) {
            convertor->bConverted -= convertor->partial_length;
            convertor->partial_length = 0;
        }
    }
    *position = convertor->bConverted;
    return rc;
}
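/*
 * A minimal usage sketch (an assumption, not part of this source): pipelined
 * protocols reposition a send convertor at each fragment boundary. Note the
 * round trip through *position -- per the comment above, a non-contiguous
 * send convertor may be pulled back to a predefined-datatype boundary. The
 * fragment layout here is hypothetical.
 */
static void seek_to_fragment( opal_convertor_t* convertor,
                              size_t frag_index, size_t frag_size )
{
    size_t position = frag_index * frag_size;  /* hypothetical fragment layout */

    opal_convertor_set_position( convertor, &position );
    /* 'position' now holds where the convertor actually landed */
}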
int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst,
                     int tag, mca_pml_base_send_mode_t mode,
                     struct ompi_communicator_t* comm)
{
    ompi_request_t *req;
    ucp_ep_h ep;

    PML_UCX_TRACE_SEND("%s", buf, count, datatype, dst, tag, mode, comm, "send");

    /* TODO special care to sync/buffered send */

    ep = mca_pml_ucx_get_ep(comm, dst);
    if (OPAL_UNLIKELY(NULL == ep)) {
        PML_UCX_ERROR("Failed to get ep for rank %d", dst);
        return OMPI_ERROR;
    }

    req = (ompi_request_t*)ucp_tag_send_nb(ep, buf, count,
                                           mca_pml_ucx_get_datatype(datatype),
                                           PML_UCX_MAKE_SEND_TAG(tag, comm),
                                           mca_pml_ucx_send_completion);
    if (OPAL_LIKELY(req == NULL)) {
        return OMPI_SUCCESS;
    } else if (!UCS_PTR_IS_ERR(req)) {
        PML_UCX_VERBOSE(8, "got request %p", (void*)req);
        ucp_worker_progress(ompi_pml_ucx.ucp_worker);
        ompi_request_wait(&req, MPI_STATUS_IGNORE);
        return OMPI_SUCCESS;
    } else {
        PML_UCX_ERROR("ucx send failed: %s", ucs_status_string(UCS_PTR_STATUS(req)));
        return OMPI_ERROR;
    }
}
int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache,
                          mca_mpool_base_registration_t* reg, size_t limit)
{
    int rc;
    size_t reg_size = reg->bound - reg->base + 1;
    mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache;

    if(limit != 0 && reg_size > limit) {
        /* return out of resources if request is bigger than cache size
         * return temp out of resources otherwise */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Check to ensure that the cache is valid */
    if (OPAL_UNLIKELY(opal_memory_changed() &&
                      NULL != opal_memory->memoryc_process &&
                      OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) {
        return rc;
    }

    rc = mca_rcache_vma_tree_insert(vma_rcache, reg, limit);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* If we successfully registered, then tell the memory manager
           to start monitoring this region */
        opal_memory->memoryc_register(reg->base,
                                      (uint64_t) reg_size,
                                      (uint64_t) (uintptr_t) reg);
    }

    return rc;
}
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while (1) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            return n;
        }
        MPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        memcpy(tmp_buf, (void*)&r->buf, size);
        msg = OBJ_NEW(opal_buffer_t);
        if (NULL == msg) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        opal_dss.load(msg, (void*)tmp_buf, size);

        rc = MPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);

        do_recv(status.MPI_SOURCE, msg);
        OBJ_RELEASE(msg);

        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }

    return 1;
}
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
                               mca_btl_base_endpoint_t *endpoint,
                               struct opal_convertor_t *convertor,
                               uint8_t order, size_t reserve, size_t *size,
                               uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;
    uint32_t iov_count = 1;
    struct iovec iov;
    size_t max_size = *size;
    int rc;

    if (OPAL_LIKELY((mca_btl_scif_module.super.btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) &&
                    !opal_convertor_need_buffers (convertor) &&
                    reserve <= 128)) {
        /* inplace send */
        void *data_ptr;
        opal_convertor_get_current_pointer (convertor, &data_ptr);

        (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segments[0].seg_len       = reserve;
        frag->segments[1].seg_addr.pval = data_ptr;
        frag->segments[1].seg_len       = *size;
        frag->base.des_segment_count = 2;
    } else {
        /* buffered send */
        (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        if (*size) {
            iov.iov_len  = *size;
            iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
            if (OPAL_UNLIKELY(rc < 0)) {
                mca_btl_scif_frag_return (frag);
                return NULL;
            }

            *size = max_size;
        }

        frag->segments[0].seg_len = reserve + *size;
        frag->base.des_segment_count = 1;
    }

    frag->base.des_segments = frag->segments;
    frag->base.order        = order;
    frag->base.des_flags    = flags;

    return &frag->base;
}
static mca_btl_base_registration_handle_t *
mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
                           void *base, size_t size, uint32_t flags)
{
    mca_btl_scif_reg_t *scif_reg;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    if (MCA_BTL_ENDPOINT_ANY == endpoint) {
        /* it probably isn't possible to support registering memory to use with any endpoint so
         * return NULL */
        return NULL;
    }

    if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
        /* the endpoint needs to be connected before the fragment can be
         * registered. */
        rc = mca_btl_scif_ep_connect (endpoint);
        if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
            /* not yet connected */
            return NULL;
        }
    }

    rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, access_flags,
                                        (mca_mpool_base_registration_t **) &scif_reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    /* register the memory location with this peer if it isn't already */
    if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) {
        size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound -
                                   (uintptr_t) scif_reg->base.base) + 1;

        /* NTH: until we determine a way to pass permissions to the mpool just make all
         * segments read/write */
        scif_reg->handles[endpoint->id].btl_handle.scif_offset =
            scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0,
                           SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
        BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
                     (unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset));
    }

    return &scif_reg->handles[endpoint->id].btl_handle;
}
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target,
                     assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type ||
                                          MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in an
         * active target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = target;
    lock->sync.lock.type   = lock_type;
    lock->sync.lock.assert = assert;

    lock->peer_list.peer = peer;
    lock->num_peers = 1;
    OBJ_RETAIN(peer);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        ++module->passive_target_access_epoch;

        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    } else {
        OBJ_RELEASE(lock);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}
/**
 * Send a FIN to the peer. If we fail to send this ack (no more available
 * fragments or the send failed), this function automatically adds the FIN
 * to the list of pending FINs, which guarantees that the FIN will be sent
 * later.
 */
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                          mca_bml_base_btl_t* bml_btl,
                          ompi_ptr_t hdr_des,
                          uint8_t order,
                          uint32_t status )
{
    mca_btl_base_descriptor_t* fin;
    mca_pml_ob1_fin_hdr_t* hdr;
    int rc;

    mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

    if(NULL == fin) {
        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fin->des_cbfunc = mca_pml_ob1_fin_completion;
    fin->des_cbdata = NULL;

    /* fill in header */
    hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_src->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_type  = MCA_PML_OB1_HDR_TYPE_FIN;
    hdr->hdr_des  = hdr_des;
    hdr->hdr_fail = status;

    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);

    /* queue request */
    rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
    if( OPAL_LIKELY( rc >= 0 ) ) {
        if( OPAL_LIKELY( 1 == rc ) ) {
            MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
        }
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, fin);
    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    uint64_t datagram_id;
    gni_return_t grc;
    int count = 0;

    /* check for datagram completion */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        return 0;
    }

    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
        handle = ugni_module->wildcard_ep;
    } else {
        handle = ugni_module->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->smsg_ep_handle;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "peer = %d", datagram_id, post_state, remote_id));

    ep = ugni_module->endpoints[remote_id];

    /* NTH: TODO -- error handling */
    (void) mca_btl_ugni_ep_connect_progress (ep);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /* process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
int mca_pml_ob1_isend(const void *buf,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int dst,
                      int tag,
                      mca_pml_base_send_mode_t sendmode,
                      ompi_communicator_t * comm,
                      ompi_request_t ** request)
{
    mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst);
    mca_pml_ob1_send_request_t *sendreq = NULL;
    ompi_proc_t *dst_proc = ob1_proc->ompi_proc;
    mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc);
    int16_t seqn;
    int rc;

    if (OPAL_UNLIKELY(NULL == endpoint)) {
        return OMPI_ERR_UNREACH;
    }

    seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1);

    if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc,
                                      endpoint, comm);
        if (OPAL_LIKELY(0 <= rc)) {
            /* NTH: it is legal to return ompi_request_empty since the only valid
             * field in a send completion status is whether or not the send was
             * cancelled (which it can't be at this point anyway). */
            *request = &ompi_request_empty;
            return OMPI_SUCCESS;
        }
    }

    MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
    if (NULL == sendreq)
        return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                  buf,
                                  count,
                                  datatype,
                                  dst, tag,
                                  comm, sendmode, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc);
    *request = (ompi_request_t *) sendreq;
    return rc;
}
/**
 * Send a FIN to the peer. If we fail to send this ack (no more available
 * fragments or the send failed), this function automatically adds the FIN
 * to the list of pending FINs, which guarantees that the FIN will be sent
 * later.
 */
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                          mca_bml_base_btl_t* bml_btl,
                          opal_ptr_t hdr_frag,
                          uint64_t rdma_size,
                          uint8_t order,
                          int status )
{
    mca_btl_base_descriptor_t* fin;
    int rc;

    mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                       MCA_BTL_DES_FLAGS_SIGNAL);

    if(NULL == fin) {
        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fin->des_cbfunc = mca_pml_ob1_fin_completion;
    fin->des_cbdata = NULL;

    /* fill in header */
    mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
                                 0, hdr_frag.lval, status ? status : (int64_t) rdma_size);

    ob1_hdr_hton((mca_pml_ob1_hdr_t *) fin->des_segments->seg_addr.pval,
                 MCA_PML_OB1_HDR_TYPE_FIN, proc);

    /* queue request */
    rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
    if( OPAL_LIKELY( rc >= 0 ) ) {
        if( OPAL_LIKELY( 1 == rc ) ) {
            MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
        }
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, fin);
    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
int mca_btl_ugni_ep_handle_init (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
                                 mca_btl_ugni_device_t *device,
                                 mca_btl_ugni_endpoint_handle_t *ep_handle)
{
    gni_return_t grc;

    ep_handle->device = device;

    /* create a uGNI endpoint handle and bind it to the remote peer */
    grc = GNI_EpCreate (device->dev_handle, cq, &ep_handle->gni_handle);
    if (OPAL_LIKELY(GNI_RC_SUCCESS == grc)) {
        grc = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id);
    }

    return mca_btl_rc_ugni_to_opal (grc);
}
int mca_pml_ob1_recv(void *addr,
                     size_t count,
                     ompi_datatype_t * datatype,
                     int src, int tag,
                     struct ompi_communicator_t *comm,
                     ompi_status_public_t * status)
{
    mca_pml_ob1_recv_request_t *recvreq = NULL;
    int rc;

    if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) {
        recvreq = mca_pml_ob1_recvreq;
        mca_pml_ob1_recvreq = NULL;
    }

    if( OPAL_UNLIKELY(NULL == recvreq) ) {
        MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
        if (NULL == recvreq)
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
    MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype, src, tag, comm, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(recvreq->req_recv.req_base),
                             PERUSE_RECV);

    MCA_PML_OB1_RECV_REQUEST_START(recvreq);
    ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);

    if (NULL != status) {  /* return status */
        *status = recvreq->req_recv.req_base.req_ompi.req_status;
    }

    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;

    if (recvreq->req_recv.req_base.req_pml_complete) {
        /* make the buffer defined when the request is completed,
           and before releasing the objects. */
        MEMCHECKER(
            memchecker_call(&opal_memchecker_base_mem_defined,
                            recvreq->req_recv.req_base.req_addr,
                            recvreq->req_recv.req_base.req_count,
                            recvreq->req_recv.req_base.req_datatype);
        );
    }

    if (OPAL_UNLIKELY(ompi_mpi_thread_multiple || NULL != mca_pml_ob1_recvreq)) {
        MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
    } else {
        mca_pml_ob1_recv_request_fini (recvreq);
        mca_pml_ob1_recvreq = recvreq;
    }

    return rc;
}
sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, void* va, int btl_id, void** rva)
{
    map_segment_t *s;
    int rc;
    sshmem_mkey_t *mkey;

    MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p", pe, va);
    s = __find_va(va);
    if (NULL == s)
        return NULL;

    if (!MAP_SEGMENT_IS_VALID(s))
        return NULL;

    if (pe == oshmem_my_proc_id()) {
        *rva = va;
        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (local) %lx %p", pe, va,
                                 s->mkeys[btl_id].u.key, *rva);
        return &s->mkeys[btl_id];
    }

    if (OPAL_LIKELY(s->mkeys_cache[pe])) {
        mkey = &s->mkeys_cache[pe][btl_id];
        *rva = va2rva(va, s->seg_base_addr, mkey->va_base);
        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe,
                                 (void *)va, mkey->u.key, (void *)*rva);
        return mkey;
    }

    s->mkeys_cache[pe] = (sshmem_mkey_t *) calloc(memheap_map->num_transports,
                                                  sizeof(sshmem_mkey_t));
    if (!s->mkeys_cache[pe])
        return NULL;

    rc = memheap_oob_get_mkeys(pe,
                               s - memheap_map->mem_segs,
                               s->mkeys_cache[pe]);
    if (OSHMEM_SUCCESS != rc)
        return NULL;

    mkey = &s->mkeys_cache[pe][btl_id];
    *rva = va2rva(va, s->seg_base_addr, mkey->va_base);

    MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe,
                             (void *)va, mkey->u.key, (void *)*rva);
    return mkey;
}
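/*
 * va2rva() is not shown in this excerpt. A minimal sketch of the translation
 * the code above relies on (an assumption, not the actual implementation):
 * the remote virtual address is the same offset into the peer's mapping of
 * the segment, rebased onto the peer's base address.
 */
static inline void *va2rva_sketch(void *va, void *local_base, void *remote_base)
{
    return (void *) ((uintptr_t) remote_base +
                     ((uintptr_t) va - (uintptr_t) local_base));
}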
static int mca_btl_ugni_ep_send_disconnect (mca_btl_base_endpoint_t *ep)
{
    int rc;

    do {
        rc = mca_btl_ugni_endpoint_smsg_send_wtag (ep, NULL, 0, NULL, 0, -1,
                                                   MCA_BTL_UGNI_TAG_DISCONNECT);
        if (OPAL_LIKELY(GNI_RC_NOT_DONE != rc)) {
            break;
        }

        /* most likely got here because we are out of credits. check the remote CQ
         * to get credit return */
        (void) mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_ep_btl (ep));
    } while (1);

    return mca_btl_rc_ugni_to_opal (rc);
}
static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context)
{
    mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) dreg_context;
    mca_mpool_base_registration_t *udreg_reg = (mca_mpool_base_registration_t *) device_data;
    int rc;

    rc = mpool_udreg->resources.deregister_mem(mpool_udreg->resources.reg_data, udreg_reg);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        OMPI_FREE_LIST_RETURN_MT(&mpool_udreg->reg_list,
                                 (ompi_free_list_item_t *) udreg_reg);
    }
    /* might be worth printing out a warning if an error occurs here */

    return 0;
}
static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          mca_mpool_base_registration_t *registration,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags)
{
    if (OPAL_LIKELY(reserve)) {
        return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
                                              order, reserve, size, flags);
    } else {
        return mca_btl_ugni_prepare_src_rdma (btl, endpoint, registration,
                                              convertor, order, size, flags);
    }
}