Example #1
0
/*
 * Append a linear all-to-all-w exchange to an NBC schedule.
 *
 * For every peer in [0, p) other than the calling rank, one send and one
 * receive are added to the schedule, each only when the span of the
 * corresponding datatype/count pair is positive.  The displacement arrays
 * are applied as byte offsets to the base buffers.
 *
 * Returns OMPI_SUCCESS, or the first error code reported by
 * NBC_Sched_send() / NBC_Sched_recv().
 */
static inline int a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule,
                                    const void *sendbuf, const int *sendcounts, const int *sdispls,
                                    struct ompi_datatype_t * const * sendtypes,
                                    void *recvbuf, const int *recvcounts, const int *rdispls,
                                    struct ompi_datatype_t * const * recvtypes) {
  for (int peer = 0; peer < p; ++peer) {
    ptrdiff_t lb_gap;   /* required by opal_datatype_span(), otherwise unused */
    ptrdiff_t extent;
    int rc;

    if (peer != rank) {
      /* outgoing message for this peer */
      extent = opal_datatype_span(&sendtypes[peer]->super, sendcounts[peer], &lb_gap);
      if (OPAL_LIKELY(extent > 0)) {
        char *out = (char *) sendbuf + sdispls[peer];
        rc = NBC_Sched_send (out, false, sendcounts[peer], sendtypes[peer], peer, schedule, false);
        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
          return rc;
        }
      }

      /* incoming message from this peer */
      extent = opal_datatype_span(&recvtypes[peer]->super, recvcounts[peer], &lb_gap);
      if (OPAL_LIKELY(extent > 0)) {
        char *in = (char *) recvbuf + rdispls[peer];
        rc = NBC_Sched_recv (in, false, recvcounts[peer], recvtypes[peer], peer, schedule, false);
        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
          return rc;
        }
      }
    }
  }

  return OMPI_SUCCESS;
}
Example #2
0
/*
 * Retry the send requests parked on mca_pml_ob1.send_pending.
 *
 * At most as many requests as were queued on entry are examined.  A request
 * pending "schedule" is handed to
 * mca_pml_ob1_send_request_schedule_exclusive(); a request pending "start"
 * is reset and offered to the endpoint's eager BTLs until one accepts it,
 * and is put back on the pending list if none does.
 *
 * Returns the number of requests that were scheduled or started.  Note that
 * 0 is returned (discarding the running count) when the queue is empty,
 * when scheduling reports OMPI_ERR_OUT_OF_RESOURCE, or when a request comes
 * back with pending type NONE.
 */
int mca_pml_ob1_progress(void)
{
    int pending_count = opal_list_get_size(&mca_pml_ob1.send_pending);
    int completed_requests = 0;
    int n, b;
    bool started;

#if OPAL_CUDA_SUPPORT
    mca_pml_ob1_process_pending_cuda_async_copies();
#endif /* OPAL_CUDA_SUPPORT */

    if( OPAL_LIKELY(0 == pending_count) ) {
        return 0;
    }

    for( n = 0; n < pending_count; ++n ) {
        mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
        mca_pml_ob1_send_request_t* sendreq = get_request_from_send_pending(&pending_type);
        mca_bml_base_endpoint_t* endpoint;

        if( OPAL_UNLIKELY(NULL == sendreq) ) {
            break;
        }

        if( MCA_PML_OB1_SEND_PENDING_SCHEDULE == pending_type ) {
            if( OMPI_ERR_OUT_OF_RESOURCE ==
                mca_pml_ob1_send_request_schedule_exclusive(sendreq) ) {
                return 0;
            }
            completed_requests++;
        } else if( MCA_PML_OB1_SEND_PENDING_START == pending_type ) {
            MCA_PML_OB1_SEND_REQUEST_RESET(sendreq);
            endpoint = sendreq->req_endpoint;
            started = false;

            /* offer the request to each eager BTL in turn until one takes it */
            b = 0;
            while( !started &&
                   b < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager) ) {
                mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
                int rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);

                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
                    started = true;
                    completed_requests++;
                } else {
                    b++;
                }
            }

            if( !started ) {
                /* nobody could take it: park it again */
                add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
            }
        } else if( MCA_PML_OB1_SEND_PENDING_NONE == pending_type ) {
            /* a dequeued request must carry a real pending type */
            assert(0);
            return 0;
        }
    }

    return completed_requests;
}
/**
 * Pack the data described by a convertor into the caller-supplied iovec
 * array.
 *
 * @param pConv     convertor describing the data and tracking progress
 * @param iov       array of iovec entries to fill; in the NO_OP path below an
 *                  entry whose iov_base is NULL is pointed directly at the
 *                  source data, otherwise the data is copied to iov_base
 * @param out_size  in: number of entries in iov; in the NO_OP path it is
 *                  rewritten (to the number of entries used) only when the
 *                  pack completes
 * @param max_data  in the NO_OP path, set on return to the number of bytes
 *                  handled by this call
 *
 * Return 0 if everything went OK and if there is still room before the complete
 *          conversion of the data (need additional call with others input buffers )
 *        1 if everything went fine and the data was completely converted
 *       -1 something wrong occurs.
 */
int32_t opal_convertor_pack( opal_convertor_t* pConv,
                             struct iovec* iov, uint32_t* out_size,
                             size_t* max_data )
{
    /* NOTE(review): this macro is defined elsewhere; it may return early
     * from this function -- confirm against its definition. */
    OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );

    if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
        /**
         * We are doing conversion on a contiguous datatype on a homogeneous
         * environment. The convertor contains minimal information, we only
         * use the bConverted to manage the conversion.
         */
        uint32_t i;
        unsigned char* base_pointer;
        /* bytes that still have to be packed */
        size_t pending_length = pConv->local_size - pConv->bConverted;

        *max_data = pending_length;
        opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );

        for( i = 0; i < *out_size; i++ ) {
            /* this entry can hold everything that is left: finish below */
            if( iov[i].iov_len >= pending_length ) {
                goto complete_contiguous_data_pack;
            }
            /* no destination buffer supplied: expose the source in place,
             * otherwise copy iov_len bytes into the supplied buffer */
            if( OPAL_LIKELY(NULL == iov[i].iov_base) )
                iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
            else
#if OPAL_CUDA_SUPPORT
                MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
#else
                MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
#endif
            pending_length -= iov[i].iov_len;
            base_pointer += iov[i].iov_len;
        }
        /* all entries were filled but data remains: report the bytes packed
         * by this call (initial remainder minus what is still pending) */
        *max_data -= pending_length;
        pConv->bConverted += (*max_data);
        return 0;

complete_contiguous_data_pack:
        /* shrink the last entry to exactly the remaining bytes */
        iov[i].iov_len = pending_length;
        if( OPAL_LIKELY(NULL == iov[i].iov_base) )
            iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
        else
#if OPAL_CUDA_SUPPORT
            MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
#else
            MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
#endif
        pConv->bConverted = pConv->local_size;
        /* report how many iovec entries were actually used */
        *out_size = i + 1;
        pConv->flags |= CONVERTOR_COMPLETED;
        return 1;
    }

    /* every other case is delegated to the convertor's advance function */
    return pConv->fAdvance( pConv, iov, out_size, max_data );
}
Example #4
0
/*
 * Post a non-blocking receive.
 *
 * Allocates a thin receive request, initializes it from the arguments and
 * starts it.  Returns the status produced by the allocation macro if it is
 * not OMPI_SUCCESS, otherwise the status produced by the start macro.
 * *request is written only when the start status is OMPI_SUCCESS.
 */
int
mca_pml_cm_irecv(void *addr,
                 size_t count,
                 ompi_datatype_t * datatype,
                 int src,
                 int tag,
                 struct ompi_communicator_t *comm,
                 struct ompi_request_t **request)
{
    int ret;
    mca_pml_cm_thin_recv_request_t *recvreq;
    /* never read directly here; only handed to the INIT macro below
     * (presumably filled in by it -- confirm against the macro) */
    ompi_proc_t* ompi_proc;
    
    MCA_PML_CM_THIN_RECV_REQUEST_ALLOC(recvreq, ret);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != ret) ) return ret;
    
    MCA_PML_CM_THIN_RECV_REQUEST_INIT(recvreq,
                                      ompi_proc,
                                      comm,
                                      src,
                                      datatype,
                                      addr,
                                      count);
    
    MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);

    /* hand the request back only if it was actually started */
    if( OPAL_LIKELY(OMPI_SUCCESS == ret) ) *request = (ompi_request_t*) recvreq;

    return ret;
}
Example #5
0
/*
 * Post a non-blocking receive for an already matched message.
 *
 * The communicator and peer are taken from *message.  A thin receive
 * request is allocated, initialized and started through the "matched start"
 * macro.  Returns the allocation status if it is not OMPI_SUCCESS, otherwise
 * the status produced by the start macro.  *request is written only when the
 * start status is OMPI_SUCCESS.
 */
int
mca_pml_cm_imrecv(void *buf,
                  size_t count,
                  ompi_datatype_t *datatype,
                  struct ompi_message_t **message,
                  struct ompi_request_t **request)
{
    int ret;
    mca_pml_cm_thin_recv_request_t *recvreq;
    /* never read directly here; only handed to the INIT macro below
     * (presumably filled in by it -- confirm against the macro) */
    ompi_proc_t* ompi_proc;
    /* *message is dereferenced unconditionally: it must be a valid handle */
    ompi_communicator_t *comm = (*message)->comm;
    int peer = (*message)->peer;

    MCA_PML_CM_THIN_RECV_REQUEST_ALLOC(recvreq, ret);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != ret) ) return ret;
    
    MCA_PML_CM_THIN_RECV_REQUEST_INIT(recvreq,
                                      ompi_proc,
                                      comm,
                                      peer,
                                      datatype,
                                      buf,
                                      count);
    
    MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);

    /* hand the request back only if it was actually started */
    if( OPAL_LIKELY(OMPI_SUCCESS == ret) ) *request = (ompi_request_t*) recvreq;

    return ret;
}
Example #6
0
/**
 * Initiate a send to the peer.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL peer addressing
 * @param descriptor (IN)  fragment to send (a mca_btl_vader_frag_t)
 * @param tag (IN)         type of message, pt-2-pt, one-sided, etc
 *
 * @return 1 when the fragment went out through its fast box, or when it was
 *         queued on the peer's fifo and no completion callback is forced;
 *         0 when the fragment was queued and
 *         MCA_BTL_DES_SEND_ALWAYS_CALLBACK has been set on it.
 */
int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
                        struct mca_btl_base_endpoint_t *endpoint,
                        struct mca_btl_base_descriptor_t *descriptor,
                        mca_btl_base_tag_t tag)
{
    mca_btl_vader_frag_t *vfrag = (mca_btl_vader_frag_t *) descriptor;

    /* fast-box path: send and complete immediately */
    if (OPAL_LIKELY(vfrag->fbox)) {
        mca_btl_vader_fbox_send (vfrag->fbox, tag, vfrag->segments[0].seg_len);
        mca_btl_vader_frag_complete (vfrag);

        return 1;
    }

    /* fill in the header: length of header (+ optional inline data), tag */
    vfrag->hdr->len = vfrag->segments[0].seg_len;
    vfrag->hdr->tag = tag;

    /* post the relative address of the descriptor into the peer's fifo */
    vader_fifo_write_ep (vfrag->hdr, endpoint);

    if (!(vfrag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) &&
        (vfrag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
        /* data is gone (from the pml's perspective). frag callback/release
           will happen later */
        return 1;
    }

    /* single-copy fragment, or one the BTL does not own: force a callback */
    vfrag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

    return 0;
}
Example #7
0
/*
 * Retry the posts queued on device->pending_post.
 *
 * Up to as many descriptors as were queued on entry are removed (under the
 * device lock) and handed to mca_btl_ugni_repost().  The first descriptor
 * that cannot be reposted is put back at the head of the list and the loop
 * stops.
 *
 * Returns 0 when the list was empty on entry, 1 otherwise.
 */
static inline int
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
{
    int backlog = opal_list_get_size (&device->pending_post);

    /* check if there are any posts pending resources */
    if (OPAL_LIKELY(0 == backlog)) {
        return 0;
    }

    BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", backlog));

    for (int left = backlog ; left > 0 ; --left) {
        mca_btl_ugni_post_descriptor_t *desc;

        mca_btl_ugni_device_lock (device);
        desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
        mca_btl_ugni_device_unlock (device);

        if (NULL == desc) {
            /* somebody else drained the list */
            break;
        }

        if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, desc)) {
            /* still no resources: requeue at the front and give up for now */
            mca_btl_ugni_device_lock (device);
            opal_list_prepend (&device->pending_post, (opal_list_item_t *) desc);
            mca_btl_ugni_device_unlock (device);
            break;
        }
    }

    return 1;
}
/**
 * This function always works in local representation. This means no representation
 * conversion (i.e. no heterogeneity) has to be taken into account, and that all
 * lengths we're working on are local.
 */
int32_t
opal_convertor_raw( opal_convertor_t* pConvertor, 
		    struct iovec* iov, uint32_t* iov_count,
		    size_t* length )
{
    const opal_datatype_t *pData = pConvertor->pDesc;
    dt_stack_t* pStack;       /* pointer to the position on the stack */
    uint32_t pos_desc;        /* actual position in the description of the derived datatype */
    uint32_t count_desc;      /* the number of items already done in the actual pos_desc */
    dt_elem_desc_t* description, *pElem;
    unsigned char *source_base;  /* origin of the data */
    size_t raw_data = 0;      /* sum of raw data lengths in the iov_len fields */
    uint32_t index = 0, i;    /* the iov index and a simple counter */

    assert( (*iov_count) > 0 );
    if( OPAL_LIKELY(pConvertor->flags & CONVERTOR_NO_OP) ) {
        /* The convertor contain minimal informations, we only use the bConverted
         * to manage the conversion. This function work even after the convertor
         * was moved to a specific position.
         */
        opal_convertor_get_current_pointer( pConvertor, (void**)&iov[0].iov_base );
        iov[0].iov_len = pConvertor->local_size - pConvertor->bConverted;
        *length = iov[0].iov_len;
        pConvertor->bConverted = pConvertor->local_size;
        pConvertor->flags |= CONVERTOR_COMPLETED;
        *iov_count = 1;
        return 1;  /* we're done */
    }

    DO_DEBUG( opal_output( 0, "opal_convertor_raw( %p, {%p, %u}, %lu )\n", (void*)pConvertor,
                           (void*)iov, *iov_count, (unsigned long)*length ); );
/*
 * Map an incoming segment to the endpoint of the process that sent it.
 *
 * The sender value carried in the segment's BTL header (the hashed RTE
 * process name; the MAC/hardware address would only identify the sending
 * server, not the sending process) is first looked up in module->senders.
 * On a miss, ompi_btl_usnic_proc_lookup_endpoint() is consulted and a
 * successful result is cached in the hash table.
 *
 * Returns NULL when neither lookup finds the sender.
 */
static inline ompi_btl_usnic_endpoint_t *
lookup_sender(ompi_btl_usnic_module_t *module, ompi_btl_usnic_segment_t *seg)
{
    ompi_btl_usnic_endpoint_t *ep;
    int rc;

    /* JMS We've experimented with using a handshake before sending
       any data so that instead of looking up a hash on the
       btl_header->sender, echo back the ptr to the sender's
       ompi_proc.  There was limited speedup with this scheme; more
       investigation is required. */

    /* Fast path: the sender is already in the hash table */
    rc = opal_hash_table_get_value_uint64(&module->senders,
                                          seg->us_btl_header->sender,
                                          (void**) &ep);
    if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
        return ep;
    }

    /* Slow path: search for the endpoint, then remember the answer */
    ep = ompi_btl_usnic_proc_lookup_endpoint(module,
                                             seg->us_btl_header->sender);
    if (NULL == ep) {
        /* Whoa -- not found at all! */
        return NULL;
    }

    opal_hash_table_set_value_uint64(&module->senders,
                                     seg->us_btl_header->sender, ep);
    return ep;
}
Example #10
0
static inline map_segment_t *__find_va(const void* va)
{
    map_segment_t *s;

    if (OPAL_LIKELY((uintptr_t)va >= (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].seg_base_addr &&
                    (uintptr_t)va < (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) {
        s = &memheap_map->mem_segs[HEAP_SEG_INDEX];
    } else {
        s = bsearch(va,
                    &memheap_map->mem_segs[SYMB_SEG_INDEX],
                    memheap_map->n_segments - 1,
                    sizeof(*s),
                    _seg_cmp);
    }

#if MEMHEAP_BASE_DEBUG == 1
    if (s) {
        MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p",
                s - memheap_map->mem_segs,
                (long long)s->seg_base_addr,
                (long long)s->end,
                (long long)(s->end - s->seg_base_addr),
                (void *)va);
    }
#endif
    return s;
}
/*
 * Create an IDENTICAL copy of a convertor. In this context IDENTICAL means
 * that the datatype, the count and all other properties of the source
 * convertor are replicated on the destination. However, the references to
 * the datatype are not increased.
 *
 * Special care is taken with the stack. In all cases the stack is created
 * with the correct number of entries; if copy_stack is true (!= 0) the
 * content of the source stack is also copied, giving a convertor that is
 * ready to use starting from the source's position. If copy_stack is false
 * the destination has an empty stack (opal_convertor_set_position must be
 * called before using it).
 *
 * @param source       convertor to copy from
 * @param destination  convertor to fill in
 * @param copy_stack   non-zero to copy the stack content and position
 *
 * @return OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when a heap-allocated
 *         stack is required and cannot be allocated. In that case the
 *         destination has a NULL stack of size 0 and must not be used.
 */
int opal_convertor_clone( const opal_convertor_t* source,
                          opal_convertor_t* destination,
                          int32_t copy_stack )
{
    destination->remoteArch        = source->remoteArch;
    destination->flags             = source->flags;
    destination->pDesc             = source->pDesc;
    destination->use_desc          = source->use_desc;
    destination->count             = source->count;
    destination->pBaseBuf          = source->pBaseBuf;
    destination->fAdvance          = source->fAdvance;
    destination->master            = source->master;
    destination->local_size        = source->local_size;
    destination->remote_size       = source->remote_size;

    /* create the stack */
    if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
        destination->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * source->stack_size );
        if( OPAL_UNLIKELY(NULL == destination->pStack) ) {
            /* Without this check the memcpy below would write through a
             * NULL pointer. Record an empty stack so that nothing treats
             * the NULL pointer as a heap stack of source->stack_size
             * entries. */
            destination->stack_size = 0;
            destination->stack_pos  = -1;
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
    } else {
        destination->pStack = destination->static_stack;
    }
    destination->stack_size = source->stack_size;

    /* initialize the stack */
    if( OPAL_LIKELY(0 == copy_stack) ) {
        destination->bConverted = -1;
        destination->stack_pos  = -1;
    } else {
        memcpy( destination->pStack, source->pStack, sizeof(dt_stack_t) * (source->stack_pos+1) );
        destination->bConverted = source->bConverted;
        destination->stack_pos  = source->stack_pos;
    }
    return OPAL_SUCCESS;
}
Example #12
0
/*
 * Recover from an overrun of the remote SMSG completion queue.
 *
 * We don't know which endpoint lost an smsg completion, so the remote cq
 * is emptied and the mailbox of every connected endpoint is processed.
 *
 * Returns the sum of the non-negative results of mca_btl_ugni_smsg_process().
 */
static inline int
mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl)
{
    gni_cq_entry_t event_data;
    unsigned int idx;
    int processed = 0;
    int rc;

    BTL_VERBOSE(("btl/ugni_component detected SMSG CQ overrun. "
                 "processing message backlog..."));

    /* clear out remote cq: keep pulling events until GNI_RC_NOT_DONE */
    for (;;) {
        rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
        if (GNI_RC_NOT_DONE == rc) {
            break;
        }
    }

    for (idx = 0 ; idx < btl->endpoint_count ; ++idx) {
        mca_btl_base_endpoint_t *ep = btl->endpoints[idx];

        if (NULL != ep && MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
            /* clear out smsg mailbox; negative results are not counted */
            rc = mca_btl_ugni_smsg_process (ep);
            if (OPAL_LIKELY(rc >= 0)) {
                processed += rc;
            }
        }
    }

    return processed;
}
Example #13
0
/*
 * Move a convertor to the byte position *position, without validating it.
 *
 * A position of 0 simply resets the convertor to the beginning and returns
 * the result of that reset.  A position behind the current one also resets
 * the convertor first (rollback) and then advances to the requested point.
 * On return *position holds convertor->bConverted, which for a
 * non-contiguous send convertor may be lower than the requested value.
 */
int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
                                             size_t* position )
{
    int32_t status;

    /* going to the very beginning: the reset is all there is to do */
    if( 0 == (*position) ) {
        return opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
    }

    /**
     * If we plan to rollback the convertor then first we have to set it
     * at the beginning.
     */
    if( (*position) < convertor->bConverted ) {
        (void) opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
    }

    if( OPAL_LIKELY(convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
        status = opal_convertor_create_stack_with_pos_contig( convertor, (*position),
                                                              opal_datatype_local_sizes );
    } else {
        status = opal_convertor_generic_simple_position( convertor, position );
        /**
         * If we have a non-contiguous send convertor don't allow it to move
         * in the middle of a predefined datatype, it won't be able to copy
         * out the left-overs anyway. Instead force the position to stay on
         * predefined datatypes boundaries. As we allow partial predefined
         * datatypes on the contiguous case, we should be accepted by any
         * receiver convertor.
         */
        if( CONVERTOR_SEND & convertor->flags ) {
            convertor->bConverted -= convertor->partial_length;
            convertor->partial_length = 0;
        }
    }

    *position = convertor->bConverted;
    return status;
}
Example #14
0
/*
 * Blocking send over UCX.
 *
 * Looks up the UCX endpoint for (comm, dst), posts a non-blocking tagged
 * send and, if UCX returns a request handle, progresses the worker once and
 * waits for that request.
 *
 * Returns OMPI_SUCCESS when the send completed immediately or after the
 * wait, OMPI_ERROR when no endpoint is available or UCX reports an error.
 * The send mode is currently only traced, not acted on (see TODO).
 */
int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst,
                     int tag, mca_pml_base_send_mode_t mode,
                     struct ompi_communicator_t* comm)
{
    ompi_request_t *send_req;
    ucp_ep_h peer_ep;

    PML_UCX_TRACE_SEND("%s", buf, count, datatype, dst, tag, mode, comm, "send");

    /* TODO special care to sync/buffered send */

    peer_ep = mca_pml_ucx_get_ep(comm, dst);
    if (OPAL_UNLIKELY(NULL == peer_ep)) {
        PML_UCX_ERROR("Failed to get ep for rank %d", dst);
        return OMPI_ERROR;
    }

    send_req = (ompi_request_t*)ucp_tag_send_nb(peer_ep, buf, count,
                                                mca_pml_ucx_get_datatype(datatype),
                                                PML_UCX_MAKE_SEND_TAG(tag, comm),
                                                mca_pml_ucx_send_completion);

    /* NULL means the send already completed */
    if (OPAL_LIKELY(send_req == NULL)) {
        return OMPI_SUCCESS;
    }

    /* the returned pointer encodes an error status */
    if (UCS_PTR_IS_ERR(send_req)) {
        PML_UCX_ERROR("ucx send failed: %s", ucs_status_string(UCS_PTR_STATUS(send_req)));
        return OMPI_ERROR;
    }

    /* a real request: drive progress once, then block until it is done */
    PML_UCX_VERBOSE(8, "got request %p", (void*)send_req);
    ucp_worker_progress(ompi_pml_ucx.ucp_worker);
    ompi_request_wait(&send_req, MPI_STATUS_IGNORE);
    return OMPI_SUCCESS;
}
Example #15
0
/*
 * Insert a registration into the VMA registration cache.
 *
 * The registration covers reg->base .. reg->bound inclusive.  When a
 * non-zero limit is given and the registration is larger than that limit,
 * OMPI_ERR_OUT_OF_RESOURCE is returned without touching the cache.  If the
 * memory manager reports changes and provides a process hook, the hook is
 * run first and its failure code is returned.  After a successful insert
 * the region is handed to the memory manager's register hook.
 */
int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache,
        mca_mpool_base_registration_t* reg, size_t limit)
{
    mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache;
    size_t reg_size = reg->bound - reg->base + 1;
    int rc;

    /* the request is bigger than what the cache is allowed to hold */
    if(0 != limit && reg_size > limit) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Check to ensure that the cache is valid */
    if (OPAL_UNLIKELY(opal_memory_changed() &&
                      NULL != opal_memory->memoryc_process)) {
        rc = opal_memory->memoryc_process();
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    rc = mca_rcache_vma_tree_insert(vma_rcache, reg, limit);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* If we successfully registered, then tell the memory manager
           to start monitoring this region */
        opal_memory->memoryc_register(reg->base,
                                      (uint64_t) reg_size, (uint64_t) (uintptr_t) reg);
    }

    return rc;
}
Example #16
0
/*
 * Poll the out-of-band receive requests queued on memheap_oob.req_list and
 * process every message that has already arrived.
 *
 * The request at the head of the list is tested; when it has completed, its
 * payload is copied into a freshly allocated buffer, the receive is started
 * again and re-queued at the tail of the list, and only then is the copy
 * handed to do_recv().  The loop ends at the first request that has not
 * completed yet.
 *
 * Returns the number of completed requests that were taken off the list,
 * including one whose processing was aborted by an allocation or
 * MPI_Start() failure.
 */
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while (1) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            /* head of the list has not completed: nothing more to do */
            return n;
        }
        MPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer
         */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        memcpy(tmp_buf, (void*)&r->buf, size);
        msg = OBJ_NEW(opal_buffer_t);
        if (NULL == msg) {
            /* nothing owns the copy yet, so release it here */
            free(tmp_buf);
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        opal_dss.load(msg, (void*)tmp_buf, size);

        rc = MPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            /* drop the message object; as on the normal path below,
             * releasing msg is what disposes of the loaded payload */
            OBJ_RELEASE(msg);
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);

        do_recv(status.MPI_SOURCE, msg);
        OBJ_RELEASE(msg);

        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }
}
Example #17
0
/*
 * Build a send descriptor for the data held by a convertor.
 *
 * In-place path (BTL allows in-place sends, the convertor needs no
 * buffers, reserve <= 128): a DMA fragment is allocated with two segments,
 * the reserved header space and the user data itself.
 *
 * Buffered path (everything else): an eager fragment is allocated and up
 * to *size bytes are packed right after the reserved header space; *size
 * is updated to the number of bytes actually packed.
 *
 * Returns the fragment's base descriptor, or NULL when no fragment could
 * be allocated or packing failed (the fragment is returned in that case).
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
                               mca_btl_base_endpoint_t *endpoint,
                               struct opal_convertor_t *convertor,
                               uint8_t order, size_t reserve, size_t *size,
                               uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;

    if (OPAL_LIKELY((mca_btl_scif_module.super.btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) &&
                    !opal_convertor_need_buffers (convertor) &&
                    reserve <= 128)) {
        /* inplace send */
        void *user_data;

        opal_convertor_get_current_pointer (convertor, &user_data);

        (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segments[0].seg_len       = reserve;
        frag->segments[1].seg_addr.pval = user_data;
        frag->segments[1].seg_len       = *size;
        frag->base.des_segment_count = 2;
    } else {
        /* buffered send */
        (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        if (0 != *size) {
            struct iovec vec;
            uint32_t vec_count = 1;
            size_t packed = *size;
            int status;

            /* pack directly behind the reserved header bytes */
            vec.iov_len  = *size;
            vec.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);

            status = opal_convertor_pack (convertor, &vec, &vec_count, &packed);
            if (OPAL_UNLIKELY(status < 0)) {
                mca_btl_scif_frag_return (frag);
                return NULL;
            }
            *size = packed;
        }

        frag->segments[0].seg_len = reserve + *size;
        frag->base.des_segment_count = 1;
    }

    frag->base.des_segments = frag->segments;
    frag->base.order        = order;
    frag->base.des_flags    = flags;

    return &frag->base;
}
Example #18
0
/*
 * Register a memory region for use with one endpoint.
 *
 * The endpoint must be a specific one (MCA_BTL_ENDPOINT_ANY is rejected)
 * and must be connected; a connection attempt is made if it is not.  The
 * region is registered through the BTL's mpool and, if this peer has no
 * scif offset for it yet, registered with scif_register() read/write.
 *
 * Returns the per-endpoint registration handle, or NULL when the endpoint
 * is ANY, is still not connected, or the mpool registration fails.
 *
 * NOTE(review): the value returned by scif_register() is stored without
 * being checked, so a failed registration appears to hand back a handle
 * whose offset is invalid -- confirm and decide on the proper cleanup.
 */
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
        mca_btl_base_endpoint_t *endpoint,
        void *base, size_t size, uint32_t flags)
{
    mca_btl_base_registration_handle_t *handle;
    mca_btl_scif_reg_t *scif_reg;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    if (MCA_BTL_ENDPOINT_ANY == endpoint) {
        /* it probably isn't possible to support registering memory to use with any endpoint so
         * return NULL */
        return NULL;
    }

    if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
        /* the endpoint needs to be connected before the fragment can be
         * registered. Only the resulting state matters here. */
        (void) mca_btl_scif_ep_connect (endpoint);
        if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
            /* not yet connected */
            return NULL;
        }
    }

    rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, access_flags,
                                        (mca_mpool_base_registration_t **) &scif_reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    handle = &scif_reg->handles[endpoint->id].btl_handle;

    /* register the memory location with this peer if it isn't already */
    if ((off_t) -1 == handle->scif_offset) {
        size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1;

        /* NTH: until we determine a way to pass permissions to the mpool just make all segments
         * read/write */
        handle->scif_offset =
            scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ |
                           SCIF_PROT_WRITE, 0);
        BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
                     (unsigned long) handle->scif_offset));
    }

    return handle;
}
Example #19
0
/*
 * Start a passive-target access epoch on one target (lock a window).
 *
 * Fails with OMPI_ERR_RMA_SYNC when the window was created with no_locks,
 * or when a global epoch is active that is either not a lock epoch or
 * conflicts with a request for an exclusive lock.  Otherwise a sync object
 * describing the lock is allocated, the target peer is retained and, unless
 * MPI_MODE_NOCHECK was asserted, the lock is acquired through
 * ompi_osc_rdma_lock_atomic_internal().  On success the epoch counter is
 * incremented and the sync object is inserted into the module's lock table;
 * on failure the sync object is released.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC, OMPI_ERR_OUT_OF_RESOURCE, or the
 * error from the internal lock routine.
 */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active) {
        /* impossible to get an exclusive lock while holding a global shared lock or in a active
         * target access epoch */
        if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type) {
            return OMPI_ERR_RMA_SYNC;
        }
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* describe the lock ... */
    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.type = lock_type;
    lock->sync.lock.target = target;
    lock->sync.lock.assert = assert;

    /* ... and attach its single peer, which the sync object now references */
    lock->num_peers = 1;
    lock->peer_list.peer = peer;
    OBJ_RETAIN(peer);

    /* with MPI_MODE_NOCHECK the lock is not actually acquired */
    if (!(assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OBJ_RELEASE(lock);
    } else {
        ++module->passive_target_access_epoch;

        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}
Example #20
0
/**
 * Send a FIN to the peer. If the FIN cannot be sent now (no descriptor
 * could be allocated, or the send itself failed), it is added to the list
 * of pending FINs, which guarantees that it will be sent later.
 *
 * @return OMPI_SUCCESS when the FIN was handed to the BTL,
 *         OMPI_ERR_OUT_OF_RESOURCE when it was queued as pending instead.
 */
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                          mca_bml_base_btl_t* bml_btl,
                          ompi_ptr_t hdr_des,
                          uint8_t order,
                          uint32_t status )
{
    mca_btl_base_descriptor_t* fin_des;
    mca_pml_ob1_fin_hdr_t* fin_hdr;
    int sent;

    mca_bml_base_alloc(bml_btl, &fin_des, order, sizeof(mca_pml_ob1_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

    if(NULL == fin_des) {
        /* no descriptor available right now: retry later */
        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    fin_des->des_cbdata = NULL;
    fin_des->des_cbfunc = mca_pml_ob1_fin_completion;

    /* fill in header */
    fin_hdr = (mca_pml_ob1_fin_hdr_t*)fin_des->des_src->seg_addr.pval;
    fin_hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
    fin_hdr->hdr_common.hdr_flags = 0;
    fin_hdr->hdr_des = hdr_des;
    fin_hdr->hdr_fail = status;

    ob1_hdr_hton(fin_hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);

    /* queue request */
    sent = mca_bml_base_send( bml_btl,
                              fin_des,
                              MCA_PML_OB1_HDR_TYPE_FIN );
    if( !OPAL_LIKELY( sent >= 0 ) ) {
        /* the send failed: give the descriptor back and retry later */
        mca_bml_base_free(bml_btl, fin_des);
        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if( OPAL_LIKELY( 1 == sent ) ) {
        MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
    }
    return OMPI_SUCCESS;
}
/*
 * Progress connection datagrams.
 *
 * Probes for a completed datagram; when there is one, waits for it on the
 * matching endpoint handle (the wildcard endpoint or the endpoint indexed
 * by the low 32 bits of the datagram id), advances the connection of the
 * endpoint named by the returned remote id and, if that endpoint is now
 * connected, processes its smsg mailbox.  A wildcard datagram is reposted
 * afterwards.
 *
 * Returns 0 when no datagram has completed, the converted GNI error when
 * the wait fails, otherwise the result of mca_btl_ugni_smsg_process() (or
 * 0 when the endpoint is not connected yet).
 */
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *peer_ep;
    gni_post_state_t post_state;
    gni_ep_handle_t dgram_handle;
    uint64_t datagram_id;
    gni_return_t grc;
    int is_wildcard;
    int processed = 0;

    /* check for datagram completion */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        return 0;
    }

    /* the id is passed by value below, so this test stays valid throughout */
    is_wildcard = ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) ==
                   MCA_BTL_UGNI_CONNECT_WILDCARD_ID);

    if (is_wildcard) {
        dgram_handle = ugni_module->wildcard_ep;
    } else {
        dgram_handle =
            ugni_module->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->smsg_ep_handle;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    grc = GNI_EpPostDataWaitById (dgram_handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "peer = %d", datagram_id, post_state, remote_id));

    peer_ep = ugni_module->endpoints[remote_id];

    /* NTH: TODO -- error handling */
    (void) mca_btl_ugni_ep_connect_progress (peer_ep);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == peer_ep->state) {
        /*  process messages waiting in the endpoint's smsg mailbox */
        processed = mca_btl_ugni_smsg_process (peer_ep);
    }

    /* repost the wildcard datagram */
    if (is_wildcard) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return processed;
}
Example #22
0
/**
 * Start a non-blocking send to rank dst of comm.
 *
 * For any mode other than synchronous the message is first handed to
 * mca_pml_ob1_send_inline; if that reports a non-negative result the caller
 * receives the shared ompi_request_empty request. Otherwise a send request
 * is allocated, initialized and started with the sequence number that was
 * reserved up front.
 *
 * @return OMPI_ERR_UNREACH if no BML endpoint exists for the peer,
 *         OMPI_ERR_OUT_OF_RESOURCE if no send request could be allocated,
 *         OMPI_SUCCESS on the inline path, otherwise the code produced by
 *         MCA_PML_OB1_SEND_REQUEST_START_W_SEQ.
 */
int mca_pml_ob1_isend(const void *buf,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int dst,
                      int tag,
                      mca_pml_base_send_mode_t sendmode,
                      ompi_communicator_t * comm,
                      ompi_request_t ** request)
{
    mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst);
    ompi_proc_t *dst_proc = ob1_proc->ompi_proc;
    mca_bml_base_endpoint_t *endpoint = mca_bml_base_get_endpoint (dst_proc);
    mca_pml_ob1_send_request_t *sendreq = NULL;
    int16_t seqn;
    int rc;

    /* without an endpoint there is no way to reach the peer */
    if (OPAL_UNLIKELY(NULL == endpoint)) {
        return OMPI_ERR_UNREACH;
    }

    /* reserve the next sequence number for this peer */
    seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1);

    /* fast path: everything except synchronous sends may go out inline */
    if (sendmode != MCA_PML_BASE_SEND_SYNCHRONOUS) {
        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn,
                                      dst_proc, endpoint, comm);
        if (OPAL_LIKELY(rc >= 0)) {
            /* NTH: it is legal to return ompi_request_empty since the only valid
             * field in a send completion status is whether or not the send was
             * cancelled (which it can't be at this point anyway). */
            *request = &ompi_request_empty;
            return OMPI_SUCCESS;
        }
    }

    /* slow path: build and start a full send request */
    MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
    if (NULL == sendreq) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, buf, count, datatype,
                                  dst, tag, comm, sendmode, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &sendreq->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc);

    /* the request is handed back even if starting it reported an error */
    *request = (ompi_request_t *) sendreq;
    return rc;
}
Example #23
0
/**
 * Send a FIN message to the peer.
 *
 * If the FIN cannot be sent now (no descriptor could be allocated, or the
 * send itself failed) it is appended to the list of pending FINs so that it
 * can be sent later, and OMPI_ERR_OUT_OF_RESOURCE is returned.
 *
 * @return OMPI_SUCCESS if the send was accepted,
 *         OMPI_ERR_OUT_OF_RESOURCE if the FIN was queued as pending.
 */
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                          mca_bml_base_btl_t* bml_btl,
                          opal_ptr_t hdr_frag,
                          uint64_t rdma_size,
                          uint8_t order,
                          int status )
{
    mca_btl_base_descriptor_t* fin;
    int rc;

    mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);

    if (NULL != fin) {
        /* both header helpers operate on the descriptor's first segment */
        void *hdr = fin->des_segments->seg_addr.pval;

        fin->des_cbfunc = mca_pml_ob1_fin_completion;
        fin->des_cbdata = NULL;

        /* fill in header: a non-zero status is sent in place of the size */
        mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) hdr,
                                     0, hdr_frag.lval, status ? status : (int64_t) rdma_size);

        ob1_hdr_hton((mca_pml_ob1_hdr_t *) hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);

        /* queue request */
        rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
        if( OPAL_LIKELY( rc >= 0 ) ) {
            if( OPAL_LIKELY( 1 == rc ) ) {
                MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
            }
            return OMPI_SUCCESS;
        }

        /* the send was rejected: give the descriptor back before queueing */
        mca_bml_base_free(bml_btl, fin);
    }

    /* shared failure tail: remember the FIN so it can be retried later */
    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
Example #24
0
/**
 * Initialize an endpoint handle: record the owning device, create a uGNI
 * endpoint on the given completion queue and bind it to the remote
 * address/id stored in ep.
 *
 * @return the uGNI return code of the last call made, converted with
 *         mca_btl_rc_ugni_to_opal.
 */
int mca_btl_ugni_ep_handle_init (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
                                 mca_btl_ugni_device_t *device, mca_btl_ugni_endpoint_handle_t *ep_handle)
{
    gni_return_t status;

    ep_handle->device = device;

    /* create the uGNI endpoint handle; nothing to bind if this fails */
    status = GNI_EpCreate (device->dev_handle, cq, &ep_handle->gni_handle);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
        return mca_btl_rc_ugni_to_opal (status);
    }

    /* bind the new handle to the remote peer */
    status = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id);

    return mca_btl_rc_ugni_to_opal (status);
}
Example #25
0
/*
 * Blocking receive: obtains a receive request, initializes and starts it,
 * then waits for it to complete before reading its status.
 *
 * Returns OMPI_ERR_TEMP_OUT_OF_RESOURCE if no request could be obtained.
 *
 * NOTE(review): this listing stops inside the function body (after the
 * MEMCHECKER call); the remainder of the function is not shown here.
 */
int mca_pml_ob1_recv(void *addr,
                     size_t count,
                     ompi_datatype_t * datatype,
                     int src,
                     int tag,
                     struct ompi_communicator_t *comm,
                     ompi_status_public_t * status)
{
    mca_pml_ob1_recv_request_t *recvreq = NULL;
    int rc;

    /* when not running thread-multiple, take the cached request (if any)
       and clear the cache slot */
    if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) {
        recvreq = mca_pml_ob1_recvreq;
        mca_pml_ob1_recvreq = NULL;
    }

    /* no cached request available: allocate one */
    if( OPAL_UNLIKELY(NULL == recvreq) ) {
        MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
        if (NULL == recvreq)
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
    MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype,
                                  src, tag, comm, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(recvreq->req_recv.req_base),
                             PERUSE_RECV);

    /* start the request and block until it completes */
    MCA_PML_OB1_RECV_REQUEST_START(recvreq);
    ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);

    if (NULL != status) {  /* return status */
        *status = recvreq->req_recv.req_base.req_ompi.req_status;
    }

    /* the function's result is the MPI_ERROR field of the request status */
    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;

    if (recvreq->req_recv.req_base.req_pml_complete) {
        /* make buffer defined when the request is completed,
           and before releasing the objects. */
        MEMCHECKER(
            memchecker_call(&opal_memchecker_base_mem_defined,
                            recvreq->req_recv.req_base.req_addr,
                            recvreq->req_recv.req_base.req_count,
                            recvreq->req_recv.req_base.req_datatype);
        );
Example #26
0
/*
 * Blocking receive.
 *
 * A receive request is taken from the single-entry cache
 * (mca_pml_ob1_recvreq) when not running thread-multiple, or allocated
 * otherwise. The request is initialized, started and waited on. Afterwards
 * it is either returned to its allocator or finalized and put back into the
 * cache.
 *
 * Returns OMPI_ERR_TEMP_OUT_OF_RESOURCE if no request could be obtained,
 * otherwise the MPI_ERROR field of the completed request's status.
 */
int mca_pml_ob1_recv(void *addr,
                     size_t count,
                     ompi_datatype_t * datatype,
                     int src,
                     int tag,
                     struct ompi_communicator_t *comm,
                     ompi_status_public_t * status)
{
    int rc;
    mca_pml_ob1_recv_request_t *recvreq = NULL;

    /* grab the cached request, leaving the cache empty while it is in use */
    if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) {
        recvreq = mca_pml_ob1_recvreq;
        mca_pml_ob1_recvreq = NULL;
    }

    /* nothing cached (or thread-multiple): fall back to the allocator */
    if (OPAL_UNLIKELY(NULL == recvreq)) {
        MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
        if (NULL == recvreq) {
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
    MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype,
                                  src, tag, comm, false);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(recvreq->req_recv.req_base),
                             PERUSE_RECV);

    /* post the receive and block until it has completed */
    MCA_PML_OB1_RECV_REQUEST_START(recvreq);
    ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);

    /* capture the results before the request is released or recycled */
    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
    if (NULL != status) {
        *status = recvreq->req_recv.req_base.req_ompi.req_status;
    }

    if (OPAL_LIKELY(!ompi_mpi_thread_multiple && NULL == mca_pml_ob1_recvreq)) {
        /* cache slot is free: finalize the request and keep it for reuse */
        mca_pml_ob1_recv_request_fini (recvreq);
        mca_pml_ob1_recvreq = recvreq;
    } else {
        /* thread-multiple, or the slot is already occupied: give it back */
        MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
    }

    return rc;
}
Example #27
0
/**
 * Look up the memory key for address va on processing element pe.
 *
 * The segment containing va is located first. For the local PE the
 * segment's own key is returned and *rva is set to va. For a remote PE the
 * per-PE key cache of the segment is consulted; on a miss the keys are
 * fetched with memheap_oob_get_mkeys and stored in the cache. For remote
 * PEs *rva is computed with va2rva from the segment base address and the
 * key's va_base.
 *
 * @param pe      target processing element
 * @param va      local virtual address
 * @param btl_id  index of the transport whose key is wanted
 * @param rva     [out] address to use on the target PE
 *
 * @return pointer to the key, or NULL if va is not in a valid segment, the
 *         cache entry cannot be allocated, or the key fetch fails (in which
 *         case *rva is left untouched).
 */
sshmem_mkey_t * mca_memheap_base_get_cached_mkey(int pe,
                                                   void* va,
                                                   int btl_id,
                                                   void** rva)
{
    map_segment_t *s;
    int rc;
    sshmem_mkey_t *mkey;

    MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p", pe, va);
    s = __find_va(va);
    if (NULL == s) {
        return NULL;
    }

    if (!MAP_SEGMENT_IS_VALID(s)) {
        return NULL;
    }

    /* local PE: the address is used as-is with the segment's own key */
    if (pe == oshmem_my_proc_id()) {
        *rva = va;
        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (local) %lx %p", pe, va,
                s->mkeys[btl_id].u.key, *rva);
        return &s->mkeys[btl_id];
    }

    /* remote PE, keys already cached */
    if (OPAL_LIKELY(s->mkeys_cache[pe])) {
        mkey = &s->mkeys_cache[pe][btl_id];
        *rva = va2rva(va, s->seg_base_addr, mkey->va_base);
        MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva);
        return mkey;
    }

    /* cache miss: allocate one key slot per transport and fetch the keys */
    s->mkeys_cache[pe] = calloc(memheap_map->num_transports,
                                sizeof(sshmem_mkey_t));
    if (!s->mkeys_cache[pe]) {
        return NULL;
    }

    rc = memheap_oob_get_mkeys(pe,
                               s - memheap_map->mem_segs,
                               s->mkeys_cache[pe]);
    if (OSHMEM_SUCCESS != rc) {
        /* Do not leave the zero-filled array in the cache: a non-NULL entry
         * would make the next lookup take the cached path above and hand
         * out keys that were never filled in. Drop it so a later call
         * retries the fetch. */
        free(s->mkeys_cache[pe]);
        s->mkeys_cache[pe] = NULL;
        return NULL;
    }

    mkey = &s->mkeys_cache[pe][btl_id];
    *rva = va2rva(va, s->seg_base_addr, mkey->va_base);

    MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva);
    return mkey;
}
Example #28
0
/**
 * Send a disconnect-tagged smsg to the peer, retrying for as long as the
 * send reports GNI_RC_NOT_DONE. Between attempts the remote smsg completion
 * queue is progressed.
 *
 * @return the final send result converted with mca_btl_rc_ugni_to_opal.
 */
static int mca_btl_ugni_ep_send_disconnect (mca_btl_base_endpoint_t *ep)
{
    int rc = mca_btl_ugni_endpoint_smsg_send_wtag (ep, NULL, 0, NULL, 0, -1, MCA_BTL_UGNI_TAG_DISCONNECT);

    while (OPAL_UNLIKELY(GNI_RC_NOT_DONE == rc)) {
        /* most likely got here because we are out of credits. check the remote CQ to get credit return */
        (void) mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_ep_btl (ep));

        rc = mca_btl_ugni_endpoint_smsg_send_wtag (ep, NULL, 0, NULL, 0, -1, MCA_BTL_UGNI_TAG_DISCONNECT);
    }

    return mca_btl_rc_ugni_to_opal (rc);
}
/**
 * Deregistration callback: deregister the memory described by device_data
 * through the mpool's resources and, if that succeeds, return the
 * registration object to the mpool's free list.
 *
 * @param device_data   the registration (mca_mpool_base_registration_t *)
 * @param dreg_context  the udreg mpool module (mca_mpool_udreg_module_t *)
 *
 * @return always 0; a deregistration failure is not reported.
 */
static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context)
{
    mca_mpool_base_registration_t *reg = (mca_mpool_base_registration_t *) device_data;
    mca_mpool_udreg_module_t *module = (mca_mpool_udreg_module_t *) dreg_context;
    int dereg_rc = module->resources.deregister_mem(module->resources.reg_data, reg);

    /* might be worth printing out a warning if an error occurs here */
    if (OPAL_LIKELY(OMPI_SUCCESS == dereg_rc)) {
        OMPI_FREE_LIST_RETURN_MT(&module->reg_list,
                                 (ompi_free_list_item_t *) reg);
    }

    return 0;
}
/**
 * Prepare a source descriptor, dispatching on reserve: a non-zero reserve
 * goes to mca_btl_ugni_prepare_src_send, a zero reserve to
 * mca_btl_ugni_prepare_src_rdma (the only path that uses registration).
 *
 * @return whatever the selected helper returns.
 */
static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          mca_mpool_base_registration_t *registration,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags)
{
    /* no header space requested: take the RDMA path */
    if (OPAL_UNLIKELY(0 == reserve)) {
        return mca_btl_ugni_prepare_src_rdma (btl, endpoint, registration,
                                              convertor, order, size, flags);
    }

    return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
                                          order, reserve, size, flags);
}