Example #1
static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
    ucs_status_t status;
    gni_return_t ugni_rc;
    unsigned int bytes_per_mbox;
    gni_smsg_attr_t smsg_attr;

    pthread_mutex_lock(&uct_ugni_global_lock);

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, dev_name, &uct_ugni_smsg_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.smsg_seg_size       = 2048;
    self->config.rx_headroom         = rx_headroom;
    self->config.smsg_max_retransmit = 16;
    self->config.smsg_max_credit     = 8;
    self->smsg_id                    = 0;
    self->user_desc                  = NULL; /* set on first GET_TX_DESC; checked in cleanup */

    smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
    smsg_attr.mbox_maxcredit = self->config.smsg_max_credit;
    smsg_attr.msg_maxsize = self->config.smsg_seg_size;

    ugni_rc = GNI_SmsgBufferSizeNeeded(&smsg_attr, &bytes_per_mbox);
    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg buffer size calculation failed");
        status = UCS_ERR_INVALID_PARAM;
        goto exit;
    }

    /* Only use bytes_per_mbox after the call above succeeded */
    self->bytes_per_mbox = ucs_align_up_pow2(bytes_per_mbox, ucs_get_page_size());

    status = ucs_mpool_init(&self->free_desc,
                            0,
                            self->config.smsg_seg_size + sizeof(uct_ugni_smsg_desc_t),
                            0,
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128,                          /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_smsg_desc_mpool_ops,
                            "UGNI-SMSG-DESC");

    if (UCS_OK != status) {
        ucs_error("Desc Mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_mbox,
                            0,
                            self->bytes_per_mbox + sizeof(uct_ugni_smsg_mbox_t),
                            sizeof(uct_ugni_smsg_mbox_t),
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128,                          /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_smsg_mbox_mpool_ops,
                            "UGNI-SMSG-MBOX");

    if (UCS_OK != status) {
        ucs_error("Mbox Mpool creation failed");
        goto clean_desc;
    }

    UCT_TL_IFACE_GET_TX_DESC(&self->super.super, &self->free_desc,
                             self->user_desc, self->user_desc = NULL);

    status = ugni_smsg_activate_iface(self);
    if (UCS_OK != status) {
        ucs_error("Failed to activate the interface");
        goto clean_mbox;
    }

    ugni_rc = GNI_SmsgSetMaxRetrans(self->super.nic_handle, self->config.smsg_max_retransmit);

    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg setting max retransmit count failed.");
        status = UCS_ERR_INVALID_PARAM;
        goto clean_iface;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * udt layer so each ugni layer will have own progress */
    uct_worker_progress_register(worker, uct_ugni_smsg_progress, self);
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return UCS_OK;

 clean_iface:
    ugni_smsg_deactivate_iface(self);
 clean_mbox:
    ucs_mpool_cleanup(&self->free_mbox, 1);
 clean_desc:
    if (self->user_desc != NULL) {
        ucs_mpool_put(self->user_desc);
    }
    ucs_mpool_cleanup(&self->free_desc, 1);
 exit:
    ucs_error("Failed to create SMSG interface");
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return status;
}
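The init function above leans on a common C idiom: cleanup labels ordered in reverse order of acquisition, so a failure at step N jumps to the label that undoes step N-1 and then falls through every earlier release. Here is a minimal, self-contained sketch of the idiom; the resource names are illustrative and not part of the UCX API:

#include <stdlib.h>

static int init_two_resources(void **a_out, void **b_out)
{
    void *a, *b;

    a = malloc(64);               /* step 1 */
    if (a == NULL) {
        goto err;
    }

    b = malloc(64);               /* step 2 */
    if (b == NULL) {
        goto err_free_a;          /* only step 1 needs undoing */
    }

    *a_out = a;
    *b_out = b;
    return 0;                     /* success skips the ladder entirely */

    /* Labels in reverse acquisition order: jumping to any label
     * releases that resource and falls through to all earlier ones. */
err_free_a:
    free(a);
err:
    return -1;
}

int main(void)
{
    void *a = NULL, *b = NULL;
    if (init_two_resources(&a, &b) != 0) {
        return 1;
    }
    free(b);
    free(a);
    return 0;
}

This ordering is exactly what the corrected clean_iface/clean_mbox/clean_desc chain in the example relies on: each goto target must release only what was acquired before the failing step.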
Example #2
ucs_status_t uct_mem_alloc(size_t min_length, uct_alloc_method_t *methods,
                           unsigned num_methods, uct_pd_h *pds, unsigned num_pds,
                           const char *alloc_name, uct_allocated_memory_t *mem)
{
    uct_alloc_method_t *method;
    uct_pd_attr_t pd_attr;
    ucs_status_t status;
    size_t alloc_length;
    unsigned pd_index;
    uct_mem_h memh;
    uct_pd_h pd;
    void *address;
    int shmid;

    if (min_length == 0) {
        ucs_error("Allocation length cannot be 0");
        return UCS_ERR_INVALID_PARAM;
    }

    if (num_methods == 0) {
        ucs_error("No allocation methods provided");
        return UCS_ERR_INVALID_PARAM;
    }

    for (method = methods; method < methods + num_methods; ++method) {
        ucs_debug("trying allocation method %s", uct_alloc_method_names[*method]);

        switch (*method) {
        case UCT_ALLOC_METHOD_PD:
            /* Allocate with one of the specified protection domains */
            for (pd_index = 0; pd_index < num_pds; ++pd_index) {
                pd = pds[pd_index];
                status = uct_pd_query(pd, &pd_attr);
                if (status != UCS_OK) {
                    ucs_error("Failed to query PD");
                    return status;
                }

                /* Check if PD supports allocation */
                if (!(pd_attr.cap.flags & UCT_PD_FLAG_ALLOC)) {
                    continue;
                }

                /* Allocate memory using the PD.
                 * If the allocation fails, it's considered an error and we don't
                 * fall-back, because this PD already exposed support for memory
                 * allocation.
                 */
                alloc_length = min_length;
                status = uct_pd_mem_alloc(pd, &alloc_length, &address,
                                          alloc_name, &memh);
                if (status != UCS_OK) {
                    ucs_error("failed to allocate %zu bytes using pd %s: %s",
                              alloc_length, pd->component->name,
                              ucs_status_string(status));
                    return status;
                }
                mem->pd   = pd;
                mem->memh = memh;
                goto allocated;
            }
            break;
            break;

        case UCT_ALLOC_METHOD_HEAP:
            /* Allocate aligned memory using libc allocator */
            alloc_length = min_length;
            address = ucs_memalign(UCS_SYS_CACHE_LINE_SIZE, alloc_length
                                   UCS_MEMTRACK_VAL);
            if (address != NULL) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to allocate %zu bytes from the heap", alloc_length);
            break;

        case UCT_ALLOC_METHOD_MMAP:
            /* Request memory from operating system using mmap() */
            alloc_length = ucs_align_up_pow2(min_length, ucs_get_page_size());
            address = ucs_mmap(NULL, alloc_length, PROT_READ|PROT_WRITE,
                               MAP_PRIVATE|MAP_ANON, -1, 0 UCS_MEMTRACK_VAL);
            if (address != MAP_FAILED) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to mmap %zu bytes: %m", alloc_length);
            break;

        case UCT_ALLOC_METHOD_HUGE:
            /* Allocate huge pages */
            alloc_length = min_length;
            status = ucs_sysv_alloc(&alloc_length, &address, SHM_HUGETLB, &shmid
                                    UCS_MEMTRACK_VAL);
            if (status == UCS_OK) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to allocate %zu bytes from hugetlb: %s",
                      min_length, ucs_status_string(status));
            break;

        default:
            ucs_error("Invalid allocation method %d", *method);
            return UCS_ERR_INVALID_PARAM;
        }
    }

    ucs_debug("Could not allocate memory with any of the provided methods");
    return UCS_ERR_NO_MEMORY;

allocated_without_pd:
    mem->pd      = NULL;
    mem->memh    = UCT_INVALID_MEM_HANDLE;
allocated:
    ucs_debug("allocated %zu bytes at %p using %s", alloc_length, address,
              (mem->pd == NULL) ? uct_alloc_method_names[*method]
                                : mem->pd->component->name);
    mem->address = address;
    mem->length  = alloc_length;
    mem->method  = *method;
    return UCS_OK;
}
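A caller passes the methods in descending order of preference; the loop above commits to the first one that succeeds. The following is a hedged usage sketch against the signature shown: alloc_rx_region is a hypothetical helper, the 65536-byte size and "rx_region" name are arbitrary, the pd handle is assumed to come from an already-opened protection domain (setup not shown), and uct_mem_free is the matching release call from the same API family:

static ucs_status_t alloc_rx_region(uct_pd_h pd, uct_allocated_memory_t *mem)
{
    /* Preference order: PD-backed allocation first, then huge pages,
     * then plain mmap, and finally the heap as a last resort. */
    uct_alloc_method_t methods[] = {
        UCT_ALLOC_METHOD_PD,
        UCT_ALLOC_METHOD_HUGE,
        UCT_ALLOC_METHOD_MMAP,
        UCT_ALLOC_METHOD_HEAP
    };
    ucs_status_t status;

    status = uct_mem_alloc(65536, methods, 4, &pd, 1, "rx_region", mem);
    if (status != UCS_OK) {
        return status; /* every method was tried and failed */
    }

    /* mem->method records which method actually satisfied the request;
     * mem->length may exceed the requested size due to page rounding. */
    return UCS_OK;
}

Note the asymmetric error handling in the original: a PD that advertises UCT_PD_FLAG_ALLOC but then fails is treated as a hard error, while the OS-level methods merely log at debug level and fall through to the next candidate.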
Example #3
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+-------+
 * SEND       | CTRL   | INL | am_id | am_hdr | DPSEG |
 *            +--------+-----+-------+--------+-------+
 * RDMA_WRITE | CTRL   | RADDR   | DPSEG  |
 *            +--------+---------+--------+-------+
 * ATOMIC     | CTRL   | RADDR   | ATOMIC | DPSEG |
 *            +--------+---------+--------+-------+
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr, unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare, uint64_t swap_add,
                         int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;

    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t        *rch;
    unsigned            wqe_size, inl_seg_size;
    uint8_t             opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl = ep->tx.wq.curr;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        inl_seg_size     = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                             UCT_IB_MLX5_WQE_SEG_SIZE);

        ucs_assert(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr) <=
                   UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB);
        ucs_assert(length + sizeof(*rch) + am_hdr_len <=
                   iface->super.super.config.seg_size);

        /* Inline segment with AM ID and header */
        inl              = (void*)(ctrl + 1);
        inl->byte_count  = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch              = (void*)(inl + 1);
        rch->am_id       = am_id;

        uct_ib_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, &ep->tx.wq);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size     = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size     = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr         = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.wq.qend)) {
                dptr = (void*)dptr - (ep->tx.wq.qend - ep->tx.wq.qstart);
            }

            ucs_assert((void*)dptr       >= ep->tx.wq.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.wq.qend);
            uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        ucs_assert(length < (2ul << 30));
        ucs_assert(length <= iface->super.super.config.seg_size);

        wqe_size         = sizeof(*ctrl) + sizeof(*dptr);
        uct_ib_mlx5_set_data_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        ucs_assert(length <= UCT_IB_MAX_MESSAGE_SIZE);

        raddr            = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size     = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size     = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_ib_mlx5_set_data_seg((void*)(raddr + 1), buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic            = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add  = swap_add;

        uct_ib_mlx5_set_data_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size          = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                            sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2); /* Ext. atomic, size 2**2 */
            masked_cswap32 = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64 = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.wq.qend);
            if (dptr == ep->tx.wq.qend) {
                dptr = ep->tx.wq.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.wq.qend);
            }
            break;
        default:
            ucs_assert(0); /* unreachable: only 32/64-bit masked cswap is supported */
        }

        uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_ib_mlx5_set_data_seg((void*)(masked_fadd32 + 1), buffer, length,
                                 *lkey_p);
        wqe_size                      = sizeof(*ctrl) + sizeof(*raddr) +
                                        sizeof(*masked_fadd32) + sizeof(*dptr);
        break;

    default:
        ucs_fatal("invalid send opcode");
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
}
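All three examples round sizes up to a power-of-two boundary (the page size in #1 and #2, UCT_IB_MLX5_WQE_SEG_SIZE in #3) via ucs_align_up_pow2. The underlying bit trick works because a power-of-two alignment lets a mask replace a division; here is a self-contained sketch where align_up is a local stand-in for the UCS helper:

#include <assert.h>
#include <stddef.h>

/* Round 'n' up to the next multiple of 'align'; 'align' must be a
 * power of two, so ~(align - 1) clears the low bits in one mask. */
static size_t align_up(size_t n, size_t align)
{
    return (n + align - 1) & ~(align - 1);
}

int main(void)
{
    assert(align_up(2048 + 37, 4096) == 4096);  /* fits in one page */
    assert(align_up(4096,      4096) == 4096);  /* already aligned */
    assert(align_up(4097,      4096) == 8192);  /* spills to next page */
    return 0;
}

In the SEND case above this is what guarantees inl_seg_size is a whole number of 16-byte WQE segments, so the data segment that follows starts on a valid segment boundary.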