Example #1
/**
 * Allocate memory from the shared-memory pool: carve the region out of
 * the sm allocator, optionally bind it to the local NUMA node, and
 * register it with CUDA when the caller requests it.
 */
void* mca_mpool_sm_alloc(
    mca_mpool_base_module_t* mpool,
    size_t size,
    size_t align,
    uint32_t flags,
    mca_mpool_base_registration_t** registration)
{
    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
    opal_hwloc_base_memory_segment_t mseg;

    mseg.mbs_start_addr =
        mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size, align, registration);

    if(mpool_sm->mem_node >= 0) {
        mseg.mbs_len = size;
#if OPAL_HAVE_HWLOC
        opal_hwloc_base_membind(&mseg, 1, mpool_sm->mem_node);
#endif
    }

#if OPAL_CUDA_SUPPORT
    if ((flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) && (NULL != mseg.mbs_start_addr)) {
        mca_common_cuda_register(mseg.mbs_start_addr, size,
                                 mpool->mpool_component->mpool_version.mca_component_name);
    }
#endif

    return mseg.mbs_start_addr;
}
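
For context, a caller reaches this allocator through the module's function-pointer table rather than by name. A minimal sketch of such a call, assuming the standard mpool_alloc entry point of mca_mpool_base_module_t (the size and alignment values here are illustrative, and error handling is elided):

/* Illustrative caller: allocate 4 KiB from the shared-memory pool and
 * request CUDA registration; dispatches to mca_mpool_sm_alloc() above. */
mca_mpool_base_registration_t *reg = NULL;
void *addr = mpool->mpool_alloc(mpool, 4096, 8,
                                MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM, &reg);
if (NULL == addr) {
    /* allocation failed: fall back to unregistered memory or bail out */
}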
Example #2
static void* mca_common_ompio_cuda_alloc_seg (void *ctx, size_t *size)
{
    char *buf = NULL;
    size_t realsize, numpages;

    /* round the requested size up to a whole number of pages */
    numpages = (*size + mca_common_ompio_pagesize - 1) / mca_common_ompio_pagesize;
    realsize = numpages * mca_common_ompio_pagesize;

    buf = malloc (realsize);
    if (NULL != buf) {
        /* pin the buffer with the CUDA library so that device
         * transfers to and from it can take the fast DMA path */
        mca_common_cuda_register ((char *)buf, realsize, NULL);
    }
    /* report the actual (page-aligned) size back to the caller */
    *size = realsize;
    return buf;
}
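
A segment obtained this way needs a symmetric teardown: the pinned range should be unregistered before it is freed. A minimal companion sketch, assuming mca_common_cuda_unregister() as the counterpart of mca_common_cuda_register() (the free_seg function name is illustrative):

static void mca_common_ompio_cuda_free_seg (void *ctx, void *buf)
{
    if (NULL != buf) {
        /* unpin the range before handing it back to the heap */
        mca_common_cuda_unregister (buf, NULL);
        free (buf);
    }
}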
Example #3
static int
smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
                           int32_t my_smp_rank,
                           int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_smcuda_component.mem_node = my_mem_node = 0;
    mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;

    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*,
               but it's not how many are being used *by this job*.
               Note that this is the value we've previously used (from
               the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
            mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are not bound, then there is nothing further to do */
            if (NULL != opal_process_info.cpuset) {
                /* count the number of NUMA nodes to which we are bound */
                for (w=0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                                       HWLOC_OBJ_NODE, 0, w,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do
                 */
                if (1 == n_bound) {
                    mca_btl_smcuda_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_smcuda_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_smcuda_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }
    /* now that res is fully populated, create the thing */
    mca_btl_smcuda_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_smcuda_component.sm_mpool_name,
                                     smcuda_btl, res);
    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_smcuda_component.sm_mpools[0]) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_smcuda_component.sm_mpool = mca_btl_smcuda_component.sm_mpools[0];

    mca_btl_smcuda_component.sm_mpool_base =
        mca_btl_smcuda_component.sm_mpools[0]->mpool_base(mca_btl_smcuda_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_smcuda_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_smcuda_component.sm_peers) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }
#if OPAL_CUDA_SUPPORT
    /* Register the entire shared memory region with the CUDA library which will
     * force it to be pinned.  This approach was chosen as there is no way for this
     * local process to know which parts of the memory are being utilized by a
     * remote process. */
    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl:smcuda: CUDA cuMemHostRegister address=%p, size=%d",
                        mca_btl_smcuda_component.sm_mpool_base, (int)res->size);
    mca_common_cuda_register(mca_btl_smcuda_component.sm_mpool_base, res->size, "smcuda");

    /* Create a local memory pool that sends handles to the remote
     * side.  Note that the res argument is not really used, but
     * needed to satisfy function signature. */
    smcuda_btl->super.btl_mpool = mca_mpool_base_module_create("gpusm",
                                                               smcuda_btl,
                                                               res);
    if (NULL == smcuda_btl->super.btl_mpool) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
#endif /* OPAL_CUDA_SUPPORT */

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if(mca_btl_smcuda_component.sm_max_procs > 0 &&
       mca_btl_smcuda_component.num_smp_procs + n >
       mca_btl_smcuda_component.sm_max_procs) {
        return OPAL_ERROR;
    }

    mca_btl_smcuda_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_smcuda_component.sm_seg->module_data_addr;
    mca_btl_smcuda_component.shm_bases = (char**)(mca_btl_smcuda_component.shm_fifo + n);
    mca_btl_smcuda_component.shm_mem_nodes = (uint16_t*)(mca_btl_smcuda_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank] =
        (char*)mca_btl_smcuda_component.sm_mpool_base;
    mca_btl_smcuda_component.shm_mem_nodes[mca_btl_smcuda_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if(NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
        return OPAL_ERR_OUT_OF_RESOURCE;

    mca_btl_smcuda_component.shm_fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array.  These addresses
     * are valid in the current process space */
    mca_btl_smcuda_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);

    if(NULL == mca_btl_smcuda_component.fifo)
        return OPAL_ERR_OUT_OF_RESOURCE;

    mca_btl_smcuda_component.fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;

    mca_btl_smcuda_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
    if(NULL == mca_btl_smcuda_component.mem_nodes)
        return OPAL_ERR_OUT_OF_RESOURCE;

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_smcuda_frag1_t);
    length_payload =
        sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit;
    i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_eager, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag1_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_smcuda_component.sm_free_list_num,
                             mca_btl_smcuda_component.sm_free_list_max,
                             mca_btl_smcuda_component.sm_free_list_inc,
                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    length = sizeof(mca_btl_smcuda_frag2_t);
    length_payload =
        sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.max_frag_size;
    i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_max, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag2_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_smcuda_component.sm_free_list_num,
                             mca_btl_smcuda_component.sm_free_list_max,
                             mca_btl_smcuda_component.sm_free_list_inc,
                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_user,
                             sizeof(mca_btl_smcuda_user_t),
                             opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
                             sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
                             mca_btl_smcuda_component.sm_free_list_num,
                             mca_btl_smcuda_component.sm_free_list_max,
                             mca_btl_smcuda_component.sm_free_list_inc,
                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    mca_btl_smcuda_component.num_outstanding_frags = 0;

    mca_btl_smcuda_component.num_pending_sends = 0;
    i = opal_free_list_init (&mca_btl_smcuda_component.pending_send_fl,
                             sizeof(btl_smcuda_pending_send_item_t), 8,
                             OBJ_CLASS(opal_free_list_item_t),
                             0, 0, 16, -1, 32, NULL, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    /* set flag indicating btl has been inited */
    smcuda_btl->btl_inited = true;

    return OPAL_SUCCESS;
}