/**
 * allocate function
 */
void* mca_mpool_sm_alloc(
    mca_mpool_base_module_t* mpool,
    size_t size,
    size_t align,
    uint32_t flags,
    mca_mpool_base_registration_t** registration)
{
    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
    opal_hwloc_base_memory_segment_t mseg;

    mseg.mbs_start_addr =
        mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size, align, registration);

    if (mpool_sm->mem_node >= 0) {
        mseg.mbs_len = size;
#if OPAL_HAVE_HWLOC
        opal_hwloc_base_membind(&mseg, 1, mpool_sm->mem_node);
#endif
    }

#if OPAL_CUDA_SUPPORT
    if ((flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) && (NULL != mseg.mbs_start_addr)) {
        mca_common_cuda_register(mseg.mbs_start_addr, size,
                                 mpool->mpool_component->mpool_version.mca_component_name);
    }
#endif

    return mseg.mbs_start_addr;
}
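Callers normally reach this function through the mpool module's alloc function pointer rather than by name. The fragment below is a minimal usage sketch, assuming an already-created sm mpool module and a cache-line alignment request; the variable names and the specific size are illustrative and not taken from the excerpt above.

/* Usage sketch (assumption, not from the original source): ask the sm mpool
 * for a buffer and request CUDA registration of the returned memory. */
mca_mpool_base_registration_t *reg = NULL;
void *buf = sm_mpool->mpool_alloc(sm_mpool,                /* module from mca_mpool_base_module_create() */
                                  4096,                     /* requested size in bytes */
                                  opal_cache_line_size,     /* alignment */
                                  MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM,
                                  &reg);
if (NULL == buf) {
    /* allocation failed; propagate OPAL_ERR_OUT_OF_RESOURCE or fall back */
}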
static void* mca_common_ompio_cuda_alloc_seg(void *ctx, size_t *size)
{
    char *buf = NULL;
    size_t realsize, numpages;

    /* round the requested size up to a whole number of pages */
    numpages = (*size + mca_common_ompio_pagesize - 1) / mca_common_ompio_pagesize;
    realsize = numpages * mca_common_ompio_pagesize;

    buf = malloc(realsize);
    if (NULL != buf) {
        /* pin the buffer with the CUDA library so it can be used for
         * device transfers */
        mca_common_cuda_register((char *)buf, realsize, NULL);
    }
    *size = realsize;
    return buf;
}
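The allocator callback above only registers the page-rounded buffer; a matching release callback would be expected to undo that registration before freeing. The sketch below is an assumption of what such a counterpart could look like: the function name mca_common_ompio_cuda_free_seg and its signature are illustrative, and the use of mca_common_cuda_unregister as the inverse of mca_common_cuda_register is assumed rather than confirmed by the excerpt.

/* Hypothetical counterpart to the allocator above (assumption, not copied
 * from the original source): unregister the pinned buffer, then free it. */
static void mca_common_ompio_cuda_free_seg(void *ctx, void *buf)
{
    if (NULL != buf) {
        mca_common_cuda_unregister((char *)buf, NULL);
        free(buf);
    }
}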
static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
                                      int32_t my_smp_rank,
                                      int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_smcuda_component.mem_node = my_mem_node = 0;
    mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;

    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa = 0, w;
            unsigned n_bound = 0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*, but
               it's not how many are being used *by this job*.

               Note that this is the value we've previously used (from the
               previous carto-based implementation), but it really should be
               improved to be how many NUMA nodes are being used *in this
               job*. */
            mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are not bound, then there is nothing further to do */
            if (NULL != opal_process_info.cpuset) {
                /* count the number of NUMA nodes to which we are bound */
                for (w = 0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                                       HWLOC_OBJ_NODE, 0, w,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do */
                if (1 == n_bound) {
                    mca_btl_smcuda_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_smcuda_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_smcuda_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }

    /* now that res is fully populated, create the thing */
    mca_btl_smcuda_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_smcuda_component.sm_mpool_name,
                                     smcuda_btl, res);

    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_smcuda_component.sm_mpools[0]) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_smcuda_component.sm_mpool = mca_btl_smcuda_component.sm_mpools[0];

    mca_btl_smcuda_component.sm_mpool_base =
        mca_btl_smcuda_component.sm_mpools[0]->mpool_base(mca_btl_smcuda_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_smcuda_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_smcuda_component.sm_peers) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }

#if OPAL_CUDA_SUPPORT
    /* Register the entire shared memory region with the CUDA library which
     * will force it to be pinned.
     * This approach was chosen as there is no way for this local process to
     * know which parts of the memory are being utilized by a remote
     * process. */
    opal_output_verbose(10, opal_btl_base_framework.framework_output,
                        "btl:smcuda: CUDA cuMemHostRegister address=%p, size=%d",
                        mca_btl_smcuda_component.sm_mpool_base, (int)res->size);
    mca_common_cuda_register(mca_btl_smcuda_component.sm_mpool_base, res->size, "smcuda");

    /* Create a local memory pool that sends handles to the remote
     * side.  Note that the res argument is not really used, but
     * needed to satisfy function signature. */
    smcuda_btl->super.btl_mpool = mca_mpool_base_module_create("gpusm", smcuda_btl, res);
    if (NULL == smcuda_btl->super.btl_mpool) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
#endif /* OPAL_CUDA_SUPPORT */

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if (mca_btl_smcuda_component.sm_max_procs > 0 &&
        mca_btl_smcuda_component.num_smp_procs + n >
        mca_btl_smcuda_component.sm_max_procs) {
        return OPAL_ERROR;
    }

    mca_btl_smcuda_component.shm_fifo =
        (volatile sm_fifo_t **)mca_btl_smcuda_component.sm_seg->module_data_addr;
    mca_btl_smcuda_component.shm_bases =
        (char**)(mca_btl_smcuda_component.shm_fifo + n);
    mca_btl_smcuda_component.shm_mem_nodes =
        (uint16_t*)(mca_btl_smcuda_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank] =
        (char*)mca_btl_smcuda_component.sm_mpool_base;
    mca_btl_smcuda_component.shm_mem_nodes[mca_btl_smcuda_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if (NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
        return OPAL_ERR_OUT_OF_RESOURCE;

    mca_btl_smcuda_component.shm_fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array.
     * These addresses are valid in the current process space */
    mca_btl_smcuda_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
    if (NULL == mca_btl_smcuda_component.fifo)
        return OPAL_ERR_OUT_OF_RESOURCE;
    mca_btl_smcuda_component.fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;

    mca_btl_smcuda_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
    if (NULL == mca_btl_smcuda_component.mem_nodes)
        return OPAL_ERR_OUT_OF_RESOURCE;

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_smcuda_frag1_t);
    length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit;
    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_eager, length,
                            opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag1_t),
                            length_payload, opal_cache_line_size,
                            mca_btl_smcuda_component.sm_free_list_num,
                            mca_btl_smcuda_component.sm_free_list_max,
                            mca_btl_smcuda_component.sm_free_list_inc,
                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i)
        return i;

    length = sizeof(mca_btl_smcuda_frag2_t);
    length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.max_frag_size;
    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_max, length,
                            opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag2_t),
                            length_payload, opal_cache_line_size,
                            mca_btl_smcuda_component.sm_free_list_num,
                            mca_btl_smcuda_component.sm_free_list_max,
                            mca_btl_smcuda_component.sm_free_list_inc,
                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i)
        return i;

    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_user,
                            sizeof(mca_btl_smcuda_user_t),
                            opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
                            sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
                            mca_btl_smcuda_component.sm_free_list_num,
                            mca_btl_smcuda_component.sm_free_list_max,
                            mca_btl_smcuda_component.sm_free_list_inc,
                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i)
        return i;

    mca_btl_smcuda_component.num_outstanding_frags = 0;

    mca_btl_smcuda_component.num_pending_sends = 0;
    i = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
                            sizeof(btl_smcuda_pending_send_item_t), 8,
                            OBJ_CLASS(opal_free_list_item_t),
                            0, 0, 16, -1, 32, NULL, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i)
        return i;

    /* set flag indicating btl has been inited */
    smcuda_btl->btl_inited = true;

    return OPAL_SUCCESS;
}
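The function ends by setting btl_inited, which suggests the setup is meant to run at most once per process and to be guarded by its caller. The fragment below is a minimal sketch of such a guard, assuming a caller along the lines of the BTL's add-procs path; the surrounding variable names (rc, n_local_procs) are illustrative and not taken from the excerpt.

/* Illustrative guard (assumption, not from the original source): defer the
 * one-time initialization until it is actually needed and skip it on
 * subsequent calls once btl_inited is set. */
if (!smcuda_btl->btl_inited) {
    rc = smcuda_btl_first_time_init(smcuda_btl, my_smp_rank, n_local_procs);
    if (OPAL_SUCCESS != rc) {
        return rc;  /* one-time setup failed */
    }
}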