/* ////////////////////////////////////////////////////////////////////////// */ static int verbs_runtime_query(mca_base_module_t **module, int *priority, const char *hint) { int rc = OSHMEM_SUCCESS; openib_device_t my_device; openib_device_t *device = &my_device; int num_devs = 0; int i = 0; *priority = 0; *module = NULL; memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { return OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (NULL != mca_sshmem_verbs_component.hca_name) { for (i = 0; i < num_devs; i++) { if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) { device->ib_dev = device->ib_devs[i]; break; } } } else { device->ib_dev = device->ib_devs[0]; } if (NULL == device->ib_dev) { rc = OSHMEM_ERR_NOT_FOUND; goto out; } if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Obtain device attributes */ if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate the protection domain for the device */ device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate memory */ if (!rc) { void *addr = NULL; size_t size = getpagesize(); struct ibv_mr *ib_mr = NULL; uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; uint64_t exp_access_flag = 0; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR | IBV_EXP_ACCESS_SHARED_MR_USER_READ | IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ struct ibv_exp_reg_mr_in in = 
{device->ib_pd, addr, size, access_flag|exp_access_flag, 0}; ib_mr = ibv_exp_reg_mr(&in); if (NULL == ib_mr) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) if (!rc) { struct ibv_exp_reg_shared_mr_in in_smr; access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_EXP_ACCESS_NO_RDMA; addr = (void *)mca_sshmem_base_start_address; mca_sshmem_verbs_fill_shared_mr(&in_smr, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag); ib_mr = ibv_exp_reg_shared_mr(&in_smr); if (NULL == ib_mr) { mca_sshmem_verbs_component.has_shared_mr = 0; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); mca_sshmem_verbs_component.has_shared_mr = 1; } } #endif /* MPAGE_ENABLE */ } /* all is well - rainbows and butterflies */ if (!rc) { *priority = mca_sshmem_verbs_component.priority; *module = (mca_base_module_t *)&mca_sshmem_verbs_module.super; } out: if (device) { if (opal_value_array_get_size(&device->ib_mr_array)) { struct ibv_mr** array; struct ibv_mr* ib_mr = NULL; array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *); while (opal_value_array_get_size(&device->ib_mr_array) > 0) { ib_mr = array[0]; ibv_dereg_mr(ib_mr); opal_value_array_remove_item(&device->ib_mr_array, 0); } if (device->ib_mr_shared) { device->ib_mr_shared = NULL; } OBJ_DESTRUCT(&device->ib_mr_array); } if (device->ib_pd) { ibv_dealloc_pd(device->ib_pd); device->ib_pd = NULL; } if(device->ib_dev_context) { ibv_close_device(device->ib_dev_context); device->ib_dev_context = NULL; } if(device->ib_devs) { ibv_free_device_list(device->ib_devs); device->ib_devs = NULL; } } return rc; }
}

/*
 * Obtain memory from the IB protection domain behind uct_pd.
 *
 * The request is made through ibv_exp_reg_mr() with a NULL address and the
 * IBV_EXP_ACCESS_ALLOCATE_MR flag added to UCT_IB_MEM_ACCESS_FLAGS; the
 * requested length is *length_p after ucs_memtrack_adjust_alloc_size().
 *
 * On success the address and length reported by the resulting memory region
 * are written to *address_p / *length_p, passed to ucs_memtrack_allocated(),
 * and the region itself is returned through *memh_p.
 *
 * Returns UCS_OK on success, or UCS_ERR_IO_ERROR (after logging) when the
 * registration call returns NULL.
 */
static ucs_status_t uct_ib_mem_alloc(uct_pd_h uct_pd, size_t *length_p,
                                     void **address_p,
                                     uct_mem_h *memh_p UCS_MEMTRACK_ARG)
{
    uct_ib_pd_t *ib_pd = ucs_derived_of(uct_pd, uct_ib_pd_t);
    struct ibv_exp_reg_mr_in reg_args = {
        ib_pd->pd,
        NULL,
        ucs_memtrack_adjust_alloc_size(*length_p),
        UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR,
        0
    };
    struct ibv_mr *region = ibv_exp_reg_mr(&reg_args);

    if (region != NULL) {
        UCS_STATS_UPDATE_COUNTER(ib_pd->stats, UCT_IB_PD_STAT_MEM_ALLOC, +1);

        /* Report what was actually obtained, not what was asked for */
        *address_p = region->addr;
        *length_p = region->length;
        ucs_memtrack_allocated(address_p, length_p UCS_MEMTRACK_VAL);

        *memh_p = region;
        return UCS_OK;
    }

    ucs_error("ibv_exp_reg_mr(in={NULL, length=%Zu, flags=0x%lx}) failed: %m",
              ucs_memtrack_adjust_alloc_size(*length_p),
              (unsigned long)(UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR));
    return UCS_ERR_IO_ERROR;
}
/* ////////////////////////////////////////////////////////////////////////// */ static int segment_create(map_segment_t *ds_buf, const char *file_name, size_t size) { int rc = OSHMEM_SUCCESS; openib_device_t *device = &memheap_device; int num_devs = 0; int i = 0; assert(ds_buf); /* init the contents of map_segment_t */ shmem_ds_reset(ds_buf); memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { return OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (NULL != mca_sshmem_verbs_component.hca_name) { for (i = 0; i < num_devs; i++) { if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) { device->ib_dev = device->ib_devs[i]; break; } } } else { device->ib_dev = device->ib_devs[0]; } if (NULL == device->ib_dev) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error getting device says %d: %s", errno, strerror(errno)) ); return OSHMEM_ERR_NOT_FOUND; } if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error obtaining device context for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Obtain device attributes */ if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error obtaining device attributes for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Allocate the protection domain for the device */ device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error allocating protection domain for %s errno 
says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Allocate memory */ if (!rc) { void *addr = NULL; struct ibv_mr *ib_mr = NULL; uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; uint64_t exp_access_flag = 0; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if (MPAGE_ENABLE > 0) exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR | IBV_EXP_ACCESS_SHARED_MR_USER_READ | IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0}; #if MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS if (0 == mca_sshmem_verbs_component.has_shared_mr) { in.addr = (void *)mca_sshmem_base_start_address; in.comp_mask = IBV_EXP_REG_MR_CREATE_FLAGS; in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG; in.exp_access = access_flag; } #endif ib_mr = ibv_exp_reg_mr(&in); if (NULL == ib_mr) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error to ibv_exp_reg_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)) ); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if (MPAGE_ENABLE > 0) if (!rc && mca_sshmem_verbs_component.has_shared_mr) { void *addr = NULL; access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_EXP_ACCESS_NO_RDMA; addr = (void *)mca_sshmem_base_start_address; struct ibv_exp_reg_shared_mr_in in; mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag); ib_mr = ibv_exp_reg_shared_mr(&in); if (NULL == ib_mr) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d", (unsigned long long)size, errno, 
strerror(errno), mca_sshmem_verbs_component.has_shared_mr ) ); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } } #endif /* MPAGE_ENABLE */ if (!rc) { OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, "ibv device %s shared_mr: %d", ibv_get_device_name(device->ib_dev), mca_sshmem_verbs_component.has_shared_mr) ); if (mca_sshmem_verbs_component.has_shared_mr) { assert(size == device->ib_mr_shared->length); ds_buf->type = MAP_SEGMENT_ALLOC_IBV; ds_buf->seg_id = device->ib_mr_shared->handle; } else { ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR; ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; } ds_buf->super.va_base = ib_mr->addr; ds_buf->seg_size = size; ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); } } OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, "%s: %s: create %s " "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, (rc ? "failure" : "successful"), ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; }
} static ucs_status_t uct_ib_mem_alloc_internal(uct_md_h uct_md, size_t *length_p, void **address_p, uct_ib_mem_t *memh UCS_MEMTRACK_ARG) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); struct ibv_exp_reg_mr_in in = { md->pd, NULL, ucs_memtrack_adjust_alloc_size(*length_p), UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR, 0 }; memh->mr = ibv_exp_reg_mr(&in); if (memh->mr == NULL) { ucs_error("ibv_exp_reg_mr(in={NULL, length=%Zu, flags=0x%lx}) failed: %m", ucs_memtrack_adjust_alloc_size(*length_p), (unsigned long)(UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR)); return UCS_ERR_IO_ERROR; } ucs_trace("allocated memory %p..%p on %s lkey 0x%x rkey 0x%x", memh->mr->addr, memh->mr->addr + memh->mr->length, uct_ib_device_name(&md->dev), memh->mr->lkey, memh->mr->rkey); memh->lkey = memh->mr->lkey; memh->umr = uct_ib_md_create_umr(md, memh->mr); #if HAVE_EXP_UMR if (memh->umr == NULL && md->umr_qp) {