/**
 * opal_carto_base_get_nodes_distance - returns the distance of
 * all the nodes from the reference node.
 *
 * Runs opal_graph_dijkstra() from the reference node's vertex into a
 * temporary value array, then appends one opal_carto_node_distance_t
 * entry to dist_array for every node whose type matches node_type.
 *
 * @param graph          the graph to search
 * @param reference_node the node the distances are measured from
 * @param node_type      the type of the nodes in the returned array;
 *                       NULL selects nodes of every type
 * @param dist_array     value array that receives the matching
 *                       (node, distance) entries. NOTE(review): presumably
 *                       already initialised by the caller for
 *                       opal_carto_node_distance_t items - confirm.
 *
 * @return int number of entries appended to dist_array (0 if the
 *         temporary array could not be allocated).
 */
int opal_carto_base_get_nodes_distance_fn(opal_carto_graph_t *graph,
                                          opal_carto_base_node_t *reference_node,
                                          const char *node_type,
                                          opal_value_array_t *dist_array)
{
    opal_value_array_t *distance_array;
    vertex_distance_from_t *vertex_distance;
    opal_carto_base_node_t *node;
    uint32_t i, graph_order;
    int distance_array_size = 0;
    opal_carto_node_distance_t node_distance;

    /* scratch array that receives the raw dijkstra output */
    distance_array = OBJ_NEW(opal_value_array_t);
    if (NULL == distance_array) {
        return 0;
    }
    opal_value_array_init(distance_array, sizeof(vertex_distance_from_t));
    opal_value_array_reserve(distance_array, 50);

    /* use dijkstra algorithm to receive the distance of all the nodes
     * from the referenced node */
    graph_order = opal_graph_dijkstra(graph, reference_node->vertex, distance_array);

    /* for all the nodes in the dijkstra array */
    for (i = 0; i < graph_order; i++) {
        vertex_distance = opal_value_array_get_item(distance_array, i);
        node = vertex_distance->vertex->vertex_data;
        /* check if the node is in the correct type */
        if (NULL == node_type || 0 == strcmp(node->node_type, node_type)) {
            /* copy the entry into the result distance array */
            node_distance.node = node;
            node_distance.node_distance = vertex_distance->weight;
            opal_value_array_append_item(dist_array, (void *)&node_distance);
            /* count it, so the return value matches what was appended */
            distance_array_size++;
        }
    }

    /* the scratch array is private to this call - drop our reference */
    OBJ_RELEASE(distance_array);

    /* return the number of entries placed in the result array */
    return distance_array_size;
}
/**
 * Collect every vertex adjacent to a given vertex, together with the
 * weight of the connecting edge.
 *
 * One vertex_distance_from_t entry (edge end vertex + edge weight) is
 * appended to the caller's value array for each edge found in the
 * adjacency list that the vertex belongs to.
 *
 * @param graph     The graph the vertex is expected to belong to.
 * @param vertex    The reference vertex.
 * @param adjacents Value array that receives the adjacent vertices and
 *                  their distance from the reference vertex.
 *
 * @return int the number of edges in the vertex's adjacency list, or 0
 *         when the vertex is not part of the graph.
 */
int opal_graph_get_adjacent_vertices(opal_graph_t *graph,
                                     opal_graph_vertex_t *vertex,
                                     opal_value_array_t *adjacents)
{
    opal_adjacency_list_t *owner_list;
    opal_list_item_t *cursor;
    vertex_distance_from_t entry;
    int edge_count;

    /* Refuse vertices that live in a different graph. */
    if (graph != vertex->in_graph) {
        OPAL_OUTPUT((0,"Vertex %p not in the graph %p\n", (void *)vertex, (void *)graph));
        return 0;
    }

    /* The adjacency list owning this vertex holds its outgoing edges. */
    owner_list = (opal_adjacency_list_t *) vertex->in_adj_list;
    edge_count = opal_list_get_size(owner_list->edges);

    /* Walk the edge list and publish (end vertex, weight) per edge. */
    cursor = opal_list_get_first(owner_list->edges);
    while (cursor != opal_list_get_end(owner_list->edges)) {
        opal_graph_edge_t *current_edge = (opal_graph_edge_t *)cursor;

        entry.vertex = current_edge->end;
        entry.weight = current_edge->weight;
        opal_value_array_append_item(adjacents, &entry);

        cursor = opal_list_get_next(cursor);
    }

    return edge_count;
}
/*
 * Open a new sequence in a node regex record.
 *
 * Appends one element to each of the per-sequence value arrays of
 * ndreg: the suffix character, the node number, a count of zero, and
 * the starting vpid, ppn and node rank that compute_vpids() reports
 * for this node/job.
 */
static void start_sequence(orte_jobid_t jobid, orte_node_t *node,
                           orte_regex_node_t *ndreg, char suffix,
                           int32_t nodenum)
{
    int32_t initial_cnt = 0;
    int32_t procs_per_node;
    orte_vpid_t first_vpid, last_vpid;
    orte_node_rank_t node_rank;

    /* identity of the sequence, with its count starting at zero */
    opal_value_array_append_item(&ndreg->suffix, &suffix);
    opal_value_array_append_item(&ndreg->nodes, &nodenum);
    opal_value_array_append_item(&ndreg->cnt, &initial_cnt);

    /* last_vpid is required by the helper's signature but not recorded */
    compute_vpids(node, jobid, &first_vpid, &last_vpid, &procs_per_node, &node_rank);

    opal_value_array_append_item(&ndreg->starting_vpid, &first_vpid);
    opal_value_array_append_item(&ndreg->ppn, &procs_per_node);
    opal_value_array_append_item(&ndreg->nrank, &node_rank);
}
/* ////////////////////////////////////////////////////////////////////////// */ static int verbs_runtime_query(mca_base_module_t **module, int *priority, const char *hint) { int rc = OSHMEM_SUCCESS; openib_device_t my_device; openib_device_t *device = &my_device; int num_devs = 0; int i = 0; *priority = 0; *module = NULL; memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { return OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (NULL != mca_sshmem_verbs_component.hca_name) { for (i = 0; i < num_devs; i++) { if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) { device->ib_dev = device->ib_devs[i]; break; } } } else { device->ib_dev = device->ib_devs[0]; } if (NULL == device->ib_dev) { rc = OSHMEM_ERR_NOT_FOUND; goto out; } if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Obtain device attributes */ if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate the protection domain for the device */ device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate memory */ if (!rc) { void *addr = NULL; size_t size = getpagesize(); struct ibv_mr *ib_mr = NULL; uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; uint64_t exp_access_flag = 0; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR | IBV_EXP_ACCESS_SHARED_MR_USER_READ | IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ struct ibv_exp_reg_mr_in in = 
{device->ib_pd, addr, size, access_flag|exp_access_flag, 0}; ib_mr = ibv_exp_reg_mr(&in); if (NULL == ib_mr) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) if (!rc) { struct ibv_exp_reg_shared_mr_in in_smr; access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_EXP_ACCESS_NO_RDMA; addr = (void *)mca_sshmem_base_start_address; mca_sshmem_verbs_fill_shared_mr(&in_smr, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag); ib_mr = ibv_exp_reg_shared_mr(&in_smr); if (NULL == ib_mr) { mca_sshmem_verbs_component.has_shared_mr = 0; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); mca_sshmem_verbs_component.has_shared_mr = 1; } } #endif /* MPAGE_ENABLE */ } /* all is well - rainbows and butterflies */ if (!rc) { *priority = mca_sshmem_verbs_component.priority; *module = (mca_base_module_t *)&mca_sshmem_verbs_module.super; } out: if (device) { if (opal_value_array_get_size(&device->ib_mr_array)) { struct ibv_mr** array; struct ibv_mr* ib_mr = NULL; array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *); while (opal_value_array_get_size(&device->ib_mr_array) > 0) { ib_mr = array[0]; ibv_dereg_mr(ib_mr); opal_value_array_remove_item(&device->ib_mr_array, 0); } if (device->ib_mr_shared) { device->ib_mr_shared = NULL; } OBJ_DESTRUCT(&device->ib_mr_array); } if (device->ib_pd) { ibv_dealloc_pd(device->ib_pd); device->ib_pd = NULL; } if(device->ib_dev_context) { ibv_close_device(device->ib_dev_context); device->ib_dev_context = NULL; } if(device->ib_devs) { ibv_free_device_list(device->ib_devs); device->ib_devs = NULL; } } return rc; }
/**
 * segment_attach can only be called after a successful call to segment_create
 *
 * Maps a remote segment, identified by the shared-MR handle stored in
 * mkey->u.key, into this process by registering a shared MR on the
 * file-scope device's protection domain.  Each call requests a distinct
 * address derived from mca_sshmem_base_start_address, the component's
 * mr_interleave_factor and a per-process call counter.
 *
 * @param ds_buf segment descriptor; only read here, for log output
 * @param mkey   key to attach; va_base must be 0 on entry and is set to
 *               the mapped address, or to (void *)-1 when registration fails
 *
 * @return mkey->va_base after the attempt (unchanged when the key equals
 *         MAP_SEGMENT_SHM_INVALID).
 */
static void *
segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
{
    static int mr_count = 0;
    openib_device_t *device = &memheap_device;
    void *wanted_addr = NULL;
    struct ibv_mr *shared_mr = NULL;
    struct ibv_exp_reg_shared_mr_in reg_args;
    uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                           IBV_ACCESS_REMOTE_WRITE |
                           IBV_ACCESS_REMOTE_READ |
                           IBV_EXP_ACCESS_NO_RDMA;

    assert(ds_buf);
    assert(mkey->va_base == 0);

    /* nothing to map for keys that carry no shared-MR handle */
    if (MAP_SEGMENT_SHM_INVALID == (int)(mkey->u.key)) {
        return (mkey->va_base);
    }

    /* workaround mtt problem - request aligned addresses */
    ++mr_count;
    wanted_addr = (void *)((uintptr_t)mca_sshmem_base_start_address +
        mca_sshmem_verbs_component.mr_interleave_factor * 1024ULL * 1024ULL * 1024ULL * mr_count);

    mca_sshmem_verbs_fill_shared_mr(&reg_args, device->ib_pd, mkey->u.key, wanted_addr, access_flag);
    shared_mr = ibv_exp_reg_shared_mr(&reg_args);

    if (NULL == shared_mr) {
        /* flag the failure for the caller through the key itself */
        mkey->va_base = (void *)-1;
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
            (unsigned long long)ds_buf->seg_size, errno, strerror(errno))
            );
    } else {
        /* the mapping may land somewhere else than requested; report it
         * but keep going with the address actually obtained */
        if (shared_mr->addr != wanted_addr) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                "Failed to map shared region to address %p got addr %p. "
                "Try to increase 'memheap_mr_interleave_factor' from %d",
                wanted_addr, shared_mr->addr,
                mca_sshmem_verbs_component.mr_interleave_factor)
                );
        }

        /* remember the MR in the device's array */
        opal_value_array_append_item(&device->ib_mr_array, &shared_mr);
        mkey->va_base = shared_mr->addr;
    }

    /* NOTE(review): this message is emitted on the failure path too */
    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
        "%s: %s: attach successful "
            "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n",
            mca_sshmem_verbs_component.super.base_version.mca_type_name,
            mca_sshmem_verbs_component.super.base_version.mca_component_name,
            ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name,
            mkey->va_base, mkey->len, (unsigned long long)mkey->u.key)
        );

    /* hand back whatever base address ended up in the key */
    return (mkey->va_base);
}
/* ////////////////////////////////////////////////////////////////////////// */ static int segment_create(map_segment_t *ds_buf, const char *file_name, size_t size) { int rc = OSHMEM_SUCCESS; openib_device_t *device = &memheap_device; int num_devs = 0; int i = 0; assert(ds_buf); /* init the contents of map_segment_t */ shmem_ds_reset(ds_buf); memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { return OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (NULL != mca_sshmem_verbs_component.hca_name) { for (i = 0; i < num_devs; i++) { if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) { device->ib_dev = device->ib_devs[i]; break; } } } else { device->ib_dev = device->ib_devs[0]; } if (NULL == device->ib_dev) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error getting device says %d: %s", errno, strerror(errno)) ); return OSHMEM_ERR_NOT_FOUND; } if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error obtaining device context for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Obtain device attributes */ if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error obtaining device attributes for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Allocate the protection domain for the device */ device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error allocating protection domain for %s errno 
says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)) ); return OSHMEM_ERR_RESOURCE_BUSY; } /* Allocate memory */ if (!rc) { void *addr = NULL; struct ibv_mr *ib_mr = NULL; uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; uint64_t exp_access_flag = 0; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if (MPAGE_ENABLE > 0) exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR | IBV_EXP_ACCESS_SHARED_MR_USER_READ | IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0}; #if MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS if (0 == mca_sshmem_verbs_component.has_shared_mr) { in.addr = (void *)mca_sshmem_base_start_address; in.comp_mask = IBV_EXP_REG_MR_CREATE_FLAGS; in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG; in.exp_access = access_flag; } #endif ib_mr = ibv_exp_reg_mr(&in); if (NULL == ib_mr) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error to ibv_exp_reg_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)) ); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if (MPAGE_ENABLE > 0) if (!rc && mca_sshmem_verbs_component.has_shared_mr) { void *addr = NULL; access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_EXP_ACCESS_NO_RDMA; addr = (void *)mca_sshmem_base_start_address; struct ibv_exp_reg_shared_mr_in in; mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag); ib_mr = ibv_exp_reg_shared_mr(&in); if (NULL == ib_mr) { OPAL_OUTPUT_VERBOSE( (5, oshmem_sshmem_base_framework.framework_output, "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d", (unsigned long long)size, errno, 
strerror(errno), mca_sshmem_verbs_component.has_shared_mr ) ); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } } #endif /* MPAGE_ENABLE */ if (!rc) { OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, "ibv device %s shared_mr: %d", ibv_get_device_name(device->ib_dev), mca_sshmem_verbs_component.has_shared_mr) ); if (mca_sshmem_verbs_component.has_shared_mr) { assert(size == device->ib_mr_shared->length); ds_buf->type = MAP_SEGMENT_ALLOC_IBV; ds_buf->seg_id = device->ib_mr_shared->handle; } else { ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR; ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; } ds_buf->super.va_base = ib_mr->addr; ds_buf->seg_size = size; ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); } } OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, "%s: %s: create %s " "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, (rc ? "failure" : "successful"), ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; }
/**
 * This graph API returns the distance (weight) from a reference
 * vertex to all other vertices in the graph using the Dijkstra
 * algorithm.
 *
 * A working queue with one entry per adjacency list of the graph is
 * built, the reference vertex gets weight 0 and every other vertex
 * DISTANCE_INFINITY.  The queue is then repeatedly sorted with
 * compare_vertex_distance, its head is removed, and the weights of
 * the remaining entries are relaxed through the removed head.
 *
 * @param graph
 * @param vertex The reference vertex.
 * @param distance_array An array of vertices and
 *                       their distance from the reference vertex.
 *                       Entries are appended to it; the first entry of
 *                       the final queue (presumably the reference
 *                       vertex itself, at distance 0) is not copied.
 *
 * @return uint32_t the number of entries appended to the distance
 *         array (graph order - 1), or 0 when the vertex does not
 *         belong to the graph or the working queue cannot be allocated.
 */
uint32_t opal_graph_dijkstra(opal_graph_t *graph, opal_graph_vertex_t *vertex, opal_value_array_t *distance_array)
{
    int graph_order;
    vertex_distance_from_t *Q, *q_start, *current_vertex;
    opal_list_item_t *adj_list_item;
    opal_adjacency_list_t *adj_list;
    int number_of_items_in_q;
    int i;
    uint32_t weight;

    /**
     * Verify that the reference vertex belongs to the graph.
     */
    if (graph != vertex->in_graph) {
        OPAL_OUTPUT((0,"opal:graph:dijkstra: vertex %p not in the graph %p\n",(void *)vertex,(void *)graph));
        return 0;
    }

    /* get the order of the graph and allocate a working queue accordingly */
    graph_order = opal_graph_get_order(graph);
    Q = (vertex_distance_from_t *)malloc(graph_order * sizeof(vertex_distance_from_t));
    if (NULL == Q) {
        /* without a working queue nothing can be computed; report "no
         * distances" the same way the membership check above does */
        OPAL_OUTPUT((0,"opal:graph:dijkstra: unable to allocate a working queue for %d vertices\n", graph_order));
        return 0;
    }

    /* assign a pointer to the start of the queue */
    q_start = Q;

    /* run on all the vertices of the graph */
    for (adj_list_item = opal_list_get_first(graph->adjacency_list), i = 0;
         adj_list_item != opal_list_get_end(graph->adjacency_list);
         adj_list_item = opal_list_get_next(adj_list_item), i++) {
        adj_list = (opal_adjacency_list_t *)adj_list_item;
        /* insert the vertices pointers into the working queue */
        Q[i].vertex = adj_list->vertex;
        /**
         * assign an infinity distance to all the vertices in the queue
         * except the reference vertex which its distance should be 0.
         */
        if (Q[i].vertex == vertex) {
            Q[i].weight = 0;
        } else {
            Q[i].weight = DISTANCE_INFINITY;
        }
    }
    number_of_items_in_q = i;

    /* sort the working queue according the distance from the reference vertex */
    qsort(q_start, number_of_items_in_q, sizeof(vertex_distance_from_t), compare_vertex_distance);

    /* while the working queue is not empty */
    while (number_of_items_in_q > 0) {
        /* start to work with the first vertex in the working queue */
        current_vertex = q_start;
        /* remove the first vertex from the queue */
        q_start++;
        /* decrease the number of vertices in the queue */
        number_of_items_in_q--;

        /* find the distance of all other vertices in the queue from the
         * first vertex in the queue */
        for (i = 0; i < number_of_items_in_q; i++) {
            weight = opal_graph_adjacent(graph, current_vertex->vertex, q_start[i].vertex);
            /**
             * if the distance from the first vertex in the queue to the I
             * vertex in the queue plus the distance of the first vertex in
             * the queue from the referenced vertex is smaller than the
             * distance of the I vertex from the referenced vertex, assign
             * the lower distance to the I vertex.
             */
            if (current_vertex->weight + weight < q_start[i].weight) {
                q_start[i].weight = weight + current_vertex->weight;
            }
        }

        /* sort again the working queue */
        qsort(q_start, number_of_items_in_q, sizeof(vertex_distance_from_t), compare_vertex_distance);
    }

    /* copy the working queue to the returned distance array, skipping
     * its first entry */
    for (i = 0; i < graph_order-1; i++) {
        opal_value_array_append_item(distance_array, (void *)&(Q[i+1]));
    }

    /* free the working queue */
    free(Q);

    /* report the distance array size. */
    return graph_order - 1;
}
static int _ibv_attach(map_segment_t *s, size_t size) { int rc = OSHMEM_SUCCESS; static openib_device_t memheap_device; openib_device_t *device = &memheap_device; int num_devs = 0; assert(s); memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { rc = OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (!rc) { int i = 0; if (num_devs > 1) { if (NULL == mca_memheap_base_param_hca_name) { MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs); } else { MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name); } } for (i = 0; i < num_devs; i++) { device->ib_dev = device->ib_devs[i]; device->ib_dev_context = ibv_open_device(device->ib_dev); if (NULL == device->ib_dev_context) { MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } else { if (NULL != mca_memheap_base_param_hca_name) { if (0 == strcmp(mca_memheap_base_param_hca_name,ibv_get_device_name(device->ib_dev))) { MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs); rc = OSHMEM_SUCCESS; break; } } else { MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs); rc = OSHMEM_SUCCESS; break; } } } } /* Obtain device attributes */ if (!rc) { if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } else { MEMHEAP_VERBOSE(5, "ibv device %s", ibv_get_device_name(device->ib_dev)); } } 
/* Allocate the protection domain for the device */ if (!rc) { device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } } /* Allocate memory */ if (!rc) { void *addr = NULL; struct ibv_mr *ib_mr = NULL; int access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) access_flag |= IBV_ACCESS_ALLOCATE_MR | IBV_ACCESS_SHARED_MR_USER_READ | IBV_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag); if (NULL == ib_mr) { MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) if (!rc) { access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_ACCESS_NO_RDMA; addr = (void *)mca_memheap_base_start_address; ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle, device->ib_pd, addr, access_flag); if (NULL == ib_mr) { MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } } #endif /* MPAGE_ENABLE */ if (!rc) { assert(size == device->ib_mr_shared->length); s->type = MAP_SEGMENT_ALLOC_IBV; s->shmid = device->ib_mr_shared->handle; s->start = ib_mr->addr; s->size = size; s->end = (void*)((uintptr_t)s->start + s->size); s->context = &memheap_device; } } return rc; }
/*
 * Attach to a remote segment described by mkey, when that is needed.
 *
 * Only keys with va_base == 0 and a valid shm id are handled; for any
 * other key the function returns without doing anything.  Depending on
 * the type encoded in the key the segment is attached with shmat() or,
 * when MPAGE_ENABLE is set, by registering a shared MR on the device
 * stored in the heap segment's context.  An unknown type, or an attach
 * that yields (void *)-1, aborts via oshmem_shmem_abort(-1).
 *
 * tr_id is used for log output only.
 */
static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id)
{
    /* process special case when va was got using shmget(IPC_PRIVATE)
     * this case is notable for:
     * - key is set as (type|shmid);
     * - va_base is set as 0;
     * anything else needs no work here
     */
    if (mkey->va_base
        || ((int) MEMHEAP_SHM_GET_ID(mkey->key) == MEMHEAP_SHM_INVALID)) {
        return;
    }

    MEMHEAP_VERBOSE(5,
                    "shared memory usage tr_id: %d key %llx base_va %p shmid 0x%X|0x%X",
                    tr_id,
                    (unsigned long long)mkey->key,
                    mkey->va_base,
                    MEMHEAP_SHM_GET_TYPE(mkey->key),
                    MEMHEAP_SHM_GET_ID(mkey->key));

    if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_SHM) {
        /* System V segment: let the kernel choose the address */
        mkey->va_base = shmat(MEMHEAP_SHM_GET_ID(mkey->key), 0, 0);
    } else if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_IBV) {
        /* NOTE(review): without MPAGE_ENABLE this branch is empty and
         * va_base stays 0 - confirm that is intended */
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        static int mr_count;
        openib_device_t *heap_device = NULL;
        struct ibv_mr *shared_mr;
        void *wanted_addr;
        int access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ |
                          IBV_ACCESS_NO_RDMA;

        heap_device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context;
        assert(heap_device);

        /* workaround mtt problem - request aligned addresses */
        ++mr_count;
        wanted_addr = (void *)(mca_memheap_base_start_address + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count);
        shared_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key),
                                      heap_device->ib_pd, wanted_addr, access_flag);
        if (NULL != shared_mr) {
            /* the region may land elsewhere than requested; warn and
             * continue with the address actually obtained */
            if (shared_mr->addr != wanted_addr) {
                MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. "
                             "Try to increase 'memheap_mr_interleave_factor' from %d",
                             wanted_addr, shared_mr->addr, mca_memheap_base_mr_interleave_factor);
            }

            opal_value_array_append_item(&heap_device->ib_mr_array, &shared_mr);
            mkey->va_base = shared_mr->addr;
        } else {
            /* picked up by the (void *)-1 check below */
            mkey->va_base = (void*)-1;
            MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s",
                          errno, strerror(errno));
        }
#endif /* MPAGE_ENABLE */
    } else {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X",
                      tr_id,
                      (unsigned long long)mkey->key,
                      MEMHEAP_SHM_GET_TYPE(mkey->key),
                      MEMHEAP_SHM_GET_ID(mkey->key));
        oshmem_shmem_abort(-1);
    }

    /* both attach paths signal failure with (void *)-1 */
    if ((void *) -1 == (void *) mkey->va_base) {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                      tr_id, (unsigned long long)mkey->key, errno);
        oshmem_shmem_abort(-1);
    }
}