/*
 * register memory
 */
/* NOTE(review): this fragment appears truncated in this chunk -- the closing
 * brace of the function is not visible before the next definition begins, so
 * only the visible prologue (alignment, CUDA range fixup, GC, cache lookup)
 * is documented here. */
static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *addr,
                                      size_t size, uint32_t flags, int32_t access_flags,
                                      mca_rcache_base_registration_t **reg)
{
    mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache;
    /* caller asked for a one-shot registration that must not be cached */
    const bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS);
    /* persistent registrations are always registered fresh and cached */
    const bool persist = !!(flags & MCA_RCACHE_FLAGS_PERSIST);
    mca_rcache_base_registration_t *grdma_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    unsigned int page_size = opal_getpagesize ();
    int rc;

    *reg = NULL;

    /* if cache bypass is requested don't use the cache */
    /* expand [addr, addr+size) to full page boundaries; bound is the last
     * byte of the covering page range (inclusive) */
    base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
    bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;

#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
        size_t psize;
        /* for GPU memory use the allocation's real range, not page alignment */
        mca_common_cuda_get_address_range(&base, &psize, addr);
        bound = base + psize - 1;
        /* Check to see if this memory is in the cache and if it has been freed. If so,
         * this call will boot it out of the cache. */
        check_for_cuda_freed_memory(rcache, base, psize);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    /* flush any deferred deregistrations before searching the cache */
    do_unregistration_gc (rcache);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if (!(bypass_cache || persist)) {
        mca_rcache_base_find_args_t find_args = {.reg = NULL, .rcache_grdma = rcache_grdma,
                                                 .base = base, .bound = bound,
                                                 .access_flags = access_flags};

        /* check to see if memory is registered */
        /* iterate VMAs overlapping [base, base+size); the callback returns 1
         * on a usable cached registration (presumably -- confirm against
         * mca_rcache_grdma_check_cached) */
        rc = mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size,
                                          mca_rcache_grdma_check_cached, (void *) &find_args);
        if (1 == rc) {
            *reg = find_args.reg;
            return OPAL_SUCCESS;
        }

        /* get updated access flags */
        access_flags = find_args.access_flags;

        OPAL_THREAD_ADD32((volatile int32_t *) &rcache_grdma->stat_cache_miss, 1);
    }
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool) { mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; int reg_cnt, i; /* Statistic */ if (true == mca_mpool_grdma_component.print_stats) { opal_output(0, "%s grdma: stats " "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), mpool_grdma->stat_cache_hit, mpool_grdma->stat_cache_miss, mpool_grdma->stat_cache_found, mpool_grdma->stat_cache_notfound, mpool_grdma->stat_evicted); } OPAL_THREAD_LOCK(&mpool->rcache->lock); do_unregistration_gc(mpool); do { reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1, regs, GRDMA_MPOOL_NREGS); for (i = 0 ; i < reg_cnt ; ++i) { if (regs[i]->ref_count) { regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */ } else if (mca_mpool_grdma_component.leave_pinned) { opal_list_remove_item(&mpool_grdma->pool->lru_list, (opal_list_item_t *) regs[i]); } (void) dereg_mem(regs[i]); } } while (reg_cnt == GRDMA_MPOOL_NREGS); OBJ_RELEASE(mpool_grdma->pool); OBJ_DESTRUCT(&mpool_grdma->reg_list); OPAL_THREAD_UNLOCK(&mpool->rcache->lock); /* Cleanup any vmas that we have deferred deletion on */ mpool->rcache->rcache_clean(mpool->rcache); /* this mpool was allocated by grdma_init in mpool_grdma_component.c */ free(mpool); }
/*
 * register memory
 */
/* Register [addr, addr+size) with the grdma mpool.  On success *reg holds a
 * registration with its reference count incremented.  A cached registration
 * is reused when possible; otherwise a new one is created, inserted into the
 * rcache (unless bypassed), and handed to the resource-specific
 * register_mem callback.  Returns OMPI_SUCCESS or an error code. */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *grdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* if cache bypass is requested don't use the cache */
    /* page-align the requested range; bound is the last byte (inclusive) */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
             mca_mpool_base_page_size_log);

    /* process deferred deregistrations before consulting the cache */
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST) && !bypass_cache) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        /* a cached reg is usable if leave_pinned is on, it is persistent, or
         * it covers exactly the same aligned range */
        if(*reg != NULL &&
           (mca_mpool_grdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
               mca_mpool_grdma_component.leave_pinned) {
                /* was parked unreferenced on the LRU; take it back */
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t*)(*reg));
            }

            /* This segment fits fully within an existing segment. */
            mpool_grdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_grdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NOTE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_grdma_component.leave_pinned) {
            bypass_cache = true;
        }
    }

    /* take a fresh registration object from the free list */
    OMPI_FREE_LIST_GET(&mpool_grdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    /* on OUT_OF_RESOURCE evict an unused cached registration and retry */
    while (OMPI_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        /* undo the cache insert before returning the object to the list */
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return OMPI_SUCCESS;
}
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool) { mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; mca_mpool_base_registration_t *reg; mca_mpool_base_registration_t *regs[RDMA_MPOOL_NREGS]; int reg_cnt, i; int rc; /* Statistic */ if(true == mca_mpool_rdma_component.print_stats) { opal_output(0, "%s rdma: stats " "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss, mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound, mpool_rdma->stat_evicted); } OPAL_THREAD_LOCK(&mpool->rcache->lock); if(!opal_list_is_empty(&mpool_rdma->gc_list)) do_unregistration_gc(mpool); do { reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1, regs, RDMA_MPOOL_NREGS); for(i = 0; i < reg_cnt; i++) { reg = regs[i]; if(reg->ref_count) { reg->ref_count = 0; /* otherway dereg will fail on assert */ } else if (mca_mpool_rdma_component.leave_pinned) { opal_list_remove_item(&mpool_rdma->mru_list, (opal_list_item_t*)reg); } /* Remove from rcache first */ mpool->rcache->rcache_delete(mpool->rcache, reg); /* Drop lock before deregistering memory */ OPAL_THREAD_UNLOCK(&mpool->rcache->lock); rc = dereg_mem(mpool, reg); OPAL_THREAD_LOCK(&mpool->rcache->lock); if(rc != OMPI_SUCCESS) { /* Potentially lose track of registrations do we have to put it back? */ continue; } OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, (ompi_free_list_item_t*)reg); } } while(reg_cnt == RDMA_MPOOL_NREGS); OBJ_DESTRUCT(&mpool_rdma->mru_list); OBJ_DESTRUCT(&mpool_rdma->gc_list); OBJ_DESTRUCT(&mpool_rdma->reg_list); OPAL_THREAD_UNLOCK(&mpool->rcache->lock); /* Cleanup any vmas that we have deferred deletion on */ mpool->rcache->rcache_clean(mpool->rcache); }
/*
 * register memory
 */
/* Register [addr, addr+size) with the rdma mpool.  On success *reg holds a
 * registration with its reference count incremented.  Reuses a cached
 * registration when possible; otherwise inserts a new one into the rcache
 * (evicting MRU entries if the cache size limit is hit) and registers the
 * memory with the resource-specific callback.  Returns OMPI_SUCCESS or an
 * error code. */
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
                            size_t size, uint32_t flags,
                            mca_mpool_base_registration_t **reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *rdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* if cache bypass is requested don't use the cache */
    if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
        return register_cache_bypass(mpool, addr, size, flags, reg);
    }

    /* page-align the requested range; bound is the last byte (inclusive) */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
             mca_mpool_base_page_size_log);
    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* process deferred deregistrations before consulting the cache */
    if(!opal_list_is_empty(&mpool_rdma->gc_list))
        do_unregistration_gc(mpool);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        /* a cached reg is usable if leave_pinned is on, it is persistent, or
         * it covers exactly the same aligned range */
        if(*reg != NULL &&
           (mca_mpool_rdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
               mca_mpool_rdma_component.leave_pinned) {
                /* was parked unreferenced on the MRU list; take it back */
                opal_list_remove_item(&mpool_rdma->mru_list,
                                      (opal_list_item_t*)(*reg));
            }
            mpool_rdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_rdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set and size of registration cache is unlimited don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NOTE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_rdma_component.leave_pinned &&
           mca_mpool_rdma_component.rcache_size_limit == 0) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return register_cache_bypass(mpool, addr, size, flags, reg);
        }
    }

    /* take a fresh registration object from the free list */
    OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    rdma_reg = (mca_mpool_base_registration_t*)item;

    rdma_reg->mpool = mpool;
    rdma_reg->base = base;
    rdma_reg->bound = bound;
    rdma_reg->flags = flags;

    /* insert into the cache, evicting the least-recently-used unreferenced
     * registration each time the cache reports it is temporarily full */
    while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
             mca_mpool_rdma_component.rcache_size_limit)) ==
            OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        mca_mpool_base_registration_t *old_reg;
        /* try to remove one unused reg and retry */
        old_reg = (mca_mpool_base_registration_t*)
            opal_list_get_last(&mpool_rdma->mru_list);
        if(opal_list_get_end(&mpool_rdma->mru_list) !=
           (opal_list_item_t*)old_reg) {
            /* Remove the registration from the cache and list before
               deregistering the memory */
            mpool->rcache->rcache_delete(mpool->rcache, old_reg);
            opal_list_remove_item(&mpool_rdma->mru_list,
                                  (opal_list_item_t*)old_reg);

            /* Drop the rcache lock while we deregister the memory */
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            rc = dereg_mem(mpool, old_reg);
            OPAL_THREAD_LOCK(&mpool->rcache->lock);

            /* This introduces a potential leak of registrations if
               the deregistration fails to occur as we no longer have
               a reference to it. Is this possible? */
            if(OMPI_SUCCESS == rc) {
                OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                                      (ompi_free_list_item_t*)old_reg);
                mpool_rdma->stat_evicted++;
            } else
                break;
        } else
            break;
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    /* perform the actual hardware/resource registration */
    rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
            base, bound - base + 1, rdma_reg);

    if(rc != OMPI_SUCCESS) {
        /* undo the cache insert before returning the object to the list */
        mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    *reg = rdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return OMPI_SUCCESS;
}
/*
 * register memory
 */
/* Register [addr, addr+size) with the grdma mpool (newer OPAL variant).
 * On success *reg holds a registration with its reference count
 * incremented.  Reuses any valid cached registration covering the aligned
 * range; otherwise creates a new one, caches it (unless bypass is
 * requested), and registers the memory with the resource callback,
 * evicting unused registrations on resource exhaustion.  Returns
 * OPAL_SUCCESS or an error code. */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                             size_t size, uint32_t flags,
                             mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    /* caller asked for a one-shot registration that must not be cached */
    const bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    /* persistent registrations are always registered fresh and cached */
    const bool persist = !!(flags & MCA_MPOOL_FLAGS_PERSIST);
    mca_mpool_base_registration_t *grdma_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* if cache bypass is requested don't use the cache */
    /* page-align the requested range; bound is the last byte (inclusive) */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
             mca_mpool_base_page_size_log);

    /* process deferred deregistrations before consulting the cache */
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        size_t psize;
        /* for GPU memory use the allocation's real range, not page alignment */
        mca_common_cuda_get_address_range(&base, &psize, addr);
        bound = base + psize - 1;
        /* Check to see if this memory is in the cache and if it has been freed. If so,
         * this call will boot it out of the cache. */
        check_for_cuda_freed_memory(mpool, base, psize);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(bypass_cache || persist)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg);
        if (*reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) {
            if (0 == (*reg)->ref_count) {
                /* Leave pinned must be set for this to still be in the rcache. */
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *)(*reg));
            }

            /* This segment fits fully within an existing segment. */
            mpool_grdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OPAL_SUCCESS;
        }

        mpool_grdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* Unless explicitly requested by the caller always store the
         * registration in the rcache. This will speed up the case where
         * no leave pinned protocol is in use but the same segment is in
         * use in multiple simultaneous transactions. We used to set
         * bypass_cache here if !mca_mpool_grdma_component.leave_pinned. */
    }

    /* take a fresh registration object from the free list */
    item = opal_free_list_get (&mpool_grdma->reg_list);
    if(NULL == item) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;
#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        /* stash the CUDA buffer id so freed GPU memory can be detected */
        mca_common_cuda_get_buffer_id(grdma_reg);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            opal_free_list_return (&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    /* on OUT_OF_RESOURCE evict an unused cached registration and retry */
    while (OPAL_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
        /* undo the cache insert before returning the object to the list */
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        opal_free_list_return (&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return OPAL_SUCCESS;
}