/*
 * Drop one reference to a registration.  When the last reference goes
 * away the registration is either parked on the pool's LRU list (so a
 * later lookup can reuse it) or torn down immediately.
 *
 * @param mpool  the grdma mpool module owning the registration
 * @param reg    registration to release (ref_count must be > 0)
 * @return OMPI_SUCCESS, or the error code from dereg_mem()
 */
int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool,
                               mca_mpool_base_registration_t *reg)
{
    mca_mpool_grdma_module_t *grdma_module = (mca_mpool_grdma_module_t *) mpool;
    int ret = OMPI_SUCCESS;

    assert(reg->ref_count > 0);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    if (--reg->ref_count > 0) {
        /* registration still in use elsewhere -- nothing more to do */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OMPI_SUCCESS;
    }

    if (registration_is_cachebale(reg)) {
        /* keep the pinned region around for future cache hits */
        opal_list_append(&grdma_module->pool->lru_list, (opal_list_item_t *) reg);
    } else {
        ret = dereg_mem (reg);
    }

    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return ret;
}
/*
 * Drop one reference to a registration.  On the last release the
 * registration is either cached on the MRU list (leave_pinned mode) or
 * deregistered and returned to the free list.
 *
 * FIX: the original called dereg_mem() while still holding the rcache
 * lock.  dereg_mem() may allocate memory and trigger memory hooks that
 * re-enter the rcache, deadlocking on the same lock.  The registration
 * is now removed from the rcache first, the lock is dropped around the
 * actual deregistration, and the free-list return happens after
 * reacquiring the lock.
 *
 * @param mpool  the rdma mpool module owning the registration
 * @param reg    registration to release (ref_count must be > 0)
 * @return OMPI_SUCCESS, or the error code from dereg_mem()
 */
int mca_mpool_rdma_deregister(struct mca_mpool_base_module_t *mpool,
                              mca_mpool_base_registration_t *reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    int rc = OMPI_SUCCESS;

    assert(reg->ref_count > 0);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    reg->ref_count--;
    if(reg->ref_count > 0) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OMPI_SUCCESS;
    }

    if(mca_mpool_rdma_component.leave_pinned &&
       !(reg->flags & (MCA_MPOOL_FLAGS_CACHE_BYPASS|MCA_MPOOL_FLAGS_PERSIST))) {
        /* if leave_pinned is set don't deregister memory, but put it
         * on MRU list for future use */
        opal_list_prepend(&mpool_rdma->mru_list, (opal_list_item_t*)reg);
    } else {
        /* Remove from rcache first */
        if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
            mpool->rcache->rcache_delete(mpool->rcache, reg);

        /* Drop the rcache lock before deregistering the memory:
         * dereg_mem() may allocate (memory hooks) and re-enter the
         * rcache, which would deadlock if the lock were still held */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        rc = dereg_mem(mpool, reg);
        OPAL_THREAD_LOCK(&mpool->rcache->lock);

        if(OMPI_SUCCESS == rc) {
            OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                                  (ompi_free_list_item_t*)reg);
        }
    }
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    return rc;
}
/* This function must be called with the rcache lock held */
static inline void do_unregistration_gc(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_grdma_module_t *grdma = (mca_mpool_grdma_module_t*)mpool;
    opal_list_item_t *stale;

    /* Drain the garbage-collection list, removing each registration
     * from the list before deregistering it. */
    for (stale = opal_list_remove_first(&grdma->pool->gc_list);
         NULL != stale;
         stale = opal_list_remove_first(&grdma->pool->gc_list)) {
        dereg_mem((mca_mpool_base_registration_t *) stale);
    }
}
/*
 * Drain the cache's garbage-collection LIFO and deregister every stale
 * registration found on it.  The pop is atomic, so no external lock is
 * taken here.
 */
static inline void do_unregistration_gc (mca_rcache_base_module_t *rcache)
{
    mca_rcache_grdma_module_t *grdma = (mca_rcache_grdma_module_t *) rcache;

    for (;;) {
        opal_list_item_t *stale = opal_lifo_pop_atomic (&grdma->cache->gc_lifo);
        if (NULL == stale) {
            break;
        }

        OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
                             "deleting stale registration %p", (void *) stale));
        dereg_mem ((mca_rcache_base_registration_t *) stale);
    }
}
/*
 * Check whether CUDA has freed the memory backing [addr, addr+size).
 * If so, invalidate and deregister every overlapping registration.
 *
 * FIXES:
 *  - the function ended with OPAL_THREAD_UNLOCK but never took the
 *    lock; an OPAL_THREAD_LOCK is added before the rcache is mutated
 *  - mis-encoded "&reg" (appeared as a registered-trademark glyph)
 *  - pointers printed via %p are now cast to (void *) as required
 *
 * @return OPAL_SUCCESS, or OPAL_ERROR if a still-referenced
 *         registration covered the freed range (it is only marked
 *         invalid; it will be torn down when its ref_count drops)
 */
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i, rc = OPAL_SUCCESS;
    mca_mpool_base_registration_t *reg;

    mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
    if (NULL == reg) {
        return OPAL_SUCCESS;
    }

    /* If not previously freed memory, just return 0 */
    if (!(mca_common_cuda_previously_freed_memory(reg))) {
        return OPAL_SUCCESS;
    }

    /* FIX: take the rcache lock -- the original only unlocked at the
     * end, leaving the mutex unbalanced */
    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* This memory has been freed.  Find all registrations and delete */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base,
                                                 reg->bound - reg->base + 1,
                                                 regs, GRDMA_MPOOL_NREGS);
        for (i = 0 ; i < reg_cnt ; ++i) {
            regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID;
            if (regs[i]->ref_count) {
                opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d",
                            regs[i]->ref_count, (void *) regs[i]->base,
                            (void *) regs[i]->bound,
                            (int) (regs[i]->bound - regs[i]->base + 1));
                /* memory is being freed, but there are registration in use that
                 * covers the memory. This can happen even in a correct program,
                 * but may also be an user error. We can't tell. Mark the
                 * registration as invalid. It will not be used any more and
                 * will be unregistered when ref_count will become zero */
                rc = OPAL_ERROR; /* tell caller that something was wrong */
            } else {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *) regs[i]);
                /* Now deregister.  Do not use gc_list as we need to kick this out now. */
                dereg_mem(regs[i]);
            }
        }
    } while (reg_cnt == GRDMA_MPOOL_NREGS);

    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    return rc;
}
/*
 * rcache iteration callback: decide whether the existing registration
 * grdma_reg can satisfy the lookup described by ctx (a
 * mca_rcache_base_find_args_t).
 *
 * Returns 1 on a usable hit -- args->reg is set and the registration's
 * reference count is atomically incremented.  Returns 0 if the
 * registration cannot be used (wrong module, invalid, does not cover
 * the requested range, or insufficient access flags).
 *
 * Side effects on an access-flags mismatch: the candidate registration
 * is invalidated (and torn down immediately if unreferenced) so a new
 * registration with the widened flags can replace it.
 *
 * NOTE(review): lru_list is manipulated without taking a lock here --
 * presumably the caller holds the vma module's lock; confirm at the
 * call sites of this callback.
 */
static int mca_rcache_grdma_check_cached (mca_rcache_base_registration_t *grdma_reg, void *ctx)
{
    mca_rcache_base_find_args_t *args = (mca_rcache_base_find_args_t *) ctx;
    mca_rcache_grdma_module_t *rcache_grdma = args->rcache_grdma;

    /* reject: invalid, owned by another rcache module, or does not
     * fully cover the requested [base, bound] range */
    if ((grdma_reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
        &rcache_grdma->super != grdma_reg->rcache ||
        grdma_reg->base > args->base || grdma_reg->bound < args->bound) {
        return 0;
    }

    if (OPAL_UNLIKELY((args->access_flags & grdma_reg->access_flags) != args->access_flags)) {
        /* the cached registration lacks some requested access rights;
         * widen the request and retire the old registration */
        args->access_flags |= grdma_reg->access_flags;

        if (0 != grdma_reg->ref_count) {
            if (!(grdma_reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) {
                mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg);
            }
            /* mark the registration to go away when it is deregistered */
            grdma_reg->flags |= MCA_RCACHE_FLAGS_INVALID | MCA_RCACHE_FLAGS_CACHE_BYPASS;
        } else {
            if (registration_is_cacheable(grdma_reg)) {
                opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
            }
            dereg_mem (grdma_reg);
        }
    } else {
        if (0 == grdma_reg->ref_count) {
            /* Leave pinned must be set for this to still be in the rcache. */
            opal_list_remove_item(&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
        }

        /* This segment fits fully within an existing segment. */
        rcache_grdma->stat_cache_hit++;
        int32_t ref_cnt = opal_atomic_add_32 (&grdma_reg->ref_count, 1);
        OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_rcache_base_framework.framework_output,
                             "returning existing registration %p. references %d",
                             (void *) grdma_reg, ref_cnt));
        (void)ref_cnt;
        args->reg = grdma_reg;
        return 1;
    }

    /* can't use this registration */
    return 0;
}
/*
 * Tear down a grdma mpool module: print cache statistics (if enabled),
 * run the deferred-deregistration GC, forcibly deregister every
 * remaining registration, release the pool and internal lists, and
 * finally free the module itself.
 *
 * NOTE(review): ref_count is forced to 0 before dereg_mem so that
 * still-referenced registrations don't trip the assert in the
 * deregister path -- this is deliberate "best effort" teardown.
 */
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i;

    /* Statistic */
    if (true == mca_mpool_grdma_component.print_stats) {
        opal_output(0, "%s grdma: stats "
                    "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                    mpool_grdma->stat_cache_hit, mpool_grdma->stat_cache_miss,
                    mpool_grdma->stat_cache_found, mpool_grdma->stat_cache_notfound,
                    mpool_grdma->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* flush deferred deregistrations first (requires the lock held) */
    do_unregistration_gc(mpool);

    /* find_all returns at most GRDMA_MPOOL_NREGS at a time, so loop
     * until a partial batch signals the cache is drained */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                                                 regs, GRDMA_MPOOL_NREGS);
        for (i = 0 ; i < reg_cnt ; ++i) {
            if (regs[i]->ref_count) {
                regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */
            } else if (mca_mpool_grdma_component.leave_pinned) {
                /* unreferenced + leave_pinned means it sits on the LRU list */
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *) regs[i]);
            }
            (void) dereg_mem(regs[i]);
        }
    } while (reg_cnt == GRDMA_MPOOL_NREGS);

    OBJ_RELEASE(mpool_grdma->pool);
    OBJ_DESTRUCT(&mpool_grdma->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    /* this mpool was allocated by grdma_init in mpool_grdma_component.c */
    free(mpool);
}
int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool, void *base, size_t size) { mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; mca_mpool_base_registration_t *reg; ompi_pointer_array_t regs; int reg_cnt, i, err = 0; OBJ_CONSTRUCT(®s, ompi_pointer_array_t); OPAL_THREAD_LOCK(&mpool->rcache->lock); reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size, ®s); for(i = 0; i < reg_cnt; i++) { reg = (mca_mpool_base_registration_t*) ompi_pointer_array_get_item(®s, i); if(0 == reg->ref_count) { if(dereg_mem(mpool, reg) != OMPI_SUCCESS) { err++; continue; } } else { /* remove registration from cache and wait for ref_count goes to * zero before unregister memory. Note that our registered memory * statistic can go wrong at this point, but it is better than * potential memory corruption. And we return error in this case to * the caller */ reg->flags |= MCA_MPOOL_FLAGS_CACHE_BYPASS; err++; /* tell caller that something was wrong */ } mpool->rcache->rcache_delete(mpool->rcache, reg); if(0 == reg->ref_count) { opal_list_remove_item(&mpool_rdma->mru_list, (opal_list_item_t*)reg); OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, (ompi_free_list_item_t*)reg); } } OPAL_THREAD_UNLOCK(&mpool->rcache->lock); ompi_pointer_array_remove_all(®s); return err?OMPI_ERROR:OMPI_SUCCESS; }
/*
 * Evict the least-recently-used cached registration from this pool.
 * Returns true if a registration was evicted, false if the LRU list
 * was empty.
 */
static inline bool mca_mpool_grdma_evict_lru_local (mca_mpool_grdma_pool_t *pool)
{
    mca_mpool_grdma_module_t *owner;
    mca_mpool_base_registration_t *victim;

    victim = (mca_mpool_base_registration_t *) opal_list_remove_first (&pool->lru_list);
    if (NULL == victim) {
        return false;
    }

    /* capture the owning module before the registration is torn down */
    owner = (mca_mpool_grdma_module_t *) victim->mpool;

    (void) dereg_mem (victim);
    owner->stat_evicted++;

    return true;
}
/*
 * Evict the least-recently-used cached registration from this cache.
 * The LRU list is protected by the vma module's mutex; the mutex is
 * dropped before the (potentially slow) deregistration.
 * Returns true if a registration was evicted, false if the list was
 * empty.
 */
static inline bool mca_rcache_grdma_evict_lru_local (mca_rcache_grdma_cache_t *cache)
{
    mca_rcache_grdma_module_t *owner;
    mca_rcache_base_registration_t *victim;

    opal_mutex_lock (&cache->vma_module->vma_lock);
    victim = (mca_rcache_base_registration_t *) opal_list_remove_first (&cache->lru_list);
    opal_mutex_unlock (&cache->vma_module->vma_lock);

    if (NULL == victim) {
        return false;
    }

    /* capture the owning module before the registration is torn down */
    owner = (mca_rcache_grdma_module_t *) victim->rcache;

    (void) dereg_mem (victim);
    owner->stat_evicted++;

    return true;
}
/*
 * Drop one reference to a registration.  On the last release the
 * registration is either cached on the MRU list (leave_pinned mode) or
 * deregistered and returned to the free list.
 *
 * Locking: the rcache lock is deliberately dropped around dereg_mem()
 * because deregistration may allocate memory (memory hooks) and
 * re-enter the rcache; the registration is removed from the rcache
 * before the lock is released so no other thread can find it.
 *
 * @param mpool  the rdma mpool module owning the registration
 * @param reg    registration to release (ref_count must be > 0)
 * @return OMPI_SUCCESS, or the error code from dereg_mem()
 */
int mca_mpool_rdma_deregister(struct mca_mpool_base_module_t *mpool,
                              mca_mpool_base_registration_t *reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    int rc = OMPI_SUCCESS;

    assert(reg->ref_count > 0);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    reg->ref_count--;
    if(reg->ref_count > 0) {
        /* still referenced elsewhere -- nothing more to do */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OMPI_SUCCESS;
    }
    if(mca_mpool_rdma_component.leave_pinned &&
       registration_is_cachebale(reg)) {
        /* if leave_pinned is set don't deregister memory, but put it
         * on MRU list for future use */
        opal_list_prepend(&mpool_rdma->mru_list, (opal_list_item_t*)reg);
    } else {
        /* Remove from rcache first */
        if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
            mpool->rcache->rcache_delete(mpool->rcache, reg);

        /* Drop the rcache lock before deregistring the memory */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        rc = dereg_mem(mpool, reg);
        OPAL_THREAD_LOCK(&mpool->rcache->lock);

        if(OMPI_SUCCESS == rc) {
            OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                                  (ompi_free_list_item_t*)reg);
        }
    }
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return rc;
}
/* This function must be called with the rcache lock held */ static void do_unregistration_gc(struct mca_mpool_base_module_t *mpool) { mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool; mca_mpool_base_registration_t *reg; do { /* Remove registration from garbage collection list before deregistering it */ reg = (mca_mpool_base_registration_t *) opal_list_remove_first(&mpool_rdma->gc_list); mpool->rcache->rcache_delete(mpool->rcache, reg); /* Drop the rcache lock before calling dereg_mem as there may be memory allocations */ OPAL_THREAD_UNLOCK(&mpool->rcache->lock); dereg_mem(mpool, reg); OPAL_THREAD_LOCK(&mpool->rcache->lock); OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, (ompi_free_list_item_t*)reg); } while(!opal_list_is_empty(&mpool_rdma->gc_list)); }
/*
 * register memory
 *
 * Register [addr, addr+size) with the RDMA hardware, reusing a cached
 * registration when possible.  On success *reg points at a registration
 * whose reference count has been incremented; release it with
 * mca_mpool_rdma_deregister().
 *
 * Flow: cache-bypass requests skip the rcache entirely; otherwise the
 * request is page-aligned, the rcache is consulted (unless PERSIST),
 * and on a miss a fresh registration is inserted -- evicting MRU
 * entries if the rcache size limit is hit -- and then registered with
 * the hardware.
 */
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
                            size_t size, uint32_t flags,
                            mca_mpool_base_registration_t **reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *rdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* if cache bypass is requested don't use the cache */
    if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
        return register_cache_bypass(mpool, addr, size, flags, reg);
    }

    /* round the request out to full pages */
    base = down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = up_align_addr((void*)((char*) addr + size - 1),
                          mca_mpool_base_page_size_log);
    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        if(*reg != NULL &&
           (mca_mpool_rdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            /* cache hit: an unreferenced leave_pinned entry sits on the
             * MRU list and must be taken off before reuse */
            if(0 == (*reg)->ref_count &&
               mca_mpool_rdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_rdma->mru_list,
                                      (opal_list_item_t*)(*reg));
            }
            mpool_rdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return MPI_SUCCESS;
        }

        mpool_rdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set and size of registration cache is unlimited don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NOTE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_rdma_component.leave_pinned &&
           mca_mpool_rdma_component.rcache_size_limit == 0) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return register_cache_bypass(mpool, addr, size, flags, reg);
        }
    }

    OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    rdma_reg = (mca_mpool_base_registration_t*)item;

    rdma_reg->mpool = mpool;
    rdma_reg->base = base;
    rdma_reg->bound = bound;
    rdma_reg->flags = flags;

    /* insert into the rcache, evicting unused MRU entries one at a time
     * until the insert fits under the size limit or nothing is left to
     * evict */
    while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
              mca_mpool_rdma_component.rcache_size_limit)) ==
          OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        mca_mpool_base_registration_t *old_reg;
        /* try to remove one unused reg and retry */
        old_reg = (mca_mpool_base_registration_t*)
            opal_list_get_last(&mpool_rdma->mru_list);
        if(opal_list_get_end(&mpool_rdma->mru_list) !=
           (opal_list_item_t*)old_reg) {
            rc = dereg_mem(mpool, old_reg);
            if(MPI_SUCCESS == rc) {
                mpool->rcache->rcache_delete(mpool->rcache, old_reg);
                opal_list_remove_item(&mpool_rdma->mru_list,
                                      (opal_list_item_t*)old_reg);
                OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                                      (ompi_free_list_item_t*)old_reg);
                mpool_rdma->stat_evicted++;
            } else
                break;
        } else
            break;
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    /* rcache insert succeeded -- now do the actual hardware registration */
    rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
                                            base, bound - base + 1, rdma_reg);

    if(rc != OMPI_SUCCESS) {
        mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    *reg = rdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    return OMPI_SUCCESS;
}
/*
 * Tear down an rdma mpool module: print cache statistics (if enabled),
 * flush the deferred-deregistration gc list, forcibly deregister every
 * remaining registration, and destruct the internal lists.
 *
 * NOTE(review): the rcache lock is dropped and reacquired inside the
 * loop for each dereg_mem() call while iterating over a snapshot
 * array from rcache_find_all -- this relies on finalize being
 * effectively single-threaded; confirm no concurrent rcache users
 * remain at this point.
 */
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *reg;
    mca_mpool_base_registration_t *regs[RDMA_MPOOL_NREGS];
    int reg_cnt, i;
    int rc;

    /* Statistic */
    if(true == mca_mpool_rdma_component.print_stats) {
        opal_output(0, "%s rdma: stats "
                    "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss,
                    mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound,
                    mpool_rdma->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* flush deferred deregistrations first (gc requires the lock held) */
    if(!opal_list_is_empty(&mpool_rdma->gc_list))
        do_unregistration_gc(mpool);

    /* find_all returns at most RDMA_MPOOL_NREGS at a time, so loop
     * until a partial batch signals the cache is drained */
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                                                 regs, RDMA_MPOOL_NREGS);

        for(i = 0; i < reg_cnt; i++) {
            reg = regs[i];

            if(reg->ref_count) {
                reg->ref_count = 0; /* otherway dereg will fail on assert */
            } else if (mca_mpool_rdma_component.leave_pinned) {
                /* unreferenced + leave_pinned means it sits on the MRU list */
                opal_list_remove_item(&mpool_rdma->mru_list,
                                      (opal_list_item_t*)reg);
            }

            /* Remove from rcache first */
            mpool->rcache->rcache_delete(mpool->rcache, reg);

            /* Drop lock before deregistering memory */
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            rc = dereg_mem(mpool, reg);
            OPAL_THREAD_LOCK(&mpool->rcache->lock);

            if(rc != OMPI_SUCCESS) {
                /* Potentially lose track of registrations
                   do we have to put it back? */
                continue;
            }

            OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                                  (ompi_free_list_item_t*)reg);
        }
    } while(reg_cnt == RDMA_MPOOL_NREGS);

    OBJ_DESTRUCT(&mpool_rdma->mru_list);
    OBJ_DESTRUCT(&mpool_rdma->gc_list);
    OBJ_DESTRUCT(&mpool_rdma->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
}