/**
 * Register memory with the rdma mpool, reusing a cached registration
 * when a suitable one exists.
 *
 * The region is widened to page granularity (base/bound) using
 * mca_mpool_base_page_size_log before it is looked up or registered.
 *
 * @param mpool  mpool module; it is cast to mca_mpool_rdma_module_t
 * @param addr   start of the user buffer
 * @param size   length of the user buffer in bytes
 * @param flags  MCA_MPOOL_FLAGS_* bits:
 *               - CACHE_BYPASS: the request is handed straight to
 *                 register_cache_bypass()
 *               - PERSIST: the cache lookup is skipped and a new
 *                 registration is always created and inserted
 * @param reg    out: registration covering the buffer.  It is set to NULL
 *               after a cache miss; on the PERSIST path it is only written
 *               on success.
 *
 * @return OMPI_SUCCESS on success; otherwise the error code produced by
 *         the free list, rcache_insert(), dereg_mem() or register_mem(),
 *         or whatever register_cache_bypass() returns.
 */
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
                            size_t size, uint32_t flags,
                            mca_mpool_base_registration_t **reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *rdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* if cache bypass is requested don't use the cache */
    if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
        return register_cache_bypass(mpool, addr, size, flags, reg);
    }

    base = down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = up_align_addr((void*)((char*) addr + size - 1),
                          mca_mpool_base_page_size_log);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);

        /* A found registration is reused when leave_pinned is set, when it
         * is a persistent registration, or when it spans exactly the same
         * pages as this request. */
        if(*reg != NULL &&
           (mca_mpool_rdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            /* An unreferenced registration is taken off mru_list before
             * its reference count is raised. */
            if(0 == (*reg)->ref_count &&
               mca_mpool_rdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_rdma->mru_list,
                                      (opal_list_item_t*)(*reg));
            }
            mpool_rdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_rdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set and size of registration cache is unlimited don't use the
         * cache.  This is optimisation in case limit is not set.  If limit
         * is set we have to put registration into the cache to determine
         * when we hit memory registration limit.
         * NOTE: cache is still used for persistent registrations so
         * previous find can find something */
        if(!mca_mpool_rdma_component.leave_pinned &&
           mca_mpool_rdma_component.rcache_size_limit == 0) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return register_cache_bypass(mpool, addr, size, flags, reg);
        }
    }

    OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    rdma_reg = (mca_mpool_base_registration_t*)item;

    rdma_reg->mpool = mpool;
    rdma_reg->base = base;
    rdma_reg->bound = bound;
    rdma_reg->flags = flags;

    /* Insert into the cache; while the cache reports it is temporarily
     * full, evict one entry from mru_list and try again. */
    while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
                    mca_mpool_rdma_component.rcache_size_limit)) ==
            OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        mca_mpool_base_registration_t *old_reg;

        /* try to remove one unused reg and retry */
        old_reg = (mca_mpool_base_registration_t*)
            opal_list_get_last(&mpool_rdma->mru_list);
        if(opal_list_get_end(&mpool_rdma->mru_list) ==
                (opal_list_item_t*)old_reg) {
            /* nothing left to evict: rc is still
             * OMPI_ERR_TEMP_OUT_OF_RESOURCE and is reported below */
            break;
        }

        rc = dereg_mem(mpool, old_reg);
        if(OMPI_SUCCESS != rc) {
            break;
        }

        mpool->rcache->rcache_delete(mpool->rcache, old_reg);
        opal_list_remove_item(&mpool_rdma->mru_list,
                              (opal_list_item_t*)old_reg);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                              (ompi_free_list_item_t*)old_reg);
        mpool_rdma->stat_evicted++;
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
                                            base, bound - base + 1, rdma_reg);

    if(rc != OMPI_SUCCESS) {
        /* undo the cache insertion done above */
        mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    *reg = rdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    return OMPI_SUCCESS;
}
/*
 * Register memory with the grdma mpool.
 *
 * The request is widened to page granularity.  Unless the caller asked
 * for a persistent registration or for cache bypass, the registration
 * cache is searched first and a suitable hit is returned with its
 * reference count raised.  Otherwise a fresh registration is taken from
 * the module's free list, inserted into the cache when caching applies,
 * and registered through the module's register_mem callback.
 *
 * Returns OMPI_SUCCESS with *reg set on success; on failure returns the
 * error code from the free list, rcache_insert() or register_mem().
 */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                             size_t size, uint32_t flags,
                             mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *new_reg;
    ompi_free_list_item_t *fl_item;
    unsigned char *page_base, *page_bound;
    /* true when the new registration must not be put into the cache */
    bool skip_cache = (0 != (flags & MCA_MPOOL_FLAGS_CACHE_BYPASS));
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* page-aligned extent of the requested region */
    page_base = (unsigned char *)
        down_align_addr(addr, mca_mpool_base_page_size_log);
    page_bound = (unsigned char *)
        up_align_addr((void*)((char*) addr + size - 1),
                      mca_mpool_base_page_size_log);

    /* run the unregistration GC if its list has entries */
    if (!opal_list_is_empty (&grdma->pool->gc_list)) {
        do_unregistration_gc(mpool);
    }

    /* The cache lookup is skipped for persistent requests and for
     * explicit cache bypass; persistent requests that do not bypass the
     * cache are still inserted further down. */
    if (!skip_cache && !(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        mca_mpool_base_registration_t *hit;

        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        hit = *reg;

        /* A hit is usable when leave_pinned is set, when the cached entry
         * is persistent, or when it spans exactly the same pages. */
        if (NULL != hit &&
            (mca_mpool_grdma_component.leave_pinned ||
             (hit->flags & MCA_MPOOL_FLAGS_PERSIST) ||
             (hit->base == page_base && hit->bound == page_bound))) {
            /* an unreferenced entry leaves lru_list before being reused */
            if (mca_mpool_grdma_component.leave_pinned &&
                0 == hit->ref_count) {
                opal_list_remove_item(&grdma->pool->lru_list,
                                      (opal_list_item_t*)hit);
            }
            grdma->stat_cache_hit++;
            hit->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        grdma->stat_cache_miss++;
        *reg = NULL;   /* discard an unsuitable hit, if there was one */

        /* Nothing usable was cached: without leave_pinned the new
         * registration is not inserted into the cache either. */
        if (!mca_mpool_grdma_component.leave_pinned) {
            skip_cache = true;
        }
    }

    OMPI_FREE_LIST_GET(&grdma->reg_list, fl_item, rc);
    if (OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }

    new_reg = (mca_mpool_base_registration_t*)fl_item;
    new_reg->mpool = mpool;
    new_reg->base  = page_base;
    new_reg->bound = page_bound;
    new_reg->flags = flags;

    if (!skip_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, new_reg, 0);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            goto release_item;
        }
    }

    /* Register; while the callback reports OMPI_ERR_OUT_OF_RESOURCE, evict
     * one registration and retry until eviction itself gives up. */
    for (;;) {
        rc = grdma->resources.register_mem(grdma->resources.reg_data,
                                           page_base,
                                           page_bound - page_base + 1,
                                           new_reg);
        if (OMPI_ERR_OUT_OF_RESOURCE != rc) {
            break;
        }
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        /* take the entry back out of the cache if it was inserted */
        if (!skip_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, new_reg);
        }
        goto release_item;
    }

    *reg = new_reg;
    new_reg->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return OMPI_SUCCESS;

release_item:
    /* common failure exit: drop the lock, then recycle the list item */
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    OMPI_FREE_LIST_RETURN(&grdma->reg_list, fl_item);
    return rc;
}
/**
 * Return a module-specific IO MPI_Request.
 *
 * A request cached on the file (file->f_io_requests) is reused when one
 * is available.  Otherwise a request is taken from the global
 * mca_io_base_requests free list and given its one-time, module-specific
 * setup.  In both cases the request is (re)initialized before being
 * handed back.
 *
 * @param file  file the request will be used with
 * @param req   out: the allocated request; NULL when an error is returned
 *
 * @return OMPI_SUCCESS on success; the error reported by
 *         OMPI_FREE_LIST_GET or by the module's once-init function;
 *         OMPI_ERR_NOT_IMPLEMENTED for an unknown file->f_io_version.
 */
int mca_io_base_request_alloc(ompi_file_t *file,
                              mca_io_base_request_t **req)
{
    int err;
    mca_io_base_module_request_once_init_fn_t func;
    ompi_free_list_item_t *item;

    /* See if we've got a request on the module's freelist (which is
       cached on the file, since there's only one module per
       MPI_File).  Use a quick-but-not-entirely-accurate (but good
       enough) check as a slight optimization to potentially avoid
       locking and unlocking. */
    if (opal_list_get_size(&file->f_io_requests) > 0) {
        OPAL_THREAD_LOCK(&file->f_io_requests_lock);
        /* re-check under the lock: the unlocked test above may be stale */
        if (opal_list_get_size(&file->f_io_requests) > 0) {
            *req = (mca_io_base_request_t*)
                opal_list_remove_first(&file->f_io_requests);
            (*req)->free_called = false;
        } else {
            *req = NULL;
        }
        OPAL_THREAD_UNLOCK(&file->f_io_requests_lock);
    } else {
        *req = NULL;
    }

    /* Nope, we didn't have one on the file freelist, so let's get one
       off the global freelist */
    if (NULL == *req) {
        OMPI_FREE_LIST_GET(&mca_io_base_requests, item, err);
        if (OMPI_SUCCESS != err) {
            /* No item was obtained; *req is still NULL here.  Bail out
               instead of dereferencing it below. */
            return err;
        }
        *req = (mca_io_base_request_t*) item;

        /* One-time setup of the freshly obtained request, according to
           the version of the module selected for this file */
        switch (file->f_io_version) {
        case MCA_IO_BASE_V_2_0_0:

            /* These can be set once for this request since this
               request will always be used with the same module (and
               therefore, the same MPI_File).  Note that
               (*req)->req_ompi.rq_type is already set by the
               constructor. */
            (*req)->req_file = file;
            (*req)->req_ver = file->f_io_version;
            (*req)->free_called = false;
            (*req)->super.req_free =
                file->f_io_selected_module.v2_0_0.io_module_request_free;
            (*req)->super.req_cancel =
                file->f_io_selected_module.v2_0_0.io_module_request_cancel;

            /* Call the module's once-per process init, if it exists */
            func =
                file->f_io_selected_module.v2_0_0.io_module_request_once_init;
            if (NULL != func) {
                if (OMPI_SUCCESS !=
                    (err = func(&file->f_io_selected_module, *req))) {
                    OMPI_FREE_LIST_RETURN(&mca_io_base_requests, item);
                    /* don't hand the caller a pointer to a request that
                       has gone back to the free list */
                    *req = NULL;
                    return err;
                }
            }
            break;

        default:
            OMPI_FREE_LIST_RETURN(&mca_io_base_requests, item);
            *req = NULL;
            return OMPI_ERR_NOT_IMPLEMENTED;
        }
    }

    /* Initialize the request */
    OMPI_REQUEST_INIT(&((*req)->super), false);
    (*req)->super.req_mpi_object.file = file;

    /*
     * Copied from ompi/mca/pml/base/pml_base_recvreq.h:
     * always set the req_status.MPI_TAG to ANY_TAG before starting the
     * request. This field is used if cancelled to find out if the request
     * has been matched or not.
     */
    (*req)->super.req_status.MPI_TAG = MPI_ANY_TAG;
    (*req)->super.req_status.MPI_ERROR = OMPI_SUCCESS;
    (*req)->super.req_status._count = 0;
    (*req)->super.req_status._cancelled = 0;

    /* All done */
    return OMPI_SUCCESS;
}