/**
 * ib_umem_odp_unmap_dma_pages - DMA-unmap the mapped pages of an ODP umem range.
 *
 * @umem: the umem whose pages are unmapped
 * @virt: first virtual address of the range
 * @bound: end of the range (exclusive)
 *
 * The range is clamped to [ib_umem_start(umem), ib_umem_end(umem)). Each page
 * slot in the clamped range that currently holds a page is DMA-unmapped and
 * its page_list/dma_list entries are cleared.
 */
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, u64 bound)
{
	struct ib_device *dev = umem->context->device;
	u64 cur;
	int slot;

	virt = max_t(u64, virt, ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));

	/*
	 * While this function runs, the notifiers_count of the MR is > 0,
	 * which keeps any racing fault from completing. Other invalidations
	 * may still race with us, so each slot is inspected and cleared under
	 * the umem_mutex to guarantee a page is released only once.
	 */
	for (cur = virt; cur < bound; cur += PAGE_SIZE) {
		struct page *page;
		struct page *head_page;
		dma_addr_t dma_addr;

		slot = (cur - ib_umem_start(umem)) / PAGE_SIZE;

		mutex_lock(&umem->odp_data->umem_mutex);

		page = umem->odp_data->page_list[slot];
		if (page) {
#ifdef CONFIG_COMPAT_USE_COMPOUND_TRANS_HEAD
			head_page = compound_trans_head(page);
#else
			head_page = compound_head(page);
#endif
			dma_addr = umem->odp_data->dma_list[slot] &
				   ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
					  DMA_BIDIRECTIONAL);

			if (umem->odp_data->dma_list[slot] &
			    ODP_WRITE_ALLOWED_BIT) {
				/*
				 * set_page_dirty would rather be called with
				 * the page lock held, but MMU notifiers run
				 * both with and without it. The umem_mutex is
				 * what stops other mmu notifiers from
				 * progressing and letting the page mapping be
				 * torn down underneath us.
				 */
				set_page_dirty(head_page);
			}

			/*
			 * On demand pinning support: with no invalidate_range
			 * callback on the context, the page reference is
			 * dropped here.
			 */
			if (!umem->context->invalidate_range)
				put_page(page);

			umem->odp_data->page_list[slot] = NULL;
			umem->odp_data->dma_list[slot] = 0;
			atomic_inc(&dev->odp_statistics.num_invalidation_pages);
		}

		mutex_unlock(&umem->odp_data->umem_mutex);
	}
}
/*
 * Search the context's umem interval tree, under the read side of
 * ctx->umem_rwsem, for an ODP umem accepted by check_parent(odp, parent).
 *
 * The search begins at the entry rbt_ib_umem_lookup() returns for
 * (start, length) and advances through the tree with rb_next(). It gives up
 * when the tree is exhausted or when the next entry starts beyond
 * start + length.
 *
 * Returns the matching entry, or NULL when there is none.
 */
static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
				      u64 start, u64 length,
				      struct mlx5_ib_mr *parent)
{
	struct ib_umem_odp *odp;
	struct rb_node *next;

	down_read(&ctx->umem_rwsem);

	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
	while (odp && !check_parent(odp, parent)) {
		next = rb_next(&odp->interval_tree.rb);
		if (!next) {
			/* Ran off the end of the tree. */
			odp = NULL;
			break;
		}

		odp = rb_entry(next, struct ib_umem_odp, interval_tree.rb);
		if (ib_umem_start(odp->umem) > start + length) {
			/* The following entries lie past the range. */
			odp = NULL;
			break;
		}
	}

	up_read(&ctx->umem_rwsem);
	return odp;
}
/*
 * Per-umem step of the MMU notifier release path: mark the umem as dying,
 * wake everything waiting on its notifier completion, and invalidate its
 * whole range. The start, end and cookie arguments are not used.
 *
 * Always returns 0.
 */
static int ib_umem_notifier_release_trampoline(struct ib_umem *item,
					       u64 start, u64 end,
					       void *cookie)
{
	u64 range_first;
	u64 range_last;

	/*
	 * Bump the count of running notifiers so that no further fault
	 * handling can take place on this MR.
	 */
	ib_umem_notifier_start_account(item);

	item->odp_data->dying = 1;

	/*
	 * The dying flag has to be visible before the pending page faults are
	 * released.
	 */
	smp_wmb();
	complete_all(&item->odp_data->notifier_completion);

	range_first = ib_umem_start(item);
	range_last = ib_umem_end(item);
	item->context->invalidate_range(item, range_first, range_last);

	return 0;
}
/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages is updated in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped in success, negative error code
 * for failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * -EINVAL is returned for an empty access mask or when the owning task or
 * its mm can no longer be obtained, -EFAULT when the requested range is not
 * fully contained in the umem, and -ENOMEM when the scratch page for the
 * page list cannot be allocated.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual pages mapped is returned in
 *        the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. the sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function
 * @flags: IB_ODP_DMA_MAP_FOR_PREEFTCH is used to indicate that the function
 *         was called from the prefetch verb. IB_ODP_DMA_MAP_FOR_PAGEFAULT is
 *         used to indicate that the function was called from a pagefault
 *         handler.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq,
			      enum ib_odp_dma_map_flags flags)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = NULL;
	struct page **local_page_list = NULL;
	u64 off;
	int j, k, ret = 0, start_idx, npages = 0;

	if (access_mask == 0)
		return -EINVAL;

	/*
	 * Reject ranges that are not fully inside the umem. The length is
	 * compared against the space left after user_virt instead of testing
	 * user_virt + bcnt, because that sum can wrap around for a huge bcnt
	 * and would then slip past the check.
	 */
	if (user_virt < ib_umem_start(umem) ||
	    user_virt > ib_umem_end(umem) ||
	    bcnt > ib_umem_end(umem) - user_virt)
		return -EFAULT;

	/* One page of scratch space holding the page pointers of a batch. */
	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	bcnt += off; /* Charge for the first page offset as well. */

	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (owning_process == NULL) {
		ret = -EINVAL;
		goto out_no_task;
	}

	owning_mm = get_task_mm(owning_process);
	if (owning_mm == NULL) {
		ret = -EINVAL;
		goto out_put_task;
	}

	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	k = start_idx;

	while (bcnt > 0) {
		/*
		 * A batch covers the remaining bytes rounded up to whole
		 * pages, capped by the number of page pointers that fit in
		 * the scratch page.
		 */
		const size_t gup_num_pages =
			min_t(size_t, (bcnt - 1 + PAGE_SIZE) / PAGE_SIZE,
			      PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list to be 0 before calling
		 * get_user_pages. However, this make the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages(owning_process, owning_mm, user_virt,
					gup_num_pages,
					access_mask & ODP_WRITE_ALLOWED_BIT, 0,
					local_page_list, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;
		for (j = 0; j < npages; ++j) {
			ret = ib_umem_odp_map_dma_single_page(umem, k,
							      local_page_list[j],
							      access_mask,
							      current_seq,
							      flags);
			if (ret < 0)
				break;
			k++;
		}

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		/*
		 * Report the pinning error only when nothing at all was
		 * mapped; otherwise report how many pages were mapped.
		 */
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	put_task_struct(owning_process);
out_no_task:
	free_page((unsigned long) local_page_list);
	return ret;
}
void ib_umem_odp_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), ib_umem_end(umem)); down_write(&context->umem_mutex); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem->odp_data->interval_tree, &context->umem_tree); context->odp_mrs_count--; /* * Downgrade the lock to a read lock. This ensures that the notifiers * (who lock the mutex for reading) will be able to finish, and we * will be able to enventually obtain the mmu notifiers SRCU. Note * that since we are doing it atomically, no other user could register * and unregister while we do the check. */ downgrade_write(&context->umem_mutex); if (!context->odp_mrs_count) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; owning_process = get_pid_task(context->tgid, PIDTYPE_PID); if (owning_process == NULL) /* * The process is already dead, notifier were removed * already. */ goto out; owning_mm = get_task_mm(owning_process); if (owning_mm == NULL) /* * The process' mm is already dead, notifier were * removed already. */ goto out_put_task; mmu_notifier_unregister(&context->mn, owning_mm); mmput(owning_mm); out_put_task: put_task_struct(owning_process); } out: up_read(&context->umem_mutex); vfree(umem->odp_data->dma_list); vfree(umem->odp_data->page_list); kfree(umem->odp_data); kfree(umem); }
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) { int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); BUG_ON(!mm); /* Prevent creating ODP MRs in child processes */ rcu_read_lock(); our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); put_pid(our_pid); if (context->tgid != our_pid) { ret_val = -EINVAL; goto out_mm; } umem->hugetlb = 0; umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); if (!umem->odp_data) { ret_val = -ENOMEM; goto out_mm; } umem->odp_data->umem = umem; mutex_init(&umem->odp_data->umem_mutex); init_completion(&umem->odp_data->notifier_completion); umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * sizeof(*umem->odp_data->page_list)); if (!umem->odp_data->page_list) { ret_val = -ENOMEM; goto out_odp_data; } umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * sizeof(*umem->odp_data->dma_list)); if (!umem->odp_data->dma_list) { ret_val = -ENOMEM; goto out_page_list; } /* * When using MMU notifiers, we will get a * notification before the "current" task (and MM) is * destroyed. We use the umem_mutex lock to synchronize. */ down_write(&context->umem_mutex); context->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem->odp_data->interval_tree, &context->umem_tree); downgrade_write(&context->umem_mutex); if (context->odp_mrs_count == 1) { /* * Note that at this point, no MMU notifier is running * for this context! */ INIT_HLIST_NODE(&context->mn.hlist); context->mn.ops = &ib_umem_notifiers; /* * Lock-dep detects a false positive for mmap_sem vs. * umem_mutex, due to not grasping downgrade_write correctly. */ lockdep_off(); ret_val = mmu_notifier_register(&context->mn, mm); lockdep_on(); if (ret_val) { pr_err("Failed to register mmu_notifier %d\n", ret_val); ret_val = -EBUSY; goto out_mutex; } } up_read(&context->umem_mutex); /* * Note that doing an mmput can cause a notifier for the relevant mm. 
* If the notifier is called while we hold the umem_mutex, this will * cause a deadlock. Therefore, we release the reference only after we * released the mutex. */ mmput(mm); return 0; out_mutex: up_read(&context->umem_mutex); vfree(umem->odp_data->dma_list); out_page_list: vfree(umem->odp_data->page_list); out_odp_data: kfree(umem->odp_data); out_mm: mmput(mm); return ret_val; }