/* * decide whether a page can be released, possibly by cancelling a store to it * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged */ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { struct page *xpage; void *val; _enter("%p,%p,%x", cookie, page, gfp); try_again: rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); if (!val) { rcu_read_unlock(); fscache_stat(&fscache_n_store_vmscan_not_storing); __fscache_uncache_page(cookie, page); return true; } /* see if the page is actually undergoing storage - if so we can't get * rid of it till the cache has finished with it */ if (radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG)) { rcu_read_unlock(); goto page_busy; } /* the page is pending storage, so we attempt to cancel the store and * discard the store request so that the page can be reclaimed */ spin_lock(&cookie->stores_lock); rcu_read_unlock(); if (radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG)) { /* the page started to undergo storage whilst we were looking, * so now we can only wait or return */ spin_unlock(&cookie->stores_lock); goto page_busy; } xpage = radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); if (xpage) { fscache_stat(&fscache_n_store_vmscan_cancelled); fscache_stat(&fscache_n_store_radix_deletes); ASSERTCMP(xpage, ==, page); } else {
/** * hwspin_lock_request_specific() - request for a specific hwspinlock * @id: index of the specific hwspinlock that is requested * * This function should be called by users of the hwspinlock module, * in order to assign them a specific hwspinlock. * Usually early board code will be calling this function in order to * reserve specific hwspinlock ids for predefined purposes. * * Should be called from a process context (might sleep) * * Returns the address of the assigned hwspinlock, or NULL on error */ struct hwspinlock *hwspin_lock_request_specific(unsigned int id) { struct hwspinlock *hwlock; int ret; mutex_lock(&hwspinlock_tree_lock); /* make sure this hwspinlock exists */ hwlock = radix_tree_lookup(&hwspinlock_tree, id); if (!hwlock) { pr_warn("hwspinlock %u does not exist\n", id); goto out; } /* sanity check (this shouldn't happen) */ WARN_ON(hwlock_to_id(hwlock) != id); /* make sure this hwspinlock is unused */ ret = radix_tree_tag_get(&hwspinlock_tree, id, HWSPINLOCK_UNUSED); if (ret == 0) { pr_warn("hwspinlock %u is already in use\n", id); hwlock = NULL; goto out; } /* mark as used and power up */ ret = __hwspin_lock_request(hwlock); if (ret < 0) hwlock = NULL; out: mutex_unlock(&hwspinlock_tree_lock); return hwlock; }
bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { struct page *xpage; void *val; _enter("%p,%p,%x", cookie, page, gfp); rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); if (!val) { rcu_read_unlock(); fscache_stat(&fscache_n_store_vmscan_not_storing); __fscache_uncache_page(cookie, page); return true; } /* */ if (radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG)) { rcu_read_unlock(); goto page_busy; } /* */ spin_lock(&cookie->stores_lock); rcu_read_unlock(); if (radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG)) { /* */ spin_unlock(&cookie->stores_lock); goto page_busy; } xpage = radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); if (xpage) { fscache_stat(&fscache_n_store_vmscan_cancelled); fscache_stat(&fscache_n_store_radix_deletes); ASSERTCMP(xpage, ==, page); } else {
/** * hwspin_lock_free() - free a specific hwspinlock * @hwlock: the specific hwspinlock to free * * This function mark @hwlock as free again. * Should only be called with an @hwlock that was retrieved from * an earlier call to omap_hwspin_lock_request{_specific}. * * Should be called from a process context (might sleep) * * Returns 0 on success, or an appropriate error code on failure */ int hwspin_lock_free(struct hwspinlock *hwlock) { struct device *dev = hwlock->bank->dev; struct hwspinlock *tmp; int ret; if (!hwlock) { pr_err("invalid hwlock\n"); return -EINVAL; } mutex_lock(&hwspinlock_tree_lock); /* make sure the hwspinlock is used */ ret = radix_tree_tag_get(&hwspinlock_tree, hwlock_to_id(hwlock), HWSPINLOCK_UNUSED); if (ret == 1) { dev_err(dev, "%s: hwlock is already free\n", __func__); dump_stack(); ret = -EINVAL; goto out; } /* notify the underlying device that power is not needed */ ret = pm_runtime_put(dev); if (ret < 0) goto out; /* mark this hwspinlock as available */ tmp = radix_tree_tag_set(&hwspinlock_tree, hwlock_to_id(hwlock), HWSPINLOCK_UNUSED); /* sanity check (this shouldn't happen) */ WARN_ON(tmp != hwlock); module_put(dev->driver->owner); out: mutex_unlock(&hwspinlock_tree_lock); return ret; }
/*
 * hwspin_lock_unregister_single() - remove one hwspinlock from the tree
 * @id: index of the hwspinlock to remove
 *
 * Under hwspinlock_tree_lock, checks the HWSPINLOCK_UNUSED tag for @id and,
 * if it is set, deletes the entry from hwspinlock_tree.
 *
 * Returns the deleted hwspinlock, or NULL if the lock is still in use, is
 * not present, or could not be deleted.
 */
static struct hwspinlock *hwspin_lock_unregister_single(unsigned int id)
{
	struct hwspinlock *hwlock = NULL;
	int ret;

	mutex_lock(&hwspinlock_tree_lock);

	/* make sure the hwspinlock is not in use (tag is set) */
	ret = radix_tree_tag_get(&hwspinlock_tree, id, HWSPINLOCK_UNUSED);
	if (ret == 0) {
		/* @id is unsigned, so print it with %u */
		pr_err("hwspinlock %u still in use (or not present)\n", id);
		goto out;
	}

	hwlock = radix_tree_delete(&hwspinlock_tree, id);
	if (!hwlock)
		pr_err("failed to delete hwspinlock %u\n", id);

out:
	mutex_unlock(&hwspinlock_tree_lock);
	return hwlock;
}
/*
 * Copy the block described by @bh into the page @to.
 *
 * The block is mapped with dax_map_atomic(), copied with copy_user_page()
 * through a kmap_atomic() mapping of @to, and then unmapped again.
 *
 * Returns 0 on success. If dax_map_atomic() fails, returns
 * PTR_ERR(dax.addr); presumably dax_map_atomic() stores an ERR_PTR there on
 * failure - confirm against its definition.
 */
static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * Sentinel sector value: dax_radix_entry() does not insert a new radix tree
 * entry when it is passed this as @sector.
 */
#define NO_SECTOR -1
/*
 * Mask a page index with (PMD_MASK >> PAGE_SHIFT); presumably this yields
 * the index of the first page of the PMD-sized range containing it.
 * NOTE(review): page_index is not parenthesized in the expansion; the
 * callers in this span only pass plain identifiers.
 */
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

/*
 * Record (and optionally dirty) a DAX entry for @index in the mapping's
 * radix tree.
 *
 * @mapping:   address space whose page_tree is updated
 * @index:     page index the entry is for
 * @sector:    sector to encode in the entry, or NO_SECTOR to only tag an
 *             existing entry
 * @pmd_entry: true to insert a PMD-type entry, false for a PTE-type entry
 * @dirty:     true to mark the inode dirty and tag the entry
 *             PAGECACHE_TAG_DIRTY
 *
 * All tree accesses are done under mapping->tree_lock. If a PMD entry
 * already covers @index, or a suitable entry already exists at @index, only
 * the dirty tagging is performed. An existing PTE entry is replaced when a
 * PMD entry is requested.
 *
 * Returns 0 on success, -EIO if an existing entry has an unexpected type, or
 * the error returned by radix_tree_insert().
 */
static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	/* PMD entries are only ever inserted dirty (see the comment below) */
	WARN_ON_ONCE(pmd_entry && !dirty);
	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	/* an existing PMD entry covering @index just needs to be tagged */
	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		/* the existing entry is already adequate; only tag it */
		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

/*
 * Write back the single DAX radix tree entry @entry found at @index.
 *
 * The entry is re-validated under mapping->tree_lock: it must still be
 * present at @index, be unchanged, and still carry PAGECACHE_TAG_TOWRITE.
 * If any of those checks fails the function returns 0 without doing any
 * work. Otherwise the range the entry describes is mapped, flushed with
 * wb_cache_pmem(), and the TOWRITE tag is cleared.
 *
 * Returns a negative errno on failure (-EIO for an unexpected entry type or
 * a short mapping, or the error from dax_map_atomic()). On the flush path
 * the non-negative value returned by dax_map_atomic() is passed back, so
 * callers should test for "< 0" rather than "!= 0".
 */
static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	/* the entry type decides how large a range has to be flushed */
	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	/* a mapping shorter than the entry's range is treated as an error */
	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	/* the flush is done; retake the lock only to clear the TOWRITE tag */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of
 * [wbc->range_start, wbc->range_end].  This is required by data integrity
 * operations to ensure file data is on persistent storage prior to
 * completion of the operation.
 *
 * Nothing is done (and 0 is returned) when the mapping has no exceptional
 * entries or when wbc->sync_mode is not WB_SYNC_ALL.
 *
 * Returns 0 on success, -EIO if the inode's block size is not PAGE_SIZE, or
 * the first negative value returned by dax_writeback_one().
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	/*
	 * NOTE(review): start_index is never advanced inside this loop, so
	 * forward progress relies on dax_writeback_one() clearing
	 * PAGECACHE_TAG_TOWRITE on the entries it is given.  It returns 0
	 * without clearing the tag when the slot no longer matches; confirm
	 * that this cannot make the lookup return the same batch forever.
	 */
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			/* entries past the end of the range end the scan */
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

/*
 * Install a mapping of the block described by @bh for the fault @vmf.
 *
 * With the i_mmap lock held for reading, this re-checks the file size, zeroes
 * the block if the buffer is unwritten or new, records the block in the radix
 * tree via dax_radix_entry() (dirty if this is a write fault) and finally
 * inserts the pfn into @vma with vm_insert_mixed().
 *
 * Returns 0 on success, -EIO if the fault offset is beyond i_size,
 * PTR_ERR(dax.addr) if dax_map_atomic() fails, or the error returned by
 * dax_radix_entry() or vm_insert_mixed().
 */
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	/* unwritten or newly allocated blocks are zeroed before being mapped */
	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed. This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
 *	not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 *
 * Returns a VM_FAULT_* code: VM_FAULT_SIGBUS, VM_FAULT_RETRY,
 * VM_FAULT_LOCKED (copy-on-write faults), VM_FAULT_OOM or VM_FAULT_NOPAGE
 * (the last three possibly ORed with VM_FAULT_MAJOR), or whatever
 * dax_load_hole() returns for a read fault on a hole.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/* faults beyond the end of the file get SIGBUS */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	/* describe one page worth of the file, starting at the fault offset */
	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			put_page(page);
			return VM_FAULT_RETRY;
		}
		/* the page left this mapping while we waited; look it up again */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	/* first look the block up without allocating (create == 0) */
	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			/* write fault on a hole: allocate and count a major fault */
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			/* read fault on a hole is handled by dax_load_hole() */
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		/* fill the private copy from the block, or zero it */
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			/*
			 * NOTE(review): on the VM_FAULT_LOCKED return below the
			 * i_mmap read lock taken here is still held; presumably
			 * the caller releases it - confirm.
			 */
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	/* drop any page cache page at this offset before mapping the block */
	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent. If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released. We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	/* translate the errno into a VM_FAULT_* code */
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	/* release the page lock and reference taken above, if any */
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	goto out;
}
/*
 * For every index i that for_each_index(i, base, order) visits, check that
 * tag 0 is clear and tag 1 is set on that index in `tree`.
 * NOTE(review): for_each_index is defined outside this view; presumably it
 * walks the indices covered by an entry of the given order starting at
 * base - confirm.
 */
for_each_index(i, base, order) {
	assert(!radix_tree_tag_get(&tree, i, 0));
	assert(radix_tree_tag_get(&tree, i, 1));
}