示例#1
0
文件: page.c 项目: raoy1990/linux
/*
 * decide whether a page can be released, possibly by cancelling a store to it
 * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
 */
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
				  struct page *page,
				  gfp_t gfp)
{
	struct page *xpage;
	void *val;

	_enter("%p,%p,%x", cookie, page, gfp);

try_again:
	rcu_read_lock();
	val = radix_tree_lookup(&cookie->stores, page->index);
	if (!val) {
		rcu_read_unlock();
		fscache_stat(&fscache_n_store_vmscan_not_storing);
		__fscache_uncache_page(cookie, page);
		return true;
	}

	/* see if the page is actually undergoing storage - if so we can't get
	 * rid of it till the cache has finished with it */
	if (radix_tree_tag_get(&cookie->stores, page->index,
			       FSCACHE_COOKIE_STORING_TAG)) {
		rcu_read_unlock();
		goto page_busy;
	}

	/* the page is pending storage, so we attempt to cancel the store and
	 * discard the store request so that the page can be reclaimed */
	spin_lock(&cookie->stores_lock);
	rcu_read_unlock();

	if (radix_tree_tag_get(&cookie->stores, page->index,
			       FSCACHE_COOKIE_STORING_TAG)) {
		/* the page started to undergo storage whilst we were looking,
		 * so now we can only wait or return */
		spin_unlock(&cookie->stores_lock);
		goto page_busy;
	}

	xpage = radix_tree_delete(&cookie->stores, page->index);
	spin_unlock(&cookie->stores_lock);

	if (xpage) {
		fscache_stat(&fscache_n_store_vmscan_cancelled);
		fscache_stat(&fscache_n_store_radix_deletes);
		ASSERTCMP(xpage, ==, page);
	} else {
/**
 * hwspin_lock_request_specific() - request for a specific hwspinlock
 * @id: index of the specific hwspinlock that is requested
 *
 * This function should be called by users of the hwspinlock module,
 * in order to assign them a specific hwspinlock.
 * Usually early board code will be calling this function in order to
 * reserve specific hwspinlock ids for predefined purposes.
 *
 * Should be called from a process context (might sleep)
 *
 * Returns the address of the assigned hwspinlock, or NULL on error
 */
struct hwspinlock *hwspin_lock_request_specific(unsigned int id)
{
	struct hwspinlock *hwlock;
	int ret;

	mutex_lock(&hwspinlock_tree_lock);

	/* make sure this hwspinlock exists */
	hwlock = radix_tree_lookup(&hwspinlock_tree, id);
	if (!hwlock) {
		pr_warn("hwspinlock %u does not exist\n", id);
		goto out;
	}

	/* sanity check (this shouldn't happen) */
	WARN_ON(hwlock_to_id(hwlock) != id);

	/* make sure this hwspinlock is unused */
	ret = radix_tree_tag_get(&hwspinlock_tree, id, HWSPINLOCK_UNUSED);
	if (ret == 0) {
		pr_warn("hwspinlock %u is already in use\n", id);
		hwlock = NULL;
		goto out;
	}

	/* mark as used and power up */
	ret = __hwspin_lock_request(hwlock);
	if (ret < 0)
		hwlock = NULL;

out:
	mutex_unlock(&hwspinlock_tree_lock);
	return hwlock;
}
示例#3
0
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
				  struct page *page,
				  gfp_t gfp)
{
	struct page *xpage;
	void *val;

	_enter("%p,%p,%x", cookie, page, gfp);

	rcu_read_lock();
	val = radix_tree_lookup(&cookie->stores, page->index);
	if (!val) {
		rcu_read_unlock();
		fscache_stat(&fscache_n_store_vmscan_not_storing);
		__fscache_uncache_page(cookie, page);
		return true;
	}

	/*                                                                    
                                                  */
	if (radix_tree_tag_get(&cookie->stores, page->index,
			       FSCACHE_COOKIE_STORING_TAG)) {
		rcu_read_unlock();
		goto page_busy;
	}

	/*                                                                   
                                                                */
	spin_lock(&cookie->stores_lock);
	rcu_read_unlock();

	if (radix_tree_tag_get(&cookie->stores, page->index,
			       FSCACHE_COOKIE_STORING_TAG)) {
		/*                                                            
                                       */
		spin_unlock(&cookie->stores_lock);
		goto page_busy;
	}

	xpage = radix_tree_delete(&cookie->stores, page->index);
	spin_unlock(&cookie->stores_lock);

	if (xpage) {
		fscache_stat(&fscache_n_store_vmscan_cancelled);
		fscache_stat(&fscache_n_store_radix_deletes);
		ASSERTCMP(xpage, ==, page);
	} else {
/**
 * hwspin_lock_free() - free a specific hwspinlock
 * @hwlock: the specific hwspinlock to free
 *
 * This function mark @hwlock as free again.
 * Should only be called with an @hwlock that was retrieved from
 * an earlier call to omap_hwspin_lock_request{_specific}.
 *
 * Should be called from a process context (might sleep)
 *
 * Returns 0 on success, or an appropriate error code on failure
 */
int hwspin_lock_free(struct hwspinlock *hwlock)
{
	struct device *dev = hwlock->bank->dev;
	struct hwspinlock *tmp;
	int ret;

	if (!hwlock) {
		pr_err("invalid hwlock\n");
		return -EINVAL;
	}

	mutex_lock(&hwspinlock_tree_lock);

	/* make sure the hwspinlock is used */
	ret = radix_tree_tag_get(&hwspinlock_tree, hwlock_to_id(hwlock),
							HWSPINLOCK_UNUSED);
	if (ret == 1) {
		dev_err(dev, "%s: hwlock is already free\n", __func__);
		dump_stack();
		ret = -EINVAL;
		goto out;
	}

	/* notify the underlying device that power is not needed */
	ret = pm_runtime_put(dev);
	if (ret < 0)
		goto out;

	/* mark this hwspinlock as available */
	tmp = radix_tree_tag_set(&hwspinlock_tree, hwlock_to_id(hwlock),
							HWSPINLOCK_UNUSED);

	/* sanity check (this shouldn't happen) */
	WARN_ON(tmp != hwlock);

	module_put(dev->driver->owner);

out:
	mutex_unlock(&hwspinlock_tree_lock);
	return ret;
}
static struct hwspinlock *hwspin_lock_unregister_single(unsigned int id)
{
	struct hwspinlock *hwlock = NULL;
	int ret;

	mutex_lock(&hwspinlock_tree_lock);

	/* make sure the hwspinlock is not in use (tag is set) */
	ret = radix_tree_tag_get(&hwspinlock_tree, id, HWSPINLOCK_UNUSED);
	if (ret == 0) {
		pr_err("hwspinlock %d still in use (or not present)\n", id);
		goto out;
	}

	hwlock = radix_tree_delete(&hwspinlock_tree, id);
	if (!hwlock) {
		pr_err("failed to delete hwspinlock %d\n", id);
		goto out;
	}

out:
	mutex_unlock(&hwspinlock_tree_lock);
	return hwlock;
}
示例#6
0
文件: dax.c 项目: 020gzh/linux
static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed. This is required for
 *	required by write faults for filesystems that will return unwritten
 *	extent mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
 *	not support unwritten extents, the it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			put_page(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent. If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released. We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	goto out;
}
示例#7
0
	for_each_index(i, base, order) {
		assert(!radix_tree_tag_get(&tree, i, 0));
		assert(radix_tree_tag_get(&tree, i, 1));
	}