/* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; page = find_get_page(&swapper_space, entry.val); if (page) INC_CACHE_INFO(find_success); INC_CACHE_INFO(find_total); return page; }
int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; int werr = 0; struct page *page; struct inode *btree_inode = root->fs_info->btree_inode; u64 start = 0; u64 end; unsigned long index; while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, mark); if (ret) break; while (start <= end) { cond_resched(); index = start >> PAGE_CACHE_SHIFT; start = (u64)(index + 1) << PAGE_CACHE_SHIFT; page = find_get_page(btree_inode->i_mapping, index); if (!page) continue; btree_lock_page_hook(page); if (!page->mapping) { unlock_page(page); page_cache_release(page); continue; } if (PageWriteback(page)) { if (PageDirty(page)) wait_on_page_writeback(page); else { unlock_page(page); page_cache_release(page); continue; } } err = write_one_page(page, 0); if (err) werr = err; page_cache_release(page); } } if (err) werr = err; return werr; }
/* * Use the cached Readdirplus results in order to avoid a LOOKUP call * whenever we believe that the parent directory has not changed. * * We assume that any file creation/rename changes the directory mtime. * As this results in a page cache invalidation whenever it occurs, * we don't require any other tests for cache coherency. */ static int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr) { nfs_readdir_descriptor_t desc; struct nfs_server *server; struct nfs_entry entry; struct page *page; unsigned long timestamp; int res; if (!NFS_USE_READDIRPLUS(dir)) return -ENOENT; server = NFS_SERVER(dir); /* Don't use readdirplus unless the cache is stable */ if ((server->flags & NFS_MOUNT_NOAC) != 0 || nfs_caches_unstable(dir) || nfs_attribute_timeout(dir)) return -ENOENT; if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0) return -ENOENT; timestamp = NFS_I(dir)->readdir_timestamp; entry.fh = fh; entry.fattr = fattr; desc.decode = NFS_PROTO(dir)->decode_dirent; desc.entry = &entry; desc.page_index = 0; desc.plus = 1; for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) { res = -EIO; if (PageUptodate(page)) { void * kaddr = kmap_atomic(page, KM_USER0); desc.ptr = kaddr; res = find_dirent_name(&desc, page, dentry); kunmap_atomic(kaddr, KM_USER0); } page_cache_release(page); if (res == 0) goto out_found; if (res != -EAGAIN) break; } return -ENOENT; out_found: fattr->timestamp = timestamp; return 0; }
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; int err; do { found_page = find_get_page(&swapper_space, entry.val); if (found_page) break; if (!new_page) { new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; } err = radix_tree_preload(gfp_mask & GFP_KERNEL); if (err) break; err = swapcache_prepare(entry); if (err == -EEXIST) { radix_tree_preload_end(); continue; } if (err) { radix_tree_preload_end(); break; } __set_page_locked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); lru_cache_add_anon(new_page); swap_readpage(new_page); return new_page; } radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) page_cache_release(new_page); return found_page; }
/* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; int err; do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(&swapper_space, entry.val); if (found_page) break; /* * Get a new page to read into from swap. */ if (!new_page) { new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); if (!new_page) break; /* Out of memory */ } /* * Associate the page with swap entry in the swap cache. * May fail (-ENOENT) if swap entry has been freed since * our caller observed it. May fail (-EEXIST) if there * is already a page associated with this entry in the * swap cache: added by a racing read_swap_cache_async, * or by try_to_swap_out (or shmem_writepage) re-using * the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ err = add_to_swap_cache(new_page, entry); if (!err) { /* * Initiate read into locked page and return. */ lru_cache_add_active(new_page); swap_readpage(NULL, new_page); return new_page; } } while (err != -ENOENT && err != -ENOMEM); if (new_page) page_cache_release(new_page); return found_page; }
/* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); } INC_CACHE_INFO(find_total); return page; }
/* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) { unsigned char present = 0; struct page *page; /* * When tmpfs swaps out a page from a file, any process mapping that * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. <<<<<<< HEAD ======= <<<<<<< HEAD >>>>>>> ae1773bb70f3d7cf73324ce8fba787e01d8fa9f2 */ page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP /* shmem/tmpfs may return swap: account for swapcache page too. */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); page = find_get_page(&swapper_space, swap.val); } #endif <<<<<<< HEAD
/* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; page = find_get_page(swap_address_space(entry), entry.val); if (page) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); } INC_CACHE_INFO(find_total); return page; }
/* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ static unsigned char mincore_page(struct vm_area_struct * vma, unsigned long pgoff) { unsigned char present = 0; struct address_space * as = vma->vm_file->f_mapping; struct page * page; page = find_get_page(as, pgoff); if (page) { present = PageUptodate(page); page_cache_release(page); } return present; }
void do_generic_file_read(struct file *filp,unsigned int *ppos, read_descriptor_t *desc,int dumm){ struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; unsigned long index = *ppos >> PAGE_CACHE_SHIFT; unsigned long offset = *ppos & ~PAGE_CACHE_SHIFT; for(;;){ struct page *page = find_get_page(mapping,index); unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; if(index > end_index) break; unsigned long nr,ret; nr = PAGE_CACHE_SIZE; nr = nr - offset; mapping->a_ops->readpage(filp,page); } }
/* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *found; found = find_get_page(&swapper_space, entry.val); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from * the swap cache at this moment. find_lock_page would prevent * that, but no need to change: we _have_ got the right page. */ INC_CACHE_INFO(find_total); if (found) INC_CACHE_INFO(find_success); return found; }
/* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) { unsigned char present = 0; struct page *page; /* * When tmpfs swaps out a page from a file, any process mapping that * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. * * However when tmpfs moves the page from pagecache and into swapcache, * it is still in core, but the find_get_page below won't find it. * No big deal, but make a note of it. */ page = find_get_page(mapping, pgoff); if (page) { present = PageUptodate(page); page_cache_release(page); } return present; }
struct page *find_data_page(struct inode *inode, pgoff_t index) { struct address_space *mapping = inode->i_mapping; struct page *page; page = find_get_page(mapping, index); if (page && PageUptodate(page)) return page; f2fs_put_page(page, 0); page = get_read_data_page(inode, index, READ_SYNC); if (IS_ERR(page)) return page; if (PageUptodate(page)) return page; wait_on_page_locked(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } return page; }
/* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; int err; do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(&swapper_space, entry.val); if (found_page) break; /* * Get a new page to read into from swap. */ if (!new_page) { new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ } /* * call radix_tree_preload() while we can wait. */ err = radix_tree_preload(gfp_mask & GFP_KERNEL); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { /* seems racy */ radix_tree_preload_end(); continue; } if (err) { /* swp entry is obsolete ? */ radix_tree_preload_end(); break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); /* * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); swap_readpage(new_page); return new_page; } radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) page_cache_release(new_page); return found_page; }
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { struct page *found_page, *new_page = NULL; struct address_space *swapper_space = swap_address_space(entry); int err; *new_page_allocated = false; do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(swapper_space, entry.val); if (found_page) break; /* * Get a new page to read into from swap. */ if (!new_page) { new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ } /* * call radix_tree_preload() while we can wait. */ err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page * has not been brought into the swapcache yet, while * the other end is scheduled away waiting on discard * I/O completion at scan_swap_map(). * * In order to avoid turning this transitory state * into a permanent loop around this -EEXIST case * if !CONFIG_PREEMPT and the I/O completion happens * to be waiting on the CPU waitqueue where we are now * busy looping, we just conditionally invoke the * scheduler here, if there are some more important * tasks to run. */ cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ radix_tree_preload_end(); break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); /* * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ swapcache_free(entry); } while (err != -ENOMEM); if (new_page) put_page(new_page); return found_page; }
static int tux3_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct sb *sb = tux_sb(inode->i_sb); struct page *clone, *page = vmf->page; void *ptr; int ret; sb_start_pagefault(inode->i_sb); retry: down_read(&tux_inode(inode)->truncate_lock); lock_page(page); if (page->mapping != mapping(inode)) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; } /* * page fault can be happened while holding change_begin/end() * (e.g. copy of user data between ->write_begin and * ->write_end for write(2)). * * So, we use nested version here. */ change_begin_atomic_nested(sb, &ptr); /* * FIXME: Caller releases vmf->page (old_page) unconditionally. * So, this takes additional refcount to workaround it. */ if (vmf->page == page) page_cache_get(page); clone = pagefork_for_blockdirty(page, tux3_get_current_delta()); if (IS_ERR(clone)) { /* Someone did page fork */ pgoff_t index = page->index; change_end_atomic_nested(sb, ptr); unlock_page(page); page_cache_release(page); up_read(&tux_inode(inode)->truncate_lock); switch (PTR_ERR(clone)) { case -EAGAIN: page = find_get_page(inode->i_mapping, index); assert(page); goto retry; case -ENOMEM: ret = VM_FAULT_OOM; break; default: ret = VM_FAULT_SIGBUS; break; } goto out; } file_update_time(vma->vm_file); /* Assign buffers to dirty */ if (!page_has_buffers(clone)) create_empty_buffers(clone, sb->blocksize, 0); /* * We mark the page dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty page and writeprotect it again. */ tux3_set_page_dirty(clone); #if 1 /* FIXME: Caller doesn't see the changed vmf->page */ vmf->page = clone; change_end_atomic_nested(sb, ptr); /* FIXME: caller doesn't know about pagefork */ unlock_page(clone); page_cache_release(clone); ret = 0; // ret = VM_FAULT_LOCKED; #endif out: up_read(&tux_inode(inode)->truncate_lock); sb_end_pagefault(inode->i_sb); return ret; }
static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); vto = kmap_atomic(to); copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); dax_unmap_atomic(bdev, &dax); return 0; } #define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) static int dax_radix_entry(struct address_space *mapping, pgoff_t index, sector_t sector, bool pmd_entry, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; pgoff_t pmd_index = DAX_PMD_INDEX(index); int type, error = 0; void *entry; WARN_ON_ONCE(pmd_entry && !dirty); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); entry = radix_tree_lookup(page_tree, pmd_index); if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { index = pmd_index; goto dirty; } entry = radix_tree_lookup(page_tree, index); if (entry) { type = RADIX_DAX_TYPE(entry); if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { error = -EIO; goto unlock; } if (!pmd_entry || type == RADIX_DAX_PMD) goto dirty; /* * We only insert dirty PMD entries into the radix tree. This * means we don't need to worry about removing a dirty PTE * entry and inserting a clean PMD entry, thus reducing the * range we would flush with a follow-up fsync/msync call. */ radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; } if (sector == NO_SECTOR) { /* * This can happen during correct operation if our pfn_mkwrite * fault raced against a hole punch operation. If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by * the time we are called, but the call will still happen. We * will return all the way up to wp_pfn_shared(), where the * pte_same() check will fail, eventually causing page fault * to be retried by the CPU. */ goto unlock; } error = radix_tree_insert(page_tree, index, RADIX_DAX_ENTRY(sector, pmd_entry)); if (error) goto unlock; mapping->nrexceptional++; dirty: if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); return error; } static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; int ret = 0; spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; /* another fsync thread may have already written back this entry */ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { ret = -EIO; goto unlock; } dax.sector = RADIX_DAX_SECTOR(entry); dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); if (ret < 0) return ret; if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } wb_cache_pmem(dax.addr, dax.size); spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); return ret; unlock: spin_unlock_irq(&mapping->tree_lock); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); entry = radix_tree_lookup(&mapping->page_tree, pmd_index); rcu_read_unlock(); /* see if the start of our range is covered by a PMD entry */ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, pvec.pages, indices); if (pvec.nr == 0) break; for (i = 0; i < pvec.nr; i++) { if (indices[i] > end_index) { done = true; break; } ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) return ret; } } wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; pgoff_t size; int error; i_mmap_lock_read(mapping); /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the * file. We can't tell the filesystem to free it because we can't * take i_mutex here. In the worst case, the file still has blocks * allocated past the end of the file. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { error = -EIO; goto out; } if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, vmf->flags & FAULT_FLAG_WRITE); if (error) goto out; error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); return error; } /** * __dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * @complete_unwritten: The filesystem method used to convert unwritten blocks * to written so the data written to them is exposed. This is required for * required by write faults for filesystems that will return unwritten * extent mappings from @get_block, but it is optional for reads as * dax_insert_mapping() will always zero unwritten blocks. If the fs does * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; int error; int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { put_page(page); return VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { /* * We have a struct page covering a hole in the file * from a read fault and we've raced with a truncate */ error = -EIO; goto unlock_page; } } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) goto unlock_page; } else { return dax_load_hole(mapping, page, vmf); } } if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) goto unlock_page; vmf->page = page; if (!page) { i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); error = -EIO; goto out; } } return VM_FAULT_LOCKED; } /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, PAGE_SIZE, 0); delete_from_page_cache(page); unlock_page(page); put_page(page); page = NULL; } /* * If we successfully insert the new mapping over an unwritten extent, * we need to ensure we convert the unwritten extent. If there is an * error inserting the mapping, the filesystem needs to leave it as * unwritten to prevent exposure of the stale underlying data to * userspace, but we still need to call the completion function so * the private resources on the mapping buffer can be released. We * indicate what the callback should do via the uptodate variable, same * as for normal BH based IO completions. */ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) complete_unwritten(&bh, !error); else WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; unlock_page: if (page) { unlock_page(page); put_page(page); } goto out; }
/* * zswap_get_swap_cache_page * * This is an adaption of read_swap_cache_async() * * This function tries to find a page with the given swap entry * in the swapper_space address space (the swap cache). If the page * is found, it is returned in retpage. Otherwise, a page is allocated, * added to the swap cache, and returned in retpage. * * If success, the swap cache page is returned in retpage * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, * the new page is added to swapcache and locked * Returns ZSWAP_SWAPCACHE_FAIL on error */ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(swapper_space, entry.val); if (found_page) break; /* * Get a new page to read into from swap. */ if (!new_page) { new_page = alloc_page(GFP_KERNEL); if (!new_page) break; /* Out of memory */ } /* * call radix_tree_preload() while we can wait. */ err = radix_tree_preload(GFP_KERNEL); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { /* seems racy */ radix_tree_preload_end(); continue; } if (err) { /* swp entry is obsolete ? */ radix_tree_preload_end(); break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); lru_cache_add_anon(new_page); *retpage = new_page; return ZSWAP_SWAPCACHE_NEW; } radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) page_cache_release(new_page); if (!found_page) return ZSWAP_SWAPCACHE_FAIL; *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; }
static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages; struct page *pages[PIPE_BUFFERS]; struct partial_page partial[PIPE_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; size_t total_len; int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .flags = flags, .ops = &page_cache_pipe_buf_ops, }; index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; /* * Initiate read-ahead on this page range. however, don't call into * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ if (!loff || nr_pages > 1) page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* * Now fill in the holes: */ error = 0; total_len = 0; /* * Lookup the (hopefully) full range of pages we need. */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); /* * If find_get_pages_contig() returned fewer pages than we needed, * allocate the rest. */ index += spd.nr_pages; while (spd.nr_pages < nr_pages) { /* * Page could be there, find_get_pages_contig() breaks on * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* * Make sure the read-ahead engine is notified * about this failure. */ handle_ra_miss(mapping, &in->f_ra, index); /* * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) continue; break; } /* * add_to_page_cache() locks the page, unlock it * to avoid convoluting the logic below even more. */ unlock_page(page); } pages[spd.nr_pages++] = page; index++; } /* * Now loop over the map and see if we need to start IO on any * pages, fill in the partial map, etc. */ index = *ppos >> PAGE_CACHE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; for (page_nr = 0; page_nr < nr_pages; page_nr++) { unsigned int this_len; if (!len) break; /* * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); page = pages[page_nr]; /* * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { /* * If in nonblock mode then dont block on waiting * for an in-flight io page */ if (flags & SPLICE_F_NONBLOCK) break; lock_page(page); /* * page was truncated, stop here. if this isn't the * first page, we'll just complete what we already * added */ if (!page->mapping) { unlock_page(page); break; } /* * page was already under io and is now done, great */ if (PageUptodate(page)) { unlock_page(page); goto fill_it; } /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { /* * We really should re-lookup the page here, * but it complicates things a lot. Instead * lets just do what we already stored, and * we'll get it the next time we are called. */ if (error == AOP_TRUNCATED_PAGE) error = 0; break; } /* * i_size must be checked after ->readpage(). */ isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) break; /* * if this is the last page, see if we need to shrink * the length and stop */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); if (total_len + loff > isize) break; /* * force quit after adding this page */ len = this_len; this_len = min(this_len, loff); loff = 0; } } fill_it: partial[page_nr].offset = loff; partial[page_nr].len = this_len; len -= this_len; total_len += this_len; loff = 0; spd.nr_pages++; index++; } /* * Release any pages at the end, if we quit early. 'i' is how far * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) page_cache_release(pages[page_nr++]); if (spd.nr_pages) return splice_to_pipe(pipe, &spd); return error; }
#ifdef CONFIG_SWAP /* shmem/tmpfs may return swap: account for swapcache page too. */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); page = find_get_page(&swapper_space, swap.val); } #endif <<<<<<< HEAD ======= ======= * * However when tmpfs moves the page from pagecache and into swapcache, * it is still in core, but the find_get_page below won't find it. * No big deal, but make a note of it. */ page = find_get_page(mapping, pgoff); >>>>>>> 58a75b6a81be54a8b491263ca1af243e9d8617b9 >>>>>>> ae1773bb70f3d7cf73324ce8fba787e01d8fa9f2 if (page) { present = PageUptodate(page); page_cache_release(page); } return present; } static void mincore_unmapped_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, unsigned char *vec) { unsigned long nr = (end - addr) >> PAGE_SHIFT;
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { struct page *found_page, *new_page = NULL; struct address_space *swapper_space = swap_address_space(entry); int err; *new_page_allocated = false; do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(swapper_space, swp_offset(entry)); if (found_page) break; /* * Just skip read ahead for unused swap slot. * During swap_off when swap_slot_cache is disabled, * we have to handle the race between putting * swap entry in swap cache and marking swap slot * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) break; /* * Get a new page to read into from swap. */ if (!new_page) { new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ } /* * call radix_tree_preload() while we can wait. */ err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page * has not been brought into the swapcache yet. */ cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ radix_tree_preload_end(); break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); /* * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) put_page(new_page); return found_page; }
/* * zcache_get_swap_cache_page * * This is an adaption of read_swap_cache_async() * * If success, page is returned in retpage * Returns 0 if page was already in the swap cache, page is not locked * Returns 1 if the new page needs to be populated, page is locked */ static int zcache_get_swap_cache_page(int type, pgoff_t offset, struct page *new_page) { struct page *found_page; swp_entry_t entry = swp_entry(type, offset); int err; BUG_ON(new_page == NULL); do { /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ found_page = find_get_page(&swapper_space, entry.val); if (found_page) return 0; /* * call radix_tree_preload() while we can wait. */ err = radix_tree_preload(GFP_KERNEL); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { /* seems racy */ radix_tree_preload_end(); continue; } if (err) { /* swp entry is obsolete ? */ radix_tree_preload_end(); break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); lru_cache_add_anon(new_page); return 1; } radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ swapcache_free(entry, NULL); /* FIXME: is it possible to get here without err==-ENOMEM? * If not, we can dispense with the do loop, use goto retry */ } while (err != -ENOMEM); return -ENOMEM; }