/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode:	inode for which i_size was extended
 * @from:	original inode size
 * @to:		new inode size
 *
 * Handle extension of inode size either caused by extending truncate or by
 * write starting after current i_size. We mark the page straddling current
 * i_size RO so that page_mkwrite() is called on the nearest write access to
 * the page. This way filesystem can be sure that page_mkwrite() is called on
 * the page before user writes to the page via mmap after the i_size has been
 * changed.
 *
 * The function must be called after i_size is updated so that page fault
 * coming after we unlock the page will already see the new i_size.
 * The function must be called while we still hold i_mutex - this not only
 * makes sure i_size is stable but also that userspace cannot observe new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
	int bsize = 1 << inode->i_blkbits;
	loff_t rounded_from;
	struct page *page;
	pgoff_t index;

	WARN_ON(to > inode->i_size);

	if (from >= to || bsize == PAGE_CACHE_SIZE)
		return;
	/* Page straddling @from will not have any hole block created? */
	rounded_from = round_up(from, bsize);
	if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
		return;

	index = from >> PAGE_CACHE_SHIFT;
	page = find_lock_page(inode->i_mapping, index);
	/* Page not cached? Nothing to do */
	if (!page)
		return;
	/*
	 * See clear_page_dirty_for_io() for details why set_page_dirty()
	 * is needed.
	 */
	if (page_mkclean(page))
		set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
}
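A minimal sketch of how a filesystem extending i_size might use the helper above; myfs_extend_isize is a hypothetical function, and the i_mutex serialization is assumed to be provided by the caller, as the comment requires.

/* Hypothetical example, not taken from any particular filesystem: extend
 * i_size under i_mutex, then let pagecache_isize_extended() write-protect
 * the page straddling the old size so page_mkwrite() runs before further
 * mmap stores land past the old i_size. */
static void myfs_extend_isize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;	/* stable: i_mutex is held */

	i_size_write(inode, newsize);
	pagecache_isize_extended(inode, oldsize, newsize);
}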
/**
 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
 * @dmap: destination page cache
 * @smap: source page cache
 *
 * No pages must be added to the cache during this process.
 * This must be ensured by the caller.
 */
void nilfs_copy_back_pages(struct address_space *dmap,
			   struct address_space *smap)
{
	struct pagevec pvec;
	unsigned int i, n;
	pgoff_t index = 0;
	int err;

	pagevec_init(&pvec);
repeat:
	n = pagevec_lookup(&pvec, smap, &index);
	if (!n)
		return;

	for (i = 0; i < pagevec_count(&pvec); i++) {
		struct page *page = pvec.pages[i], *dpage;
		pgoff_t offset = page->index;

		lock_page(page);
		dpage = find_lock_page(dmap, offset);
		if (dpage) {
			/* override existing page on the destination cache */
			WARN_ON(PageDirty(dpage));
			nilfs_copy_page(dpage, page, 0);
			unlock_page(dpage);
			put_page(dpage);
		} else {
			struct page *page2;

			/* move the page to the destination cache */
			spin_lock_irq(&smap->tree_lock);
			page2 = radix_tree_delete(&smap->page_tree, offset);
			WARN_ON(page2 != page);

			smap->nrpages--;
			spin_unlock_irq(&smap->tree_lock);

			spin_lock_irq(&dmap->tree_lock);
			err = radix_tree_insert(&dmap->page_tree, offset, page);
			if (unlikely(err < 0)) {
				WARN_ON(err == -EEXIST);
				page->mapping = NULL;
				put_page(page); /* for cache */
			} else {
				page->mapping = dmap;
				dmap->nrpages++;
				if (PageDirty(page))
					radix_tree_tag_set(&dmap->page_tree,
							   offset,
							   PAGECACHE_TAG_DIRTY);
			}
			spin_unlock_irq(&dmap->tree_lock);
		}
		unlock_page(page);
	}
	pagevec_release(&pvec);
	cond_resched();

	goto repeat;
}
static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
		size_t nr_pages)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct bio *bio;
	struct page *page;
	struct request_queue *q = bdev_get_queue(sb->s_bdev);
	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
	int i;

	if (max_pages > BIO_MAX_PAGES)
		max_pages = BIO_MAX_PAGES;
	bio = bio_alloc(GFP_NOFS, max_pages);
	BUG_ON(!bio);

	for (i = 0; i < nr_pages; i++) {
		if (i >= max_pages) {
			/* Block layer cannot split bios :( */
			bio->bi_vcnt = i;
			bio->bi_idx = 0;
			bio->bi_size = i * PAGE_SIZE;
			bio->bi_bdev = super->s_bdev;
			bio->bi_sector = ofs >> 9;
			bio->bi_private = sb;
			bio->bi_end_io = writeseg_end_io;
			atomic_inc(&super->s_pending_writes);
			submit_bio(WRITE, bio);

			ofs += i * PAGE_SIZE;
			index += i;
			nr_pages -= i;
			i = 0;

			bio = bio_alloc(GFP_NOFS, max_pages);
			BUG_ON(!bio);
		}
		page = find_lock_page(mapping, index + i);
		BUG_ON(!page);
		bio->bi_io_vec[i].bv_page = page;
		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
		bio->bi_io_vec[i].bv_offset = 0;

		BUG_ON(PageWriteback(page));
		set_page_writeback(page);
		unlock_page(page);
	}
struct buffer_head *nilfs_grab_buffer(struct inode *inode,
				      struct address_space *mapping,
				      unsigned long blkoff,
				      unsigned long b_state)
{
	int blkbits = inode->i_blkbits;
	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
	struct page *page, *opage;
	struct buffer_head *bh, *obh;

	page = grab_cache_page(mapping, index);
	if (unlikely(!page))
		return NULL;

	bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
	if (unlikely(!bh)) {
		unlock_page(page);
		page_cache_release(page);
		return NULL;
	}
	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
		/*
		 * Shadow page cache uses assoc_mapping to point to its
		 * original page cache. The following code tries the original
		 * cache if the given cache is a shadow and it didn't hit.
		 */
		opage = find_lock_page(mapping->assoc_mapping, index);
		if (!opage)
			return bh;

		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
					     b_state);
		if (buffer_uptodate(obh)) {
			nilfs_copy_buffer(bh, obh);
			if (buffer_dirty(obh)) {
				nilfs_mark_buffer_dirty(bh);
				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
					nilfs_mdt_mark_dirty(inode);
			}
		}
		brelse(obh);
		unlock_page(opage);
		page_cache_release(opage);
	}
	return bh;
}
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
	struct address_space *mapping = gl->gl_aspace->i_mapping;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct page *page;
	struct buffer_head *bh;
	unsigned int shift;
	unsigned long index;
	unsigned int bufnum;

	shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
	index = blkno >> shift;             /* convert block to page */
	bufnum = blkno - (index << shift);  /* block buf index within page */

	if (create) {
		for (;;) {
			page = grab_cache_page(mapping, index);
			if (page)
				break;
			yield();
		}
	} else {
		page = find_lock_page(mapping, index);
		if (!page)
			return NULL;
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);

	/* Locate header for our buffer within our page */
	for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
		/* Do nothing */;
	get_bh(bh);

	if (!buffer_mapped(bh))
		map_bh(bh, sdp->sd_vfs, blkno);

	unlock_page(page);
	mark_page_accessed(page);
	page_cache_release(page);

	return bh;
}
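A hedged caller sketch for the lookup path above: with create == 0 the function may return NULL when the page is not cached, and on success the buffer head carries its own reference, so the caller only needs brelse(). The helper name is hypothetical, not part of gfs2.

/* Hypothetical caller sketch: peek at an existing buffer without creating
 * it, then drop the reference taken by gfs2_getbuf(). */
static void example_peek_buffer(struct gfs2_glock *gl, u64 blkno)
{
	struct buffer_head *bh = gfs2_getbuf(gl, blkno, 0);	/* create == 0 */

	if (!bh)
		return;		/* page not cached - nothing to inspect */
	/* ... examine bh->b_data ... */
	brelse(bh);
}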
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

#ifdef SWAP_CACHE_INFO
	swap_cache_find_total++;
#endif
	while (1) {
		/*
		 * Right now the pagecache is 32-bit only. But it's a 32 bit index. =)
		 */
repeat:
		found = find_lock_page(&swapper_space, entry.val);
		if (!found)
			return 0;
		/*
		 * Though the "found" page was in the swap cache an instant
		 * earlier, it might have been removed by refill_inactive etc.
		 * Re-search ... Since find_lock_page grabs a reference on
		 * the page, it can not be reused for anything else, namely
		 * it can not be associated with another swap handle, so it
		 * is enough to check whether the page is still in the swap
		 * cache.
		 */
		if (!PageSwapCache(found)) {
			UnlockPage(found);
			page_cache_release(found);
			goto repeat;
		}
		if (found->mapping != &swapper_space)
			goto out_bad;
#ifdef SWAP_CACHE_INFO
		swap_cache_find_success++;
#endif
		UnlockPage(found);
		return found;
	}

out_bad:
	printk(KERN_ERR "VM: Found a non-swapper swap page!\n");
	UnlockPage(found);
	page_cache_release(found);
	return 0;
}
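A minimal sketch of the calling convention assumed by the 2.4-era function above: the page comes back unlocked but with the extra reference taken by find_lock_page(), which the caller must drop.

/* Hypothetical 2.4-era caller sketch, not taken from the kernel: */
struct page *page = lookup_swap_cache(entry);
if (page) {
	/* ... use the swap-cache page ... */
	page_cache_release(page);
}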
static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
			struct ceph_readdir_cache_control *cache_ctl)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
	loff_t ptr_pos = idx * sizeof(struct dentry *);
	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

	if (ptr_pos >= i_size_read(dir))
		return NULL;

	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
		ceph_readdir_cache_release(cache_ctl);
		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
		if (!cache_ctl->page) {
			dout(" page %lu not found\n", ptr_pgoff);
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		 * i_mutex, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}

	cache_ctl->index = idx & idx_mask;

	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_mutex. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
		dentry = NULL;
	spin_unlock(&parent->d_lock);
	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
		dentry = NULL;
	rcu_read_unlock();
	return dentry ? : ERR_PTR(-EAGAIN);
}
static void write_wbuf(struct super_block *sb, struct logfs_area *area,
		void *wbuf)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	u64 ofs;
	pgoff_t index;
	int page_ofs;
	struct page *page;

	ofs = dev_ofs(sb, area->a_segno,
			area->a_used_bytes & ~(super->s_writesize - 1));
	index = ofs >> PAGE_SHIFT;
	page_ofs = ofs & (PAGE_SIZE - 1);

	page = find_lock_page(mapping, index);
	BUG_ON(!page);
	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
	unlock_page(page);
}
struct page *ufs_get_locked_page(struct address_space *mapping,
				 pgoff_t index)
{
	struct page *page;

	page = find_lock_page(mapping, index);
	if (!page) {
		page = read_mapping_page(mapping, index, NULL);

		if (IS_ERR(page)) {
			printk(KERN_ERR "ufs_change_blocknr: "
			       "read_mapping_page error: ino %lu, index: %lu\n",
			       mapping->host->i_ino, index);
			goto out;
		}

		lock_page(page);

		if (unlikely(page->mapping == NULL)) {
			/* Truncate got there first */
			unlock_page(page);
			put_page(page);
			page = NULL;
			goto out;
		}

		if (!PageUptodate(page) || PageError(page)) {
			unlock_page(page);
			put_page(page);

			printk(KERN_ERR "ufs_change_blocknr: "
			       "can not read page: ino %lu, index: %lu\n",
			       mapping->host->i_ino, index);

			page = ERR_PTR(-EIO);
		}
	}
out:
	return page;
}
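A hedged caller sketch for the helper above: on success the page is returned locked, so the caller must unlock and release it, and it must distinguish NULL (truncated away) from an ERR_PTR (read error). The function name is hypothetical, not the real ufs_change_blocknr.

/* Hypothetical caller sketch: */
static int example_touch_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = ufs_get_locked_page(mapping, index);

	if (!page)
		return 0;		/* truncated away - nothing to do */
	if (IS_ERR(page))
		return PTR_ERR(page);	/* read error */

	/* ... modify the locked, uptodate page ... */

	unlock_page(page);
	put_page(page);
	return 0;
}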
/*
 * Read a directory, using filldir to fill the dirent memory.
 * smb_proc_readdir does the actual reading from the smb server.
 *
 * The cache code is almost directly taken from ncpfs
 */
static int
smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *dir = dentry->d_inode;
	struct smb_sb_info *server = server_from_dentry(dentry);
	union smb_dir_cache *cache = NULL;
	struct smb_cache_control ctl;
	struct page *page = NULL;
	int result;

	ctl.page  = NULL;
	ctl.cache = NULL;

	VERBOSE("reading %s/%s, f_pos=%d\n",
		DENTRY_PATH(dentry), (int) filp->f_pos);

	result = 0;

	lock_kernel();

	switch ((unsigned int) filp->f_pos) {
	case 0:
		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
			goto out;
		filp->f_pos = 1;
		/* fallthrough */
	case 1:
		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
			goto out;
		filp->f_pos = 2;
	}

	/*
	 * Make sure our inode is up-to-date.
	 */
	result = smb_revalidate_inode(dentry);
	if (result)
		goto out;

	page = grab_cache_page(&dir->i_data, 0);
	if (!page)
		goto read_really;

	ctl.cache = cache = kmap(page);
	ctl.head  = cache->head;

	if (!PageUptodate(page) || !ctl.head.eof) {
		VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
			 DENTRY_PATH(dentry), PageUptodate(page), ctl.head.eof);
		goto init_cache;
	}

	if (filp->f_pos == 2) {
		if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
			goto init_cache;

		/*
		 * N.B. ncpfs checks mtime of dentry too here, we don't.
		 *   1. common smb servers do not update mtime on dir changes
		 *   2. it requires an extra smb request
		 *      (revalidate has the same timeout as ctl.head.time)
		 *
		 * Instead smbfs invalidates its own cache on local changes
		 * and remote changes are not seen until timeout.
		 */
	}

	if (filp->f_pos > ctl.head.end)
		goto finished;

	ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
	ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
	ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;

	for (;;) {
		if (ctl.ofs != 0) {
			ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
			if (!ctl.page)
				goto invalid_cache;
			ctl.cache = kmap(ctl.page);
			if (!PageUptodate(ctl.page))
				goto invalid_cache;
		}
		while (ctl.idx < SMB_DIRCACHE_SIZE) {
			struct dentry *dent;
			int res;

			dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
					     dentry, filp->f_pos);
			if (!dent)
				goto invalid_cache;

			res = filldir(dirent, dent->d_name.name,
				      dent->d_name.len, filp->f_pos,
				      dent->d_inode->i_ino, DT_UNKNOWN);
			dput(dent);
			if (res)
				goto finished;
			filp->f_pos += 1;
			ctl.idx += 1;
			if (filp->f_pos > ctl.head.end)
				goto finished;
		}
		if (ctl.page) {
			kunmap(ctl.page);
			SetPageUptodate(ctl.page);
			unlock_page(ctl.page);
			page_cache_release(ctl.page);
			ctl.page = NULL;
		}
		ctl.idx  = 0;
		ctl.ofs += 1;
	}
invalid_cache:
	if (ctl.page) {
		kunmap(ctl.page);
		unlock_page(ctl.page);
		page_cache_release(ctl.page);
		ctl.page = NULL;
	}
	ctl.cache = cache;
init_cache:
	smb_invalidate_dircache_entries(dentry);
	ctl.head.time = jiffies;
	ctl.head.eof = 0;
	ctl.fpos = 2;
	ctl.ofs = 0;
	ctl.idx = SMB_DIRCACHE_START;
	ctl.filled = 0;
	ctl.valid  = 1;
read_really:
	result = server->ops->readdir(filp, dirent, filldir, &ctl);
	if (result == -ERESTARTSYS && page)
		ClearPageUptodate(page);
	if (ctl.idx == -1)
		goto invalid_cache;	/* retry */
	ctl.head.end = ctl.fpos - 1;
	ctl.head.eof = ctl.valid;
finished:
	if (page) {
		cache->head = ctl.head;
		kunmap(page);
		if (result != -ERESTARTSYS)
			SetPageUptodate(page);
		unlock_page(page);
		page_cache_release(page);
	}
	if (ctl.page) {
		kunmap(ctl.page);
		SetPageUptodate(ctl.page);
		unlock_page(ctl.page);
		page_cache_release(ctl.page);
	}
out:
	unlock_kernel();
	return result;
}
/*
 * This function was originally taken from fs/mpage.c, and customized for f2fs.
 * Major change was from block_size == page_size in f2fs by default.
 */
static int f2fs_mpage_readpages(struct address_space *mapping,
			struct list_head *pages, struct page *page,
			unsigned nr_pages)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	sector_t last_block_in_bio = 0;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t block_nr;
	struct block_device *bdev = inode->i_sb->s_bdev;
	struct f2fs_map_blocks map;

	map.m_pblk = 0;
	map.m_lblk = 0;
	map.m_len = 0;
	map.m_flags = 0;

	for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {

		prefetchw(&page->flags);
		if (pages) {
			page = list_entry(pages->prev, struct page, lru);
			list_del(&page->lru);
			if (add_to_page_cache_lru(page, mapping,
						  page->index, GFP_KERNEL))
				goto next_page;
		}

		block_in_file = (sector_t)page->index;
		last_block = block_in_file + nr_pages;
		last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
								blkbits;
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;

		/*
		 * Map blocks using the previous result first.
		 */
		if ((map.m_flags & F2FS_MAP_MAPPED) &&
				block_in_file > map.m_lblk &&
				block_in_file < (map.m_lblk + map.m_len))
			goto got_it;

		/*
		 * Then do more f2fs_map_blocks() calls until we are
		 * done with this page.
		 */
		map.m_flags = 0;

		if (block_in_file < last_block) {
			map.m_lblk = block_in_file;
			map.m_len = last_block - block_in_file;

			if (f2fs_map_blocks(inode, &map, 0, false))
				goto set_error_page;
		}
got_it:
		if ((map.m_flags & F2FS_MAP_MAPPED)) {
			block_nr = map.m_pblk + block_in_file - map.m_lblk;
			SetPageMappedToDisk(page);

			if (!PageUptodate(page) && !cleancache_get_page(page)) {
				SetPageUptodate(page);
				goto confused;
			}
		} else {
			zero_user_segment(page, 0, PAGE_CACHE_SIZE);
			SetPageUptodate(page);
			unlock_page(page);
			goto next_page;
		}

		/*
		 * This page will go to BIO. Do we need to send this
		 * BIO off first?
		 */
		if (bio && (last_block_in_bio != block_nr - 1)) {
submit_and_realloc:
			submit_bio(READ, bio);
			bio = NULL;
		}
		if (bio == NULL) {
			struct f2fs_crypto_ctx *ctx = NULL;

			if (f2fs_encrypted_inode(inode) &&
					S_ISREG(inode->i_mode)) {
				struct page *cpage;

				ctx = f2fs_get_crypto_ctx(inode);
				if (IS_ERR(ctx))
					goto set_error_page;

				/* wait the page to be moved by cleaning */
				cpage = find_lock_page(
						META_MAPPING(F2FS_I_SB(inode)),
						block_nr);
				if (cpage) {
					f2fs_wait_on_page_writeback(cpage,
									DATA);
					f2fs_put_page(cpage, 1);
				}
			}

			bio = bio_alloc(GFP_KERNEL,
				min_t(int, nr_pages, BIO_MAX_PAGES));
			if (!bio) {
				if (ctx)
					f2fs_release_crypto_ctx(ctx);
				goto set_error_page;
			}
			bio->bi_bdev = bdev;
			bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
			bio->bi_end_io = f2fs_read_end_io;
			bio->bi_private = ctx;
		}

		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
			goto submit_and_realloc;

		last_block_in_bio = block_nr;
		goto next_page;
set_error_page:
		SetPageError(page);
		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
		unlock_page(page);
		goto next_page;
confused:
		if (bio) {
			submit_bio(READ, bio);
			bio = NULL;
		}
		unlock_page(page);
next_page:
		if (pages)
			page_cache_release(page);
	}
	BUG_ON(pages && !list_empty(pages));
	if (bio)
		submit_bio(READ, bio);
	return 0;
}
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages that are beyond
 * that offset (and zeroing out partial pages).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Called under (and serialised by) inode->i_sem.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	if (mapping->nrpages == 0)
		return;

	pagevec_init(&pvec, 0);
	next = start;
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;

			if (page_index > next)
				next = page_index;
			next++;
			if (TestSetPageLocked(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_complete_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	next = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			wait_on_page_writeback(page);
			if (page->index > next)
				next = page->index;
			next++;
			truncate_complete_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
	}
}
/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidatepage() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	pgoff_t		start;		/* inclusive */
	pgoff_t		end;		/* exclusive */
	unsigned int	partial_start;	/* inclusive */
	unsigned int	partial_end;	/* exclusive */
	struct pagevec	pvec;
	pgoff_t		indices[PAGEVEC_SIZE];
	pgoff_t		index;
	int		i;

	cleancache_invalidate_inode(mapping);
	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		return;

	/* Offsets within partial pages */
	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);

	/*
	 * 'start' and 'end' always cover the range of pages to be fully
	 * truncated. Partial pages are covered with 'partial_start' at the
	 * start of the range and 'partial_end' at the end of the range.
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_CACHE_SHIFT;

	pagevec_init(&pvec, 0);
	index = start;
	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			unsigned int top = PAGE_CACHE_SIZE;
			if (start > end) {
				/* Truncation within a single page */
				top = partial_end;
				partial_end = 0;
			}
			wait_on_page_writeback(page);
			zero_user_segment(page, partial_start, top);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, partial_start,
						  top - partial_start);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	if (partial_end) {
		struct page *page = find_lock_page(mapping, end);
		if (page) {
			wait_on_page_writeback(page);
			zero_user_segment(page, 0, partial_end);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, 0,
						  partial_end);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	/*
	 * If the truncation happened within a single page no pages
	 * will be released, just zeroed, so we can bail out now.
	 */
	if (start >= end)
		return;

	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}
		if (index == start && indices[0] >= end) {
			/* All gone out of hole to be punched, we're done */
			pagevec_remove_exceptionals(&pvec);
			pagevec_release(&pvec);
			break;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end) {
				/* Restart punch to make sure all gone */
				index = start - 1;
				break;
			}

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			lock_page(page);
			WARN_ON(page->index != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}
	cleancache_invalidate_inode(mapping);
}
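A minimal usage sketch for the range helper above, from a hypothetical hole-punch path: @lend is inclusive, so the last byte of the punched range is start + len - 1. The wrapper name is made up for illustration.

/* Hypothetical hole-punch helper: drop the page cache covering
 * [start, start + len), remembering that truncate_inode_pages_range()
 * takes an inclusive end offset. */
static void example_punch_hole_pagecache(struct inode *inode,
					 loff_t start, loff_t len)
{
	truncate_inode_pages_range(inode->i_mapping, start, start + len - 1);
}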
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	/*
	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
	 * page.
	 */
	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
		/*
		 * If steal succeeds, buf->page is now pruned from the
		 * pagecache and we can reuse it. The page will also be
		 * locked on successful return.
		 */
		if (buf->ops->steal(pipe, buf))
			goto find_page;

		page = buf->page;
		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
			unlock_page(page);
			goto find_page;
		}

		page_cache_get(page);

		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
			lru_cache_add(page);
	} else {
find_page:
		page = find_lock_page(mapping, index);
		if (!page) {
			ret = -ENOMEM;
			page = page_cache_alloc_cold(mapping);
			if (unlikely(!page))
				goto out_ret;

			/*
			 * This will also lock the page
			 */
			ret = add_to_page_cache_lru(page, mapping, index,
						    GFP_KERNEL);
			if (unlikely(ret))
				goto out;
		}

		/*
		 * We get here with the page locked. If the page is also
		 * uptodate, we don't need to do more. If it isn't, we
		 * may need to bring it in if we are not going to overwrite
		 * the full page.
		 */
		if (!PageUptodate(page)) {
			if (this_len < PAGE_CACHE_SIZE) {
				ret = mapping->a_ops->readpage(file, page);
				if (unlikely(ret))
					goto out;

				lock_page(page);

				if (!PageUptodate(page)) {
					/*
					 * Page got invalidated, repeat.
					 */
					if (!page->mapping) {
						unlock_page(page);
						page_cache_release(page);
						goto find_page;
					}
					ret = -EIO;
					goto out;
				}
			} else
				SetPageUptodate(page);
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (!ret) {
		/*
		 * Return the number of bytes written and mark page as
		 * accessed, we are now done!
		 */
		ret = this_len;
		mark_page_accessed(page);
		balance_dirty_pages_ratelimited(mapping);
	} else if (ret == AOP_TRUNCATED_PAGE) {
		page_cache_release(page);
		goto find_page;
	}
out:
	page_cache_release(page);
	unlock_page(page);
out_ret:
	return ret;
}
static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree. This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation. If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen. We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked. These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file. We can't tell the filesystem to free it because we can't
	 * take i_mutex here. In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed. This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks. If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			put_page(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent. If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released. We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	goto out;
}
 * negative error code is returned.
 *
 * %-EBUSY - page has an active buffer.
 *
 * %-ENOENT - page cache has no page addressed by the offset.
 */
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
	pgoff_t index = (pgoff_t)block >>
		(PAGE_CACHE_SHIFT - inode->i_blkbits);
	struct page *page;
	unsigned long first_block;
	int ret = 0;
	int still_dirty;

	page = find_lock_page(inode->i_mapping, index);
	if (!page)
		return -ENOENT;

	wait_on_page_writeback(page);

	first_block = (unsigned long)index <<
		(PAGE_CACHE_SHIFT - inode->i_blkbits);
	if (page_has_buffers(page)) {
		struct buffer_head *bh;

		bh = nilfs_page_get_nth_block(page, block - first_block);
		nilfs_forget_buffer(bh);
	}
	still_dirty = PageDirty(page);
	unlock_page(page);
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already
		 * initialized to the number of bytes written; there is
		 * nothing we have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
	balance_dirty_pages_ratelimited(mapping);
out:
	page_cache_release(page);
	unlock_page(page);
out_ret:
	return ret;
}