/* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and * free it. * * Avoid taking zone->lru_lock if possible, but if it is taken, retain it * for the remainder of the operation. * * The locking in this function is against shrink_inactive_list(): we recheck * the page count inside the lock to see whether shrink_inactive_list() * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ void release_pages(struct page **pages, int nr, int cold) { int i; struct pagevec pages_to_free; struct zone *zone = NULL; unsigned long uninitialized_var(flags); pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; if (unlikely(PageCompound(page))) { if (zone) { spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } put_compound_page(page); continue; } if (!put_page_testzero(page)) continue; if (PageLRU(page)) { struct zone *pagezone = page_zone(page); if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); del_page_from_lru(zone, page); } if (!pagevec_add(&pages_to_free, page)) { if (zone) { spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } __pagevec_free(&pages_to_free); pagevec_reinit(&pages_to_free); } } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); pagevec_free(&pages_to_free); }
/* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and * free it. * * Avoid taking zone->lru_lock if possible, but if it is taken, retain it * for the remainder of the operation. * * The locking in this function is against shrink_cache(): we recheck the * page count inside the lock to see whether shrink_cache grabbed the page * via the LRU. If it did, give up: shrink_cache will free it. */ void release_pages(struct page **pages, int nr, int cold) { int i; struct pagevec pages_to_free; struct zone *zone = NULL; unsigned long uninitialized_var(flags); pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; if (unlikely(PageCompound(page))) { if (zone) { spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } put_compound_page(page); continue; } // dyc: if page->ref not zero, continue if (!put_page_testzero(page)) continue; // dyc: if in url, remove from it if (PageLRU(page)) { struct zone *pagezone = page_zone(page); if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); del_page_from_lru(zone, page); } // dyc: if no space available after adding if (!pagevec_add(&pages_to_free, page)) { if (zone) { spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } // dyc: return page to buddy system __pagevec_free(&pages_to_free); pagevec_reinit(&pages_to_free); } } // for (i = 0; i < nr; i++) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); pagevec_free(&pages_to_free); }
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the first page offset to invalidate
 * @end: the last page offset to invalidate (inclusive)
 *
 * Only pages that can be locked without blocking are processed; to remove
 * every page of an inode use truncate_inode_pages() instead.
 *
 * invalidate_mapping_pages() will not block on IO activity.  It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 *
 * Returns the accumulated result of invalidate_inode_page() over the pages
 * that were processed.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long nr_invalidated = 0;
	unsigned long ret;
	int slot;

	/*
	 * Note: on a shmem/tmpfs mapping pagevec_lookup() may return 0
	 * early (after a gangful of swap entries).  That is not worth
	 * handling here: such mappings rarely have anything to free, since
	 * most of their pages are dirty.
	 */
	pagevec_init(&pvec, 0);
	for (;;) {
		pgoff_t want;

		if (index > end)
			break;

		/* Never ask for more pages than remain in [index, end]. */
		want = min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1;
		if (!pagevec_lookup(&pvec, mapping, index, want))
			break;

		mem_cgroup_uncharge_start();
		for (slot = 0; slot < pagevec_count(&pvec); slot++) {
			struct page *page = pvec.pages[slot];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			if (trylock_page(page)) {
				WARN_ON(page->index != index);
				ret = invalidate_inode_page(page);
				unlock_page(page);
				/*
				 * The caller has hinted that the page is no
				 * longer of interest: if it could not be
				 * invalidated, at least speed up its reclaim.
				 */
				if (!ret)
					deactivate_page(page);
				nr_invalidated += ret;
			}
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}

	return nr_invalidated;
}
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the first page offset to invalidate
 * @end: the last page offset to invalidate (inclusive)
 *
 * Only pages that can be locked without blocking are processed; to remove
 * every page of an inode use truncate_inode_pages() instead.  Exceptional
 * radix-tree entries found in the range are cleared along the way.
 *
 * invalidate_mapping_pages() will not block on IO activity.  It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 *
 * Returns the accumulated result of invalidate_inode_page() over the pages
 * that were processed.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];	/* radix-tree index of each entry */
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long nr_invalidated = 0;
	unsigned long ret;
	int slot;

	pagevec_init(&pvec, 0);
	for (;;) {
		pgoff_t want;

		if (index > end)
			break;

		/* Never ask for more entries than remain in [index, end]. */
		want = min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1;
		if (!__pagevec_lookup(&pvec, mapping, index, want, indices))
			break;

		mem_cgroup_uncharge_start();
		for (slot = 0; slot < pagevec_count(&pvec); slot++) {
			struct page *page = pvec.pages[slot];

			/* We rely upon deletion not changing page->index */
			index = indices[slot];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			if (trylock_page(page)) {
				WARN_ON(page->index != index);
				ret = invalidate_inode_page(page);
				unlock_page(page);
				/*
				 * The caller has hinted that the page is no
				 * longer of interest: if it could not be
				 * invalidated, at least speed up its reclaim.
				 */
				if (!ret)
					deactivate_page(page);
				nr_invalidated += ret;
			}
		}
		/* Exceptional entries must go before the pagevec is released. */
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}

	return nr_invalidated;
}
/*
 * __pagevec_release_nonlru - pagevec_release() for pages that are known not
 * to be on the LRU.
 *
 * Drops one reference on each page in @pvec; pages whose count reaches zero
 * are collected in a temporary pagevec (inheriting @pvec's cold hint) and
 * freed.  @pvec is reinitialised before returning.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	struct pagevec pages_to_free;
	int idx = 0;

	pagevec_init(&pages_to_free, pvec->cold);

	while (idx < pagevec_count(pvec)) {
		struct page *page = pvec->pages[idx++];

		VM_BUG_ON(PageLRU(page));
		if (!put_page_testzero(page))
			continue;
		pagevec_add(&pages_to_free, page);
	}

	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}
/* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and * free it. * * Avoid taking zone->lru_lock if possible, but if it is taken, retain it * for the remainder of the operation. * * The locking in this function is against shrink_cache(): we recheck the * page count inside the lock to see whether shrink_cache grabbed the page * via the LRU. If it did, give up: shrink_cache will free it. */ void release_pages(struct page **pages, int nr, int cold) { int i; struct pagevec pages_to_free; struct zone *zone = NULL; pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; struct zone *pagezone; if (unlikely(PageCompound(page))) { if (zone) { spin_unlock_irq(&zone->lru_lock); zone = NULL; } put_compound_page(page); continue; } if (!put_page_testzero(page)) continue; pagezone = page_zone(page); if (pagezone != zone) { if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; spin_lock_irq(&zone->lru_lock); } if (TestClearPageLRU(page)) del_page_from_lru(zone, page); if (page_count(page) == 0) { if (!pagevec_add(&pages_to_free, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_free(&pages_to_free); pagevec_reinit(&pages_to_free); zone = NULL; /* No lock is held */ } } } if (zone) spin_unlock_irq(&zone->lru_lock); pagevec_free(&pages_to_free); }
int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { struct pagevec pvec; unsigned int i; pgoff_t index = 0; int err = 0; pagevec_init(&pvec, 0); repeat: if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; lock_page(page); if (unlikely(!PageDirty(page))) NILFS_PAGE_BUG(page, "inconsistent dirty state"); dpage = grab_cache_page(dmap, page->index); if (unlikely(!dpage)) { /* No empty page is added to the page cache */ err = -ENOMEM; unlock_page(page); break; } if (unlikely(!page_has_buffers(page))) NILFS_PAGE_BUG(page, "found empty page in dat page cache"); nilfs_copy_page(dpage, page, 1); __set_page_dirty_nobuffers(dpage); unlock_page(dpage); page_cache_release(dpage); unlock_page(page); } pagevec_release(&pvec); cond_resched(); if (likely(!err)) goto repeat; return err; }
/** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int cycled; int range_whole = 0; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ /* * If this is a data integrity sync, cap the writeback to the * current end of file. Any extension to the file that occurs * after this is a new write and we don't need to write those * pages out to fulfil our data integrity requirements. If we * try to write them out, we can get stuck in this scan until * the concurrent writer stops adding dirty pages and extending * EOF. 
*/ if (wbc->sync_mode == WB_SYNC_ALL && wbc->range_end == LLONG_MAX) { end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT; } }
/* * Radix-tree checker */ void nilfs_check_radix_tree(const char *fname, int line, struct address_space *mapping, int tag) { struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; char *page_type; int nr_found = 0; if (tag == PAGECACHE_TAG_DIRTY) page_type = "dirty"; else if (tag == PAGECACHE_TAG_WRITEBACK) page_type = "writeback"; else page_type = "leaking"; pagevec_init(&pvec, 0); repeat: if (tag < 0) { n = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (n) index = pvec.pages[n - 1]->index + 1; } else n = pagevec_lookup_tag(&pvec, mapping, &index, tag, PAGEVEC_SIZE); if (!n) { if (nr_found) printk(KERN_WARNING "%s: found %d %s pages\n", fname, nr_found, page_type); return; } for (i = 0; i < n; i++) { nilfs_page_debug(fname, line, pvec.pages[i], "%s page", page_type); nr_found++; } pagevec_release(&pvec); cond_resched(); goto repeat; }
/**
 * nilfs_clear_dirty_pages - discard dirty pages in address space
 * @mapping: address space with dirty pages for discarding
 * @silent: suppress [true] or print [false] warning messages
 *
 * Every page tagged PAGECACHE_TAG_DIRTY in @mapping is locked, passed to
 * nilfs_clear_dirty_page() and unlocked again.
 */
void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned int n;

	pagevec_init(&pvec);

	for (;;) {
		if (!pagevec_lookup_tag(&pvec, mapping, &index,
					PAGECACHE_TAG_DIRTY))
			break;

		for (n = 0; n < pagevec_count(&pvec); n++) {
			struct page *dirty = pvec.pages[n];

			lock_page(dirty);
			nilfs_clear_dirty_page(dirty, silent);
			unlock_page(dirty);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
void nilfs_clear_dirty_pages(struct address_space *mapping) { struct pagevec pvec; unsigned int i; pgoff_t index = 0; pagevec_init(&pvec, 0); while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; lock_page(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); bh = head = page_buffers(page); do { lock_buffer(bh); clear_buffer_async_write(bh); clear_buffer_dirty(bh); clear_buffer_nilfs_volatile(bh); clear_buffer_nilfs_checked(bh); clear_buffer_nilfs_redirected(bh); clear_buffer_uptodate(bh); clear_buffer_mapped(bh); unlock_buffer(bh); bh = bh->b_this_page; } while (bh != head); __nilfs_clear_page_dirty(page); unlock_page(page); } pagevec_release(&pvec); cond_resched(); } }
/*
 * Kill all the pages in the given range.
 *
 * @mapping: address space the pages belong to
 * @first:   index of the first page to kill
 * @last:    index of the last page to kill (inclusive)
 *
 * The pages are looked up in contiguous batches of at most PAGEVEC_SIZE.
 * Each page is marked not-uptodate and in error, its writeback is ended,
 * and it is then removed from the mapping with generic_error_remove_page()
 * while the page lock is held.  The references taken by the lookup are
 * dropped with __pagevec_release() after each batch.
 */
static void afs_kill_pages(struct address_space *mapping,
			   pgoff_t first, pgoff_t last)
{
	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
	struct pagevec pv;
	unsigned count, loop;

	_enter("{%x:%u},%lx-%lx",
	       vnode->fid.vid, vnode->fid.vnode, first, last);

	pagevec_init(&pv);

	do {
		_debug("kill %lx-%lx", first, last);

		count = last - first + 1;
		if (count > PAGEVEC_SIZE)
			count = PAGEVEC_SIZE;
		pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
		/* Every page in the range is expected to be present. */
		ASSERTCMP(pv.nr, ==, count);

		for (loop = 0; loop < count; loop++) {
			struct page *page = pv.pages[loop];

			ClearPageUptodate(page);
			SetPageError(page);
			end_page_writeback(page);
			/* Advance the cursor past this page for the next batch. */
			if (page->index >= first)
				first = page->index + 1;
			lock_page(page);
			generic_error_remove_page(mapping, page);
			/*
			 * The removal above runs with the page locked and does
			 * not unlock it; drop the lock here so the page is not
			 * left locked forever.
			 */
			unlock_page(page);
		}

		__pagevec_release(&pv);
	} while (first <= last);

	_leave("");
}
/*
 * Redirty all the pages in a given range.
 *
 * The pages with indices @first..@last (inclusive) of @mapping are looked up
 * in contiguous batches of at most PAGEVEC_SIZE.  Each page is handed to
 * redirty_page_for_writepage() with @wbc and then has its writeback ended.
 */
static void afs_redirty_pages(struct writeback_control *wbc,
			      struct address_space *mapping,
			      pgoff_t first, pgoff_t last)
{
	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
	struct pagevec pv;
	unsigned count, n;

	_enter("{%x:%u},%lx-%lx",
	       vnode->fid.vid, vnode->fid.vnode, first, last);

	pagevec_init(&pv);

	do {
		_debug("redirty %lx-%lx", first, last);

		/* Clamp the batch to what one pagevec can hold. */
		count = last - first + 1;
		if (count > PAGEVEC_SIZE)
			count = PAGEVEC_SIZE;

		pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
		ASSERTCMP(pv.nr, ==, count);

		n = 0;
		while (n < count) {
			struct page *page = pv.pages[n++];

			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			/* Move the cursor past this page for the next batch. */
			if (page->index >= first)
				first = page->index + 1;
		}

		__pagevec_release(&pv);
	} while (first <= last);

	_leave("");
}
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the first page offset to invalidate
 * @end: the last page offset to invalidate (inclusive)
 *
 * Only pages that can be locked without blocking are processed; to remove
 * every page of an inode use truncate_inode_pages() instead.
 *
 * invalidate_mapping_pages() will not block on IO activity.  Pages that are
 * dirty, locked, under writeback or mapped into pagetables are left alone.
 *
 * Returns the accumulated result of invalidate_complete_page() over the
 * pages that were processed.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t cursor = start;
	unsigned long nr_invalidated = 0;
	int slot;

	pagevec_init(&pvec, 0);

	for (;;) {
		if (cursor > end)
			break;
		if (!pagevec_lookup(&pvec, mapping, cursor, PAGEVEC_SIZE))
			break;

		for (slot = 0; slot < pagevec_count(&pvec); slot++) {
			struct page *page = pvec.pages[slot];

			/* Already locked by someone else: skip it. */
			if (TestSetPageLocked(page)) {
				cursor++;
				continue;
			}

			/* Resume the next lookup just past this page. */
			if (page->index > cursor)
				cursor = page->index;
			cursor++;

			if (!PageDirty(page) && !PageWriteback(page) &&
			    !page_mapped(page))
				nr_invalidated +=
					invalidate_complete_page(mapping, page);

			unlock_page(page);
			if (cursor > end)
				break;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	return nr_invalidated;
}
/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidatepage() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	pgoff_t		start;		/* inclusive */
	pgoff_t		end;		/* exclusive */
	unsigned int	partial_start;	/* inclusive */
	unsigned int	partial_end;	/* exclusive */
	struct pagevec	pvec;
	pgoff_t		indices[PAGEVEC_SIZE];	/* index of each looked-up entry */
	pgoff_t		index;
	int		i;

	cleancache_invalidate_inode(mapping);
	/* Nothing cached at all (neither pages nor exceptional entries). */
	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		return;

	/* Offsets within partial pages */
	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);

	/*
	 * 'start' and 'end' always covers the range of pages to be fully
	 * truncated. Partial pages are covered with 'partial_start' at the
	 * start of the range and 'partial_end' at the end of the range.
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_CACHE_SHIFT;

	/*
	 * First pass: non-blocking.  Pages that cannot be locked immediately
	 * or that are under writeback are skipped and left for the second
	 * pass below.
	 */
	pagevec_init(&pvec, 0);
	index = start;
	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	/* Zero the tail of the partial page preceding 'start', if any. */
	if (partial_start) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			unsigned int top = PAGE_CACHE_SIZE;
			if (start > end) {
				/* Truncation within a single page */
				top = partial_end;
				/* Handled here; skip the partial_end block below. */
				partial_end = 0;
			}
			wait_on_page_writeback(page);
			zero_user_segment(page, partial_start, top);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, partial_start,
						  top - partial_start);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	/* Zero the head of the partial page at 'end', if any. */
	if (partial_end) {
		struct page *page = find_lock_page(mapping, end);
		if (page) {
			wait_on_page_writeback(page);
			zero_user_segment(page, 0, partial_end);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, 0,
						  partial_end);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	/*
	 * If the truncation happened within a single page no pages
	 * will be released, just zeroed, so we can bail out now.
	 */
	if (start >= end)
		return;

	/*
	 * Second pass: blocking.  Lock each remaining page, wait for its
	 * writeback, and truncate it; keep rescanning from 'start' until a
	 * lookup from 'start' finds nothing left inside the range.
	 */
	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}
		if (index == start && indices[0] >= end) {
			/* All gone out of hole to be punched, we're done */
			pagevec_remove_exceptionals(&pvec);
			pagevec_release(&pvec);
			break;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end) {
				/* Restart punch to make sure all gone */
				/* (the index++ after this loop brings it back to 'start') */
				index = start - 1;
				break;
			}

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			lock_page(page);
			WARN_ON(page->index != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}
	cleancache_invalidate_inode(mapping);
}
/*
 * Write out one dirty data page.
 *
 * @page: the locked page to write
 * @wbc:  writeback control describing the kind of writeback in progress
 *
 * Returns 0 when the page was written or did not need writing; in that case
 * the page has been unlocked here.  Returns AOP_WRITEPAGE_ACTIVATE after
 * redirtying the page when it cannot be written right now; on that path the
 * page is not unlocked by this function.
 */
static int f2fs_write_data_page(struct page *page,
					struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	loff_t i_size = i_size_read(inode);
	/* index of the page that contains i_size */
	const pgoff_t end_index = ((unsigned long long) i_size)
							>> PAGE_CACHE_SHIFT;
	unsigned offset = 0;
	bool need_balance_fs = false;
	int err = 0;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = DATA,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
		.page = page,
		.encrypted_page = NULL,
	};

	trace_f2fs_writepage(page, DATA);

	/* Page lies entirely below i_size: write it as is. */
	if (page->index < end_index)
		goto write;

	/*
	 * If the offset is out-of-range of file size,
	 * this page does not have to be written to disk.
	 */
	offset = i_size & (PAGE_CACHE_SIZE - 1);
	if ((page->index >= end_index + 1) || !offset)
		goto out;

	/* Page straddles i_size: zero the part beyond it before writing. */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto redirty_out;
	if (f2fs_is_drop_cache(inode))
		goto out;
	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
			available_free_memory(sbi, BASE_CHECK))
		goto redirty_out;

	/* Dentry blocks are controlled by checkpoint */
	if (S_ISDIR(inode->i_mode)) {
		if (unlikely(f2fs_cp_error(sbi)))
			goto redirty_out;
		err = do_write_data_page(&fio);
		goto done;
	}

	/* we should bypass data pages to let the kworker jobs proceed */
	if (unlikely(f2fs_cp_error(sbi))) {
		SetPageError(page);
		goto out;
	}

	if (!wbc->for_reclaim)
		need_balance_fs = true;
	else if (has_not_enough_free_secs(sbi, 0))
		goto redirty_out;

	/*
	 * -EAGAIN is the "not written yet" marker: it survives when the inode
	 * has no inline data, or when the inline write itself returns it, and
	 * in both cases the regular data write is performed.
	 */
	err = -EAGAIN;
	f2fs_lock_op(sbi);
	if (f2fs_has_inline_data(inode))
		err = f2fs_write_inline_data(inode, page);
	if (err == -EAGAIN)
		err = do_write_data_page(&fio);
	f2fs_unlock_op(sbi);
done:
	/* Any error other than -ENOENT makes the page dirty again. */
	if (err && err != -ENOENT)
		goto redirty_out;

	clear_cold_data(page);
out:
	inode_dec_dirty_pages(inode);
	if (err)
		ClearPageUptodate(page);
	unlock_page(page);
	if (need_balance_fs)
		f2fs_balance_fs(sbi);
	if (wbc->for_reclaim)
		f2fs_submit_merged_bio(sbi, DATA, WRITE);
	return 0;

redirty_out:
	redirty_page_for_writepage(wbc, page);
	return AOP_WRITEPAGE_ACTIVATE;
}

/*
 * writepage_t adapter: @data is the address_space.  Calls its ->writepage()
 * on @page, records the result in the mapping with mapping_set_error(), and
 * returns that result.
 */
static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
			void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}

/*
 * This function was copied from write_cache_pages from mm/page-writeback.c.
 * The major change is making write step of cold data page separately from
 * warm/hot data page.
 *
 * The whole scan is run twice, with 'step' going 0 then 1; in each run the
 * pages for which is_cold_data() equals 'step' are skipped.
 *
 * @mapping:   address space whose tagged pages are written
 * @wbc:       writeback control; wbc->nr_to_write is decremented per page
 * @writepage: function called for each page
 * @data:      opaque pointer passed through to @writepage
 *
 * Returns the last value of 'ret': 0, or the error returned by @writepage
 * that stopped the scan.
 */
static int f2fs_write_cache_pages(struct address_space *mapping,
			struct writeback_control *wbc, writepage_t writepage,
			void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end; /* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;
	int step = 0;

	pagevec_init(&pvec, 0);
next:
	/* (Re)compute the scan range; executed once per step. */
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end) {
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			/*
			 * The label below sits inside this if-body so that
			 * every "skip this page" path shares one unlock and
			 * continue sequence.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			/* Not this step's kind of page (see header comment). */
			if (step == is_cold_data(page))
				goto continue_unlock;

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					f2fs_wait_on_page_writeback(page, DATA);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					/* Not an error: unlock and carry on. */
					unlock_page(page);
					ret = 0;
				} else {
					/* Real error: stop, resume after this page. */
					done_index = page->index + 1;
					done = 1;
					break;
				}
			}

			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Run the whole scan a second time for the other kind of page. */
	if (step < 1) {
		step++;
		goto next;
	}

	/* Cyclic scan started mid-file: wrap around to the beginning. */
	if (!cycled && !done) {
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; bool need_cp = false; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } /* if the inode is dirty, let's recover all the time */ if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { update_inode_page(inode); goto go_write; } /* * if there is no written data, don't waste time to write recovery info. */ if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ down_read(&fi->i_sem); need_cp = need_do_checkpoint(inode); up_read(&fi->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); /* * We've secured consistency through sync_fs. Following pino * will be used only for fsynced inodes after checkpoint. 
*/ try_to_fix_pino(inode); clear_inode_flag(fi, FI_APPEND_WRITE); clear_inode_flag(fi, FI_UPDATE_WRITE); goto out; } sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) goto out; if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } ret = wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; /* once recovery info is written, don't need to tack this */ remove_dirty_inode(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: remove_dirty_inode(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { struct pagevec pvec; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); pgofs = nr_pages ? 
pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, int whence) { switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) return true; break; case SEEK_HOLE: if (blkaddr == NULL_ADDR) return true; break; } return false; } static inline int unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } static loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; struct dnode_of_data dn; pgoff_t pgofs, end_offset, dirty; loff_t data_ofs = offset; loff_t isize; int err = 0; mutex_lock(&inode->i_mutex); isize = i_size_read(inode); if (offset >= isize) goto fail; /* handle inline data case */ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { if (whence == SEEK_HOLE) data_ofs = isize; goto found; } pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { pgofs = PGOFS_OF_NEXT_DNODE(pgofs, F2FS_I(inode)); continue; } else { goto found; } } end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { block_t blkaddr; blkaddr = 
datablock_addr(dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; } } f2fs_put_dnode(&dn); } if (whence == SEEK_DATA) goto fail; found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; mutex_unlock(&inode->i_mutex); return vfs_setpos(file, data_ofs, maxbytes); fail: mutex_unlock(&inode->i_mutex); return -ENXIO; } static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, maxbytes); case SEEK_DATA: case SEEK_HOLE: if (offset < 0) return -ENXIO; return f2fs_seek_block(file, offset, whence); } return -EINVAL; } static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); if (f2fs_encrypted_inode(inode)) { int err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ if (f2fs_has_inline_data(inode)) { int err = f2fs_convert_inline_inode(inode); if (err) return err; } file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; return 0; } static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); if (!ret && f2fs_encrypted_inode(inode)) { ret = f2fs_get_encryption_info(inode); if (ret) ret = -EACCES; } return ret; }
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Drops every page-cache page that lies beyond @lstart and zeroes the tail
 * of the page that @lstart falls into, if any.
 *
 * The work is done in two sweeps.  The first sweep never blocks: pages whose
 * lock cannot be taken immediately, or which are under writeback, are left
 * alone.  The second sweep locks each page and waits for writeback, and it
 * keeps rescanning from the start until a lookup from the start finds
 * nothing.  Most pages go away in the first sweep, which keeps the second
 * one cheap and avoids waiting on IO where possible.
 *
 * page->index is sampled into a local before it is used outside the page
 * lock, because it may change at any time there.
 *
 * The cache-hot hint is passed down to the page freeing code: the last pages
 * of a mapping are likely the most recently touched ones, and freeing runs
 * in ascending file offset order.
 *
 * Called under (and serialised by) inode->i_sem.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec batch;
	pgoff_t cursor = start;
	int slot;

	if (mapping->nrpages == 0)
		return;

	pagevec_init(&batch, 0);

	/* Sweep 1: opportunistic, never sleeps on a page. */
	while (pagevec_lookup(&batch, mapping, cursor, PAGEVEC_SIZE)) {
		for (slot = 0; slot < pagevec_count(&batch); slot++) {
			struct page *victim = batch.pages[slot];
			pgoff_t idx = victim->index;

			/* Resume after whichever is further: this page or the cursor. */
			cursor = (idx > cursor ? idx : cursor) + 1;

			if (!TestSetPageLocked(victim)) {
				if (!PageWriteback(victim))
					truncate_complete_page(mapping, victim);
				unlock_page(victim);
			}
		}
		pagevec_release(&batch);
		cond_resched();
	}

	/* The page straddling lstart only loses its tail. */
	if (partial) {
		struct page *tail = find_lock_page(mapping, start - 1);

		if (tail) {
			wait_on_page_writeback(tail);
			truncate_partial_page(tail, partial);
			unlock_page(tail);
			page_cache_release(tail);
		}
	}

	/* Sweep 2: blocking; repeat until a scan from 'start' comes back empty. */
	cursor = start;
	while (1) {
		cond_resched();
		if (pagevec_lookup(&batch, mapping, cursor, PAGEVEC_SIZE)) {
			for (slot = 0; slot < pagevec_count(&batch); slot++) {
				struct page *victim = batch.pages[slot];

				lock_page(victim);
				wait_on_page_writeback(victim);
				if (victim->index > cursor)
					cursor = victim->index;
				cursor++;
				truncate_complete_page(mapping, victim);
				unlock_page(victim);
			}
			pagevec_release(&batch);
		} else if (cursor == start) {
			break;
		} else {
			cursor = start;
		}
	}
}
/**
 * nilfs_find_uncommitted_extent - find extent of uncommitted data
 * @inode: inode
 * @start_blk: start block offset (in)
 * @blkoff: start offset of the found extent (out)
 *
 * This function searches an extent of buffers marked "delayed" which
 * starts from a block offset equal to or larger than @start_blk.  If
 * such an extent was found, this will store the start offset in
 * @blkoff and return its length in blocks.  Otherwise, zero is
 * returned.
 */
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
					    sector_t start_blk,
					    sector_t *blkoff)
{
	unsigned int i;
	pgoff_t index;
	unsigned int nblocks_in_page;
	unsigned long length = 0;
	sector_t b;
	struct pagevec pvec;
	struct page *page;

	if (inode->i_mapping->nrpages == 0)
		return 0;

	/* Page index containing start_blk, and number of blocks per page. */
	index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
	nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);

	pagevec_init(&pvec);

repeat:
	/* Grab the next run of contiguous pages starting at 'index'. */
	pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
					pvec.pages);
	if (pvec.nr == 0)
		return length;

	/* An extent is in progress but the next page is not adjacent: stop. */
	if (length > 0 && pvec.pages[0]->index > index)
		goto out;

	/* Block offset of the first block in the first page of this batch. */
	b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
	i = 0;
	do {
		page = pvec.pages[i];

		lock_page(page);
		if (page_has_buffers(page)) {
			struct buffer_head *bh, *head;

			bh = head = page_buffers(page);
			do {
				/*
				 * 'continue' jumps to the loop condition, so b
				 * and bh still advance for skipped blocks.
				 */
				if (b < start_blk)
					continue;
				if (buffer_delay(bh)) {
					/* First delayed buffer starts the extent. */
					if (length == 0)
						*blkoff = b;
					length++;
				} else if (length > 0) {
					/* Extent ended; leave with the page locked. */
					goto out_locked;
				}
			} while (++b, bh = bh->b_this_page, bh != head);
		} else {
			/* A page without buffers terminates a started extent. */
			if (length > 0)
				goto out_locked;

			b += nblocks_in_page;
		}
		unlock_page(page);

	} while (++i < pagevec_count(&pvec));

	/* Continue right after the last page examined. */
	index = page->index + 1;
	pagevec_release(&pvec);
	cond_resched();
	goto repeat;

out_locked:
	unlock_page(page);
out:
	pagevec_release(&pvec);
	return length;
}
/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * Returns 0 immediately (after setting wbc->encountered_congestion) when
 * @wbc is nonblocking and the backing device is write-congested; otherwise
 * returns the value left in ret by the last @writepage call, with
 * AOP_WRITEPAGE_ACTIVATE mapped to 0.
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int range_whole = 0;
	/* Work on a local copy; written back to wbc only at the end. */
	long nr_to_write = wbc->nr_to_write;

	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		/* Explicit range: never wrap around to the start. */
		scanned = 1;
	}
retry:
	while (!done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			      PAGECACHE_TAG_DIRTY,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or even
			 * swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				unlock_page(page);
				continue;
			}

			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);

			/* Still under IO, or no longer dirty: nothing to write. */
			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc, data);

			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				unlock_page(page);
				ret = 0;
			}
			/*
			 * NOTE(review): setting 'done' does not leave this for
			 * loop, so the remaining pages of the pagevec are still
			 * processed and a later writepage() result replaces ret.
			 */
			if (ret || (--nr_to_write <= 0))
				done = 1;
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				done = 1;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	if (!wbc->no_nrwrite_index_update) {
		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
			mapping->writeback_index = index;
		wbc->nr_to_write = nr_to_write;
	}

	return ret;
}
static int gfs2_write_cache_jdata(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; pgoff_t done_index; int cycled; int range_whole = 0; int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); if (ret) done = 1; if (ret > 0) ret = 0; pagevec_release(&pvec); cond_resched(); } if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ cycled = 1; index = 0; end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret; }
/* * Inspired by write_cache_pages from /mm/page-writeback.c */ static int ecryptfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int cycled; int range_whole = 0; int tag; struct page **pgs; int pgidx; /* printk("[g-ecryptfs] Info: call writepages\n"); */ pgs = kmalloc(sizeof(struct page*)*PAGEVEC_SIZE, GFP_KERNEL); if (!pgs) { printk("[g-ecryptfs] Error: pgs alloc failed!\n"); return -EFAULT; } pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; struct page *page; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; pgidx = 0; for (i = 0; i < nr_pages; i++) { page = pvec.pages[i]; /* * At this point, the page may be truncated or * invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping. However, page->index will not change * because we have a reference on the page. */ if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ done = 1; break; } done_index = page->index + 1; lock_page(page); /* * Page truncated or invalidated. 
We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no * real expectation of this data interity operation * even if there is now a new, dirty page at the same * pagecache address. */ if (unlikely(page->mapping != mapping)) { continue_unlock: unlock_page(page); continue; } if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); else goto continue_unlock; } BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; pgs[pgidx++] = page; } /*trace_wbc_writepage(wbc, mapping->backing_dev_info);*/ ret = ecryptfs_encrypt_pages(pgs, pgidx); //printk("[g-ecryptfs] Info: enc %d pages in writepages\n", pgidx); mapping_set_error(mapping, ret); for (i = 0; i < nr_pages; i++) { page = pvec.pages[i]; if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { if (PageLocked(page)) unlock_page(page); ret = 0; } else { /* * done_index is set past this page, * so media errors will not choke * background writeout for the entire * file. This has consequences for * range_cyclic semantics (ie. it may * not be suitable for data integrity * writeout). */ done = 1; break; } } /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } } pagevec_release(&pvec); cond_resched(); } if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ cycled = 1; index = 0; end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; kfree(pgs); return ret; }
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages, if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 *
 * Returns the sum of the invalidate_inode_page() results for the pages
 * processed.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			/* Never block: skip pages somebody else has locked. */
			if (!trylock_page(page))
				continue;

			WARN_ON(page_to_index(page) != index);

			/* Middle of THP: skip */
			if (PageTransTail(page)) {
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				/* Step both cursors to the last subpage of the THP. */
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
				/*
				 * 'end' is in the middle of THP. Don't
				 * invalidate the page as the part outside of
				 * 'end' could be still useful.  'index' has
				 * already been advanced to the last subpage,
				 * so compare it with 'end' directly, and drop
				 * the page lock before moving on.
				 */
				if (index > end) {
					unlock_page(page);
					continue;
				}
			}

			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest and try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_file_page(page);
			count += ret;
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	return count;
}
/**ltl
 * Purpose: walk the dirty pages of the given address space and write them
 *          out.
 * Parameters:
 *   mapping   - address space whose pages are scanned
 *   wbc       - writeback control (range, sync mode, nr_to_write budget)
 *   writepage - callback invoked for every page to be written
 *   data      - opaque pointer handed to @writepage
 * Return value: 0, or the first error returned by @writepage other than
 *               AOP_WRITEPAGE_ACTIVATE.
 * Notes:
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;		/* index of the next page to flush */
	pgoff_t end;		/* Inclusive */ /* index of the last page to flush; -1 means cyclic flushing */
	pgoff_t done_index;
	int cycled;		/* used when a wrap-around splits the flush into two segments; 1 means the first segment has been completed */
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) { /* cyclic flushing requested? */
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		/* Look up the pages of the address space that carry 'tag' and store them in the pagevec. */
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		/* Process the pages that were found. */
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index + 1;

			/* Lock the page. */
			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data interity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			/* Another process may have changed the page while we were taking the lock, hence the checks below. */
			if (unlikely(page->mapping != mapping)) { /* page no longer belongs to this mapping */
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) { /* writeback already finished; the dirty flag has been cleared */
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) { /* page is under writeback; what to do depends on sync_mode */
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page); /* wait for the running writeback to finish before continuing */
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, mapping->backing_dev_info);
			/* Start writing back the dirty page. */
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file. This has consequences for
					 * range_cyclic semantics (ie. it may
					 * not be suitable for data integrity
					 * writeout).
					 */
					done = 1;
					break;
				}
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			/* The page was written successfully: decrement the counter. */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
/** * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache * @dmap: destination page cache * @smap: source page cache * * No pages must no be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, struct address_space *smap) { struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; int err; pagevec_init(&pvec, 0); repeat: n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); if (!n) return; index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; pgoff_t offset = page->index; lock_page(page); dpage = find_lock_page(dmap, offset); if (dpage) { /* override existing page on the destination cache */ WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); page_cache_release(dpage); } else { struct page *page2; /* move the page to the destination cache */ spin_lock_irq(&smap->tree_lock); page2 = radix_tree_delete(&smap->page_tree, offset); WARN_ON(page2 != page); smap->nrpages--; spin_unlock_irq(&smap->tree_lock); spin_lock_irq(&dmap->tree_lock); err = radix_tree_insert(&dmap->page_tree, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; page_cache_release(page); /* for cache */ } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) radix_tree_tag_set(&dmap->page_tree, offset, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&dmap->tree_lock); } unlock_page(page); } pagevec_release(&pvec); cond_resched(); goto repeat; }
/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them.  For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set).  The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 *
 * This variant makes a single pass over the range and does not wrap around
 * within one call.  When a lookup finds no more tagged pages, done_index is
 * reset to 0, so a range_cyclic caller restarts from the beginning of the
 * file on its next call.
 *
 * Returns 0, or the first error returned by @writepage other than
 * AOP_WRITEPAGE_ACTIVATE.
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0) {
			/* No tagged pages left: the next cyclic call starts at 0. */
			done_index = 0;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index + 1;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data interity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			/* Under IO: wait for integrity sync, otherwise skip the page. */
			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, mapping->backing_dev_info);
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file. This has consequences for
					 * range_cyclic semantics (ie. it may
					 * not be suitable for data integrity
					 * writeout).
					 */
					done = 1;
					break;
				}
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); vto = kmap_atomic(to); copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); dax_unmap_atomic(bdev, &dax); return 0; } #define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) static int dax_radix_entry(struct address_space *mapping, pgoff_t index, sector_t sector, bool pmd_entry, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; pgoff_t pmd_index = DAX_PMD_INDEX(index); int type, error = 0; void *entry; WARN_ON_ONCE(pmd_entry && !dirty); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); entry = radix_tree_lookup(page_tree, pmd_index); if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { index = pmd_index; goto dirty; } entry = radix_tree_lookup(page_tree, index); if (entry) { type = RADIX_DAX_TYPE(entry); if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { error = -EIO; goto unlock; } if (!pmd_entry || type == RADIX_DAX_PMD) goto dirty; /* * We only insert dirty PMD entries into the radix tree. This * means we don't need to worry about removing a dirty PTE * entry and inserting a clean PMD entry, thus reducing the * range we would flush with a follow-up fsync/msync call. */ radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; } if (sector == NO_SECTOR) { /* * This can happen during correct operation if our pfn_mkwrite * fault raced against a hole punch operation. If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by * the time we are called, but the call will still happen. 
We * will return all the way up to wp_pfn_shared(), where the * pte_same() check will fail, eventually causing page fault * to be retried by the CPU. */ goto unlock; } error = radix_tree_insert(page_tree, index, RADIX_DAX_ENTRY(sector, pmd_entry)); if (error) goto unlock; mapping->nrexceptional++; dirty: if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); return error; } static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; int ret = 0; spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; /* another fsync thread may have already written back this entry */ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { ret = -EIO; goto unlock; } dax.sector = RADIX_DAX_SECTOR(entry); dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). 
*/ ret = dax_map_atomic(bdev, &dax); if (ret < 0) return ret; if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } wb_cache_pmem(dax.addr, dax.size); spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); return ret; unlock: spin_unlock_irq(&mapping->tree_lock); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); entry = radix_tree_lookup(&mapping->page_tree, pmd_index); rcu_read_unlock(); /* see if the start of our range is covered by a PMD entry */ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, pvec.pages, indices); if (pvec.nr == 0) break; for (i = 0; i < pvec.nr; i++) { if (indices[i] > end_index) { done = true; break; } ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) return ret; } } wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct 
vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; pgoff_t size; int error; i_mmap_lock_read(mapping); /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the * file. We can't tell the filesystem to free it because we can't * take i_mutex here. In the worst case, the file still has blocks * allocated past the end of the file. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { error = -EIO; goto out; } if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, vmf->flags & FAULT_FLAG_WRITE); if (error) goto out; error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); return error; } /** * __dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * @complete_unwritten: The filesystem method used to convert unwritten blocks * to written so the data written to them is exposed. This is required for * required by write faults for filesystems that will return unwritten * extent mappings from @get_block, but it is optional for reads as * dax_insert_mapping() will always zero unwritten blocks. If the fs does * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. 
__dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; int error; int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { put_page(page); return VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { /* * We have a struct page covering a hole in the file * from a read fault and we've raced with a truncate */ error = -EIO; goto unlock_page; } } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? 
*/ if (error) goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) goto unlock_page; } else { return dax_load_hole(mapping, page, vmf); } } if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) goto unlock_page; vmf->page = page; if (!page) { i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); error = -EIO; goto out; } } return VM_FAULT_LOCKED; } /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, PAGE_SIZE, 0); delete_from_page_cache(page); unlock_page(page); put_page(page); page = NULL; } /* * If we successfully insert the new mapping over an unwritten extent, * we need to ensure we convert the unwritten extent. If there is an * error inserting the mapping, the filesystem needs to leave it as * unwritten to prevent exposure of the stale underlying data to * userspace, but we still need to call the completion function so * the private resources on the mapping buffer can be released. We * indicate what the callback should do via the uptodate variable, same * as for normal BH based IO completions. 
*/ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) complete_unwritten(&bh, !error); else WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; unlock_page: if (page) { unlock_page(page); put_page(page); } goto out; }
/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Pages that are still mapped into page tables are unmapped before they are
 * invalidated.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	pgoff_t entry_idx[PAGEVEC_SIZE];
	struct pagevec batch;
	pgoff_t cursor = start;
	int bulk_unmapped = 0;
	int result = 0;
	int i;

	cleancache_invalidate_inode(mapping);
	pagevec_init(&batch, 0);

	for (;;) {
		/* Stop once past the range or when the lookup comes up empty. */
		if (cursor > end)
			break;
		if (!pagevec_lookup_entries(&batch, mapping, cursor,
				min(end - cursor, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
				entry_idx))
			break;

		for (i = 0; i < pagevec_count(&batch); i++) {
			struct page *page = batch.pages[i];
			int err;

			/* We rely upon deletion not changing page->index */
			cursor = entry_idx[i];
			if (cursor > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, cursor, page);
				continue;
			}

			lock_page(page);
			WARN_ON(page->index != cursor);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);

			if (page_mapped(page)) {
				loff_t zap_len;

				if (bulk_unmapped) {
					/* Just zap this page */
					zap_len = PAGE_CACHE_SIZE;
				} else {
					/* Zap the rest of the file in one hit. */
					zap_len = (loff_t)(1 + end - cursor)
							<< PAGE_CACHE_SHIFT;
					bulk_unmapped = 1;
				}
				unmap_mapping_range(mapping,
					(loff_t)cursor << PAGE_CACHE_SHIFT,
					zap_len, 0);
			}
			BUG_ON(page_mapped(page));

			/* Launder first; only then try to drop the page. */
			err = do_launder_page(mapping, page);
			if (err == 0 && !invalidate_complete_page2(mapping, page))
				err = -EBUSY;
			if (err < 0)
				result = err;
			unlock_page(page);
		}

		pagevec_remove_exceptionals(&batch);
		pagevec_release(&batch);
		cond_resched();
		cursor++;
	}

	cleancache_invalidate_inode(mapping);
	return result;
}
/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * In range_cyclic mode the walk starts at mapping->writeback_index and, if it
 * runs off the end without being stopped, wraps once to cover the indices
 * before the starting point.
 *
 * Returns 0 when the walk completes or is skipped because of congestion in
 * nonblocking mode; otherwise the non-zero value from @writepage (other than
 * AOP_WRITEPAGE_ACTIVATE) that aborted the walk.
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;

	/* Nonblocking callers back off immediately from a congested device. */
	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		/*
		 * Starting at index 0 already covers everything, so there
		 * is nothing to wrap around to.
		 */
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
retry:
	/* done_index tracks where a later call should resume. */
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		/*
		 * Fetch the next batch of dirty-tagged pages, capped by
		 * PAGEVEC_SIZE and by the indices left up to 'end'.
		 */
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			      PAGECACHE_TAG_DIRTY,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			/* Resume after this page, whatever happens to it. */
			done_index = page->index + 1;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data integrity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
				/*
				 * Shared "skip this page" exit: the gotos
				 * below jump here with the page still locked.
				 */
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			/*
			 * Page already under I/O: wait for it unless this is
			 * WB_SYNC_NONE writeback, in which case skip it.
			 */
			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			/*
			 * NOTE(review): the page is not unlocked here on the
			 * success path, so @writepage presumably unlocks it
			 * itself -- confirm against the callback contract.
			 */
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					/*
					 * Not treated as an error: unlock the
					 * page ourselves and carry on.
					 */
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file. This has consequences for
					 * range_cyclic semantics (ie. it may
					 * not be suitable for data integrity
					 * writeout).
					 */
					done = 1;
					break;
				}
			}

			/*
			 * Only count down while budget remains, so
			 * nr_to_write is never driven below zero here.
			 */
			if (wbc->nr_to_write > 0) {
				wbc->nr_to_write--;
				if (wbc->nr_to_write == 0 &&
				    wbc->sync_mode == WB_SYNC_NONE) {
					/*
					 * We stop writing back only if we are
					 * not doing integrity sync. In case of
					 * integrity sync we have to keep going
					 * because someone may be concurrently
					 * dirtying pages, and we might have
					 * synced a lot of newly appeared dirty
					 * pages, but have not synced all of the
					 * old dirty pages.
					 */
					done = 1;
					break;
				}
			}

			/* Re-check congestion after each page when nonblocking. */
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				done = 1;
				break;
			}
		}
		/* Drop the batch before possibly rescheduling. */
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}

	/*
	 * Save the resume point: always in range_cyclic mode, and for a
	 * whole-file request only while nr_to_write is still positive.
	 */
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
/*
 * This routine is called to find out and return a data or hole offset
 * from the page cache for unwritten extents according to the desired
 * type for xfs_seek_data() or xfs_seek_hole().
 *
 * The argument offset is used to tell where we start to search from the
 * page cache.  Map is used to figure out the end points of the range to
 * lookup pages: the search covers bytes [*offset, endoff), where endoff is
 * the first byte past the extent, i.e. page indices [index, end] inclusive.
 *
 * For a hole search, any stretch of the range with no page in the cache (or
 * with a page that was skipped because it is truncated or has no buffers)
 * counts as a hole, including gaps between non-contiguous cached pages.
 *
 * Return true if the desired type of offset was found, and the argument
 * offset is filled with that address.  Otherwise, return false and keep
 * offset unchanged.
 */
STATIC bool
xfs_find_get_desired_pgoff(
	struct inode		*inode,
	struct xfs_bmbt_irec	*map,
	unsigned int		type,
	loff_t			*offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct pagevec		pvec;
	pgoff_t			index;
	pgoff_t			end;
	loff_t			endoff;
	loff_t			startoff = *offset;
	loff_t			lastoff = startoff;
	bool			found = false;

	pagevec_init(&pvec, 0);

	index = startoff >> PAGE_CACHE_SHIFT;
	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
	/*
	 * endoff is exclusive, so the last page of interest is the one that
	 * holds byte endoff - 1.  Using endoff itself would point one page
	 * past the range whenever endoff is page aligned.
	 */
	end = (endoff - 1) >> PAGE_CACHE_SHIFT;
	do {
		int		want;
		unsigned	nr_pages;
		unsigned int	i;

		/*
		 * [index, end] is inclusive, so there is always at least one
		 * page to ask for; never request more than PAGEVEC_SIZE.
		 */
		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  want);
		/*
		 * No page mapped into the rest of the range.  A data search
		 * has found nothing; a hole search is resolved by the check
		 * after the loop.
		 */
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page	*page = pvec.pages[i];
			loff_t		b_offset;

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL),
			 * or even swizzled back from swapper_space to tmpfs
			 * file mapping.  However, page->index will not change
			 * because we have a reference on the page.
			 *
			 * If this page starts beyond the point we have
			 * verified so far, nothing is cached in between:
			 * that gap is a hole.  This covers a gap before the
			 * first page, between any two pages, and before a
			 * page that lies past the end of the range.
			 */
			if (type == HOLE_OFF && lastoff < endoff &&
			    lastoff < page_offset(page)) {
				found = true;
				*offset = lastoff;
				goto out;
			}

			/* Searching done if the page index is out of range. */
			if (page->index > end)
				goto out;

			lock_page(page);
			/*
			 * Page truncated or invalidated(page->mapping == NULL).
			 * We can freely skip it and proceed to check the next
			 * page.
			 */
			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			found = xfs_lookup_buffer_offset(page, &b_offset, type);
			if (found) {
				/*
				 * The found offset may be less than the start
				 * point to search if this is the first time to
				 * come here.
				 */
				*offset = max_t(loff_t, startoff, b_offset);
				unlock_page(page);
				goto out;
			}

			/*
			 * We either searching data but nothing was found, or
			 * searching hole but found a data buffer.  In either
			 * case, probably the next page contains the desired
			 * things, update the last offset to it so.
			 */
			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * Fewer pages than requested means the lookup ran out of
		 * cached pages: the search is done.
		 */
		if (nr_pages < want)
			break;

		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

	/*
	 * No page at lastoff and we have not reached the end of the range:
	 * for a hole search that is a hole.
	 */
	if (type == HOLE_OFF && lastoff < endoff) {
		*offset = lastoff;
		found = true;
	}
out:
	pagevec_release(&pvec);
	return found;
}