static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { struct page *page; int err; if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { page = new_inode_page(inode, name); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { err = make_empty_dir(inode, dir, page); if (err) goto error; } err = f2fs_init_acl(inode, dir, page); if (err) goto put_error; err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; wait_on_page_writeback(page); } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; wait_on_page_writeback(page); set_cold_node(inode, page); } init_dent_inode(name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { file_lost_pino(inode); inc_nlink(inode); } return page; put_error: f2fs_put_page(page, 1); error: remove_inode_page(inode); return ERR_PTR(err); }
/* * It only removes the dentry from the dentry page,corresponding name * entry in name page does not need to be touched during deletion. */ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *inode) { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; lock_page(page); wait_on_page_writeback(page); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; for (i = 0; i < slots; i++) test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode && S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); } else { mark_inode_dirty(dir); } if (inode) { inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); i_size_write(inode, 0); } update_inode_page(inode); if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); else release_orphan_inode(sbi); } if (bit_pos == NR_DENTRY_IN_BLOCK) { truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); }
/** * invalidate_inode_pages2 - remove all unmapped pages from an address_space * @mapping - the address_space * * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case * where the page is seen to be mapped into process pagetables. In that case, * the page is marked clean but is left attached to its address_space. * * The page is also marked not uptodate so that a subsequent pagefault will * perform I/O to bringthe page's contents back into sync with its backing * store. * * FIXME: invalidate_inode_pages2() is probably trivially livelockable. */ void invalidate_inode_pages2(struct address_space *mapping) { struct pagevec pvec; pgoff_t next = 0; int i; pagevec_init(&pvec, 0); while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; lock_page(page); if (page->mapping == mapping) { /* truncate race? */ wait_on_page_writeback(page); next = page->index + 1; if (page_mapped(page)) { clear_page_dirty(page); ClearPageUptodate(page); } else { if (!invalidate_complete_page(mapping, page)) { clear_page_dirty(page); ClearPageUptodate(page); } } } unlock_page(page); } pagevec_release(&pvec); cond_resched(); } }
/* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping = page_mapping(page); lock_page(page); WARN_ON(!PageUptodate(page)); /* * At least for ext2 with nobh option, we need to wait on writeback * completing on this page, since we'll remove it from the pagecache. * Otherwise truncate wont wait on the page, allowing the disk * blocks to be reused by someone else before we actually wrote our * data to them. fs corruption ensues. */ wait_on_page_writeback(page); if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); if (!remove_mapping(mapping, page)) { unlock_page(page); return 1; } buf->flags |= PIPE_BUF_FLAG_LRU; return 0; }
static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, int nr_pages, pgoff_t end) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset = i_size & (PAGE_CACHE_SIZE-1); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); int i; int ret; ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; lock_page(page); if (unlikely(page->mapping != mapping)) { unlock_page(page); continue; } if (!wbc->range_cyclic && page->index > end) { ret = 1; unlock_page(page); continue; } if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); if (PageWriteback(page) || !clear_page_dirty_for_io(page)) { unlock_page(page); continue; } /* Is the page fully outside i_size? (truncate in progress) */ if (page->index > end_index || (page->index == end_index && !offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); continue; } ret = __gfs2_jdata_writepage(page, wbc); if (ret || (--(wbc->nr_to_write) <= 0)) ret = 1; } gfs2_trans_end(sdp); return ret; }
static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_dentry->d_inode; struct nilfs_transaction_info ti; int ret; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); return VM_FAULT_NOPAGE; } if (PageMappedToDisk(page)) goto mapped; if (page_has_buffers(page)) { struct buffer_head *bh, *head; int fully_mapped = 1; bh = head = page_buffers(page); do { if (!buffer_mapped(bh)) { fully_mapped = 0; break; } } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { SetPageMappedToDisk(page); goto mapped; } } unlock_page(page); ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); if (unlikely(ret)) return VM_FAULT_SIGBUS; ret = block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret != VM_FAULT_LOCKED) { nilfs_transaction_abort(inode->i_sb); return ret; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_on_page_writeback(page); return VM_FAULT_LOCKED; }
/* added: begin address space operations definitions */ static int wrapfs_writepage(struct page *page, struct writeback_control *wbc){ int err = -EIO; struct inode *inode; struct inode *lower_inode; struct page *lower_page; struct address_space *lower_mapping; /* lower inode mapping */ gfp_t mask; BUG_ON(!PageUptodate(page)); inode = page->mapping->host; if (!inode || !WRAPFS_I(inode)){ err = 0; goto out; } lower_inode = wrapfs_lower_inode(inode); lower_mapping = lower_inode->i_mapping; mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS); lower_page = find_or_create_page(lower_mapping, page->index, mask); if (!lower_page) { err = 0; set_page_dirty(page); goto out; } copy_highpage(lower_page, page); flush_dcache_page(lower_page); SetPageUptodate(lower_page); set_page_dirty(lower_page); if (wbc->for_reclaim) { unlock_page(lower_page); goto out_release; } BUG_ON(!lower_mapping->a_ops->writepage); wait_on_page_writeback(lower_page); /* prevent multiple writers */ clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */ err = lower_mapping->a_ops->writepage(lower_page, wbc); if (err < 0) goto out_release; if (err == AOP_WRITEPAGE_ACTIVATE) { err = 0; unlock_page(lower_page); } fsstack_copy_attr_times(inode, lower_inode); out_release: page_cache_release(lower_page); out: unlock_page(page); return err; }
void reiser4_wait_page_writeback(struct page *page) { assert("zam-783", PageLocked(page)); do { unlock_page(page); wait_on_page_writeback(page); lock_page(page); } while (PageWriteback(page)); }
int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; int werr = 0; struct page *page; struct inode *btree_inode = root->fs_info->btree_inode; u64 start = 0; u64 end; unsigned long index; while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, mark); if (ret) break; clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { index = start >> PAGE_CACHE_SHIFT; start = (u64)(index + 1) << PAGE_CACHE_SHIFT; page = find_get_page(btree_inode->i_mapping, index); if (!page) continue; if (PageDirty(page)) { btree_lock_page_hook(page); wait_on_page_writeback(page); err = write_one_page(page, 0); if (err) werr = err; } wait_on_page_writeback(page); page_cache_release(page); cond_resched(); } } if (err) werr = err; return werr; }
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); wait_on_page_writeback(page); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); f2fs_put_page(page, 1); }
void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_node *rn; if (IS_ERR(ipage)) return; wait_on_page_writeback(ipage); /* copy name info. to this inode page */ rn = (struct f2fs_node *)page_address(ipage); rn->i.i_namelen = cpu_to_le32(name->len); memcpy(rn->i.i_name, name->name, name->len); set_page_dirty(ipage); }
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); wait_on_page_writeback(page); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); /* update parent inode number before releasing dentry page */ F2FS_I(inode)->i_pino = dir->i_ino; f2fs_put_page(page, 1); }
/* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping; lock_page(page); mapping = page_mapping(page); if (mapping) { WARN_ON(!PageUptodate(page)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this page, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * page, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ wait_on_page_writeback(page); if (PagePrivate(page) && try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, page)) { buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } } /* * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ out_unlock: unlock_page(page); return 1; }
void nilfs_btnode_delete(struct buffer_head *bh) { struct address_space *mapping; struct page *page = bh->b_page; pgoff_t index = page_index(page); int still_dirty; page_cache_get(page); lock_page(page); wait_on_page_writeback(page); nilfs_forget_buffer(bh); still_dirty = PageDirty(page); mapping = page->mapping; unlock_page(page); page_cache_release(page); if (!still_dirty && mapping) invalidate_inode_pages2_range(mapping, index, index); }
/* * Notification that a PTE pointing to an NFS page is about to be made * writable, implying that someone is about to modify the page through a * shared-writable mapping */ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct file *filp = vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; int ret = VM_FAULT_NOPAGE; struct address_space *mapping; dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE); lock_page(page); mapping = page_file_mapping(page); if (mapping != inode->i_mapping) goto out_unlock; wait_on_page_writeback(page); pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; ret = VM_FAULT_LOCKED; if (nfs_flush_incompatible(filp, page) == 0 && nfs_updatepage(filp, page, 0, pagelen) == 0) goto out; ret = VM_FAULT_SIGBUS; out_unlock: unlock_page(page); out: return ret; }
/* * Notification that a PTE pointing to an NFS page is about to be made * writable, implying that someone is about to modify the page through a * shared-writable mapping */ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; int ret = VM_FAULT_NOPAGE; struct address_space *mapping; dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, filp->f_mapping->host->i_ino, (long long)page_offset(page)); /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); lock_page(page); mapping = page_file_mapping(page); if (mapping != dentry->d_inode->i_mapping) goto out_unlock; wait_on_page_writeback(page); pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; ret = VM_FAULT_LOCKED; if (nfs_flush_incompatible(filp, page) == 0 && nfs_updatepage(filp, page, 0, pagelen) == 0) goto out; ret = VM_FAULT_SIGBUS; out_unlock: unlock_page(page); out: return ret; }
/** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int cycled; int range_whole = 0; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; return 0; } pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } retry: done_index = index; while (!done && (index <= end)) { int i; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* * At this point, the page may be truncated or * invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping. However, page->index will not change * because we have a reference on the page. */ if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ done = 1; break; } done_index = page->index + 1; lock_page(page); /* * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no * real expectation of this data interity operation * even if there is now a new, dirty page at the same * pagecache address. */ if (unlikely(page->mapping != mapping)) { continue_unlock: unlock_page(page); continue; } if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); else goto continue_unlock; } BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); ret = 0; } else { /* * done_index is set past this page, * so media errors will not choke * background writeout for the entire * file. This has consequences for * range_cyclic semantics (ie. it may * not be suitable for data integrity * writeout). */ done = 1; break; } } if (wbc->nr_to_write > 0) { wbc->nr_to_write--; if (wbc->nr_to_write == 0 && wbc->sync_mode == WB_SYNC_NONE) { /* * We stop writing back only if we are * not doing integrity sync. In case of * integrity sync we have to keep going * because someone may be concurrently * dirtying pages, and we might have * synced a lot of newly appeared dirty * pages, but have not synced all of the * old dirty pages. */ done = 1; break; } } if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; break; } } pagevec_release(&pvec); cond_resched(); } if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ cycled = 1; index = 0; end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret; }
/** * write_one_page - write out a single page and optionally wait on I/O * @page: the page to write * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * * write_one_page() returns a negative error code if I/O failed. */ int write_one_page(struct page *page, int wait) { struct address_space *mapping = page->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } page_cache_release(page); } else { unlock_page(page); } return ret; } EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) SetPageDirty(page); return 0; } /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. * * Most callers have locked the page, which pins the address_space in memory. * But zap_pte_range() does not lock the page, however in that case the * mapping is pinned by the vma's ->vm_file reference. * * We take care to handle the case where the page was truncated from the * mapping by re-checking page_mapping() inside tree_lock. */ int __set_page_dirty_nobuffers(struct page *page) { if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; if (!mapping) return 1; spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* * When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via * redirty_page_for_writepage() and it should then unlock the page and return 0 */ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { wbc->pages_skipped++; return __set_page_dirty_nobuffers(page); }
static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, int nr_pages, pgoff_t end, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); int i; int ret; ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; /* * At this point, the page may be truncated or * invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping. However, page->index will not change * because we have a reference on the page. */ if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ ret = 1; break; } *done_index = page->index; lock_page(page); if (unlikely(page->mapping != mapping)) { continue_unlock: unlock_page(page); continue; } if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); else goto continue_unlock; } BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = __gfs2_jdata_writepage(page, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); ret = 0; } else { /* * done_index is set past this page, * so media errors will not choke * background writeout for the entire * file. This has consequences for * range_cyclic semantics (ie. it may * not be suitable for data integrity * writeout). */ *done_index = page->index + 1; ret = 1; break; } } /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { ret = 1; break; } } gfs2_trans_end(sdp); return ret; }
/** * write_one_page - write out a single page and optionally wait on I/O * @page: the page to write * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * * write_one_page() returns a negative error code if I/O failed. */ int write_one_page(struct page *page, int wait) { struct address_space *mapping = page->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } page_cache_release(page); } else { unlock_page(page); } return ret; } EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) return !TestSetPageDirty(page); return 0; } /* * Helper function for set_page_dirty family. * NOTE: This relies on being atomic wrt interrupts. */ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for set_page_writeback family. * NOTE: Unlike account_page_dirtied this does not rely on being atomic * wrt interrupts. */ void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); }
/**ltl 功能:遍历给定地址空间的"脏"页面,写这些页面 参数: 返回值: 说明: */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index;/*要冲刷页面的索引*/ pgoff_t end; /* Inclusive *//*冲刷最后一个页面的索引,-1表示要循环冲刷*/ pgoff_t done_index; int cycled;/*主要用在回绕需要分成两段进行冲刷的情况下,为1表示前一段冲刷已经完成。*/ int range_whole = 0; int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) {//是否要循环进行冲刷 writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; /*在地址空间中查找设备了PAGECACHE_TAG_DIRTY标志的页面,将结果保存在pagevec中*/ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; //对找到的页面进行处理。 for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* * At this point, the page may be truncated or * invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping. However, page->index will not change * because we have a reference on the page. */ if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ done = 1; break; } done_index = page->index + 1; //页面加锁 lock_page(page); /* * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no * real expectation of this data interity operation * even if there is now a new, dirty page at the same * pagecache address. */ /*由于在加锁过程中可能其它进程对页面做过改动,因此要做以下判断*/ if (unlikely(page->mapping != mapping)) {//页面无效 continue_unlock: unlock_page(page); continue; } if (!PageDirty(page)) {//页面回写完成,I_DIRTY标志已经清除。 /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) {//页面正在回写中,那要根据sync_mode采取策略 if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page);//要等待正在回写完成后才继续 else goto continue_unlock; } BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; trace_wbc_writepage(wbc, mapping->backing_dev_info); //开始回写"脏"页面 ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); ret = 0; } else { /* * done_index is set past this page, * so media errors will not choke * background writeout for the entire * file. This has consequences for * range_cyclic semantics (ie. it may * not be suitable for data integrity * writeout). */ done = 1; break; } } /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ /*页面写成功后,递减计数器*/ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } } pagevec_release(&pvec); cond_resched(); } if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ cycled = 1; index = 0; end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret; }
/* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); struct page *page; int err = 0; int i; dentry_hash = f2fs_dentry_hash(name->name, name->len); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { level = F2FS_I(dir)->clevel; F2FS_I(dir)->chash = 0; } start: if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; nbucket = dir_buckets(level); nblock = bucket_blocks(level); bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(dentry_blk, slots); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: wait_on_page_writeback(dentry_page); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); memcpy(dentry_blk->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); for (i = 0; i < slots; i++) test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); set_page_dirty(dentry_page); /* we don't need to mark_inode_dirty now */ F2FS_I(inode)->i_pino = dir->i_ino; update_inode(inode, page); f2fs_put_page(page, 1); update_parent_metadata(dir, inode, current_depth); fail: clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; }
static int wrapfs_writepage(struct page *page, struct writeback_control *wbc) { int err = -EIO; struct inode *inode; struct inode *lower_inode; struct page *lower_page; struct address_space *lower_mapping; /* lower inode mapping */ gfp_t mask; /*printk(KERN_ALERT "in writepage() \n");*/ BUG_ON(!PageUptodate(page)); inode = page->mapping->host; /* if no lower inode, nothing to do */ if (!inode || !WRAPFS_I(inode) || WRAPFS_I(inode)->lower_inode) { err = 0; goto out; } lower_inode = wrapfs_lower_inode(inode); lower_mapping = lower_inode->i_mapping; /* * find lower page (returns a locked page) * * We turn off __GFP_FS while we look for or create a new lower * page. This prevents a recursion into the file system code, which * under memory pressure conditions could lead to a deadlock. This * is similar to how the loop driver behaves (see loop_set_fd in * drivers/block/loop.c). If we can't find the lower page, we * redirty our page and return "success" so that the VM will call us * again in the (hopefully near) future. */ mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS); lower_page = find_or_create_page(lower_mapping, page->index, mask); if (!lower_page) { err = 0; set_page_dirty(page); goto out; } /* copy page data from our upper page to the lower page */ copy_highpage(lower_page, page); flush_dcache_page(lower_page); SetPageUptodate(lower_page); set_page_dirty(lower_page); /* * Call lower writepage (expects locked page). However, if we are * called with wbc->for_reclaim, then the VFS/VM just wants to * reclaim our page. Therefore, we don't need to call the lower * ->writepage: just copy our data to the lower page (already done * above), then mark the lower page dirty and unlock it, and return * success. */ if (wbc->for_reclaim) { unlock_page(lower_page); goto out_release; } BUG_ON(!lower_mapping->a_ops->writepage); wait_on_page_writeback(lower_page); /* prevent multiple writers */ clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */ err = lower_mapping->a_ops->writepage(lower_page, wbc); if (err < 0) goto out_release; /* * Lower file systems such as ramfs and tmpfs, may return * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly) * write the page again for a while. But those lower file systems * also set the page dirty bit back again. Since we successfully * copied our page data to the lower page, then the VM will come * back to the lower page (directly) and try to flush it. So we can * save the VM the hassle of coming back to our page and trying to * flush too. Therefore, we don't re-dirty our own page, and we * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider * this a success). * * We also unlock the lower page if the lower ->writepage returned * AOP_WRITEPAGE_ACTIVATE. (This "anomalous" behaviour may be * addressed in future shmem/VM code.) */ if (err == AOP_WRITEPAGE_ACTIVATE) { err = 0; unlock_page(lower_page); } /* all is well */ /* lower mtimes have changed: update ours */ /* fsstack_copy_inode_size(dentry->d_inode, lower_file->f_path.dentry->d_inode); fsstack_copy_attr_times(dentry->d_inode, lower_file->f_path.dentry->d_inode); */ out_release: /* b/c find_or_create_page increased refcnt */ page_cache_release(lower_page); out: /* * We unlock our page unconditionally, because we never return * AOP_WRITEPAGE_ACTIVATE. */ unlock_page(page); return err; }
struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino) { struct f2fs_iget_args args = { .ino = ino, .on_free = 0 }; struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args); if (inode) return inode; if (!args.on_free) return f2fs_iget(sb, ino); return ERR_PTR(-ENOENT); } static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ check_nid_range(sbi, inode->i_ino); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); rn = page_address(node_page); ri = &(rn->i); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); inode->i_blocks = le64_to_cpu(ri->i_blocks); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); f2fs_put_page(node_page, 1); return 0; } struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int ret; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; ret = do_read_inode(inode); if (ret) goto bad_inode; if (!sbi->por_doing && inode->i_nlink == 0) { ret = -ENOENT; goto bad_inode; } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | __GFP_ZERO); } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &f2fs_special_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { ret = -EIO; goto bad_inode; } unlock_new_inode(inode); return inode; bad_inode: iget_failed(inode); return ERR_PTR(ret); } void update_inode(struct inode *inode, struct page *node_page) { struct f2fs_node *rn; struct f2fs_inode *ri; wait_on_page_writeback(node_page); rn = page_address(node_page); ri = &(rn->i); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); set_cold_node(inode, node_page); set_page_dirty(node_page); } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; bool need_lock = false; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) return 0; if (wbc) f2fs_balance_fs(sbi); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); if (!PageDirty(node_page)) { need_lock = true; f2fs_put_page(node_page, 1); mutex_lock(&sbi->write_inode); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { mutex_unlock(&sbi->write_inode); return PTR_ERR(node_page); } } update_inode(inode, node_page); f2fs_put_page(node_page, 1); if (need_lock) mutex_unlock(&sbi->write_inode); return 0; } /* * Called at the last iput() if i_nlink is zero */ void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); truncate_inode_pages(&inode->i_data, 0); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); remove_inode_page(inode); no_delete: clear_inode(inode); }
/** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidatepage() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ partial_start = lstart & (PAGE_CACHE_SIZE - 1); partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_CACHE_SHIFT; pagevec_init(&pvec, 0); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) break; if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } if (!trylock_page(page)) continue; WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; } truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); index++; } if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { unsigned int top = PAGE_CACHE_SIZE; if (start > end) { /* Truncation within a single page */ top = partial_end; partial_end = 0; } wait_on_page_writeback(page); zero_user_segment(page, partial_start, top); cleancache_invalidate_page(mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial_start, top - partial_start); unlock_page(page); page_cache_release(page); } } if (partial_end) { struct page *page = find_lock_page(mapping, end); if (page) { wait_on_page_writeback(page); zero_user_segment(page, 0, partial_end); cleancache_invalidate_page(mapping, page); if (page_has_private(page)) do_invalidatepage(page, 0, partial_end); unlock_page(page); page_cache_release(page); } } /* * If the truncation happened within a single page no pages * will be released, just zeroed, so we can bail out now. */ if (start >= end) return; index = start; for ( ; ; ) { cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } if (index == start && indices[0] >= end) { /* All gone out of hole to be punched, we're done */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) { /* Restart punch to make sure all gone */ index = start - 1; break; } if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); index++; } cleancache_invalidate_inode(mapping); }
/** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index > end) break; if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { /* * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, (loff_t)index << PAGE_CACHE_SHIFT, (loff_t)(1 + end - index) << PAGE_CACHE_SHIFT, 0); did_range_unmap = 1; } else { /* * Just zap this page */ unmap_mapping_range(mapping, (loff_t)index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE, 0); } } BUG_ON(page_mapped(page)); ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); index++; } cleancache_invalidate_inode(mapping); return ret; }
static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; struct gfs2_qadata *qa; loff_t size; int ret; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; unlock_page(page); } goto out_unlock; } ret = -ENOMEM; qa = gfs2_qadata_get(ip); if (qa == NULL) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (ret) goto out_quota_unlock; rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) goto out_trans_fail; lock_page(page); ret = -EINVAL; size = i_size_read(inode); last_index = (size - 1) >> PAGE_CACHE_SHIFT; if (size == 0 || (page->index > last_index)) goto out_trans_end; ret = -EAGAIN; if (!PageUptodate(page) || page->mapping != inode->i_mapping) goto out_trans_end; ret = 0; if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) ret = gfs2_allocate_page_backing(page); out_trans_end: if (ret) unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); out_alloc_put: gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); if (inode->i_sb->s_frozen == SB_UNFROZEN) { wait_on_page_writeback(page); } else { ret = -EAGAIN; unlock_page(page); } } return block_page_mkwrite_return(ret); }
/* * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. */ static int try_to_unuse(unsigned int type) { struct swap_info_struct * si = &swap_info[type]; struct mm_struct *start_mm; unsigned short *swap_map; unsigned short swcount; struct page *page; swp_entry_t entry; unsigned int i = 0; int retval = 0; int reset_overflow = 0; int shmem; /* * When searching mms for an entry, a good strategy is to * start at the first mm we freed the previous entry from * (though actually we don't notice whether we or coincidence * freed the entry). Initialize this start_mm with a hold. * * A simpler strategy would be to start at the last mm we * freed the previous entry from; but that would take less * advantage of mmlist ordering, which clusters forked mms * together, child after parent. If we race with dup_mmap(), we * prefer to resolve parent before child, lest we miss entries * duplicated after we scanned child: using last mm would invert * that. Though it's only a serious concern when an overflowed * swap count is reset from SWAP_MAP_MAX, preventing a rescan. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); /* * Keep on scanning until all entries have gone. Usually, * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ while ((i = find_next_to_unuse(si, i)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; } /* * Get a page for the entry, using the existing swap * cache page if there is one. Otherwise, get a clean * page and read the swap into it. */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* * Either swap_duplicate() failed because entry * has been freed independently, and will not be * reused since sys_swapoff() already disabled * allocation from here, or alloc_page() failed. */ if (!*swap_map) continue; retval = -ENOMEM; break; } /* * Don't hold on to start_mm if it looks like exiting. */ if (atomic_read(&start_mm->mm_users) == 1) { mmput(start_mm); start_mm = &init_mm; atomic_inc(&init_mm.mm_users); } /* * Wait for and lock page. When do_swap_page races with * try_to_unuse, do_swap_page can handle the fault much * faster than try_to_unuse can locate the entry. This * apparently redundant "wait_on_page_locked" lets try_to_unuse * defer to do_swap_page in such a case - in some tests, * do_swap_page and try_to_unuse repeatedly compete. */ wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); wait_on_page_writeback(page); /* * Remove all references to entry. * Whenever we reach init_mm, there's no address space * to search, but use it as a reminder to search shmem. */ shmem = 0; swcount = *swap_map; if (swcount > 1) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; struct mm_struct *prev_mm = start_mm; struct mm_struct *mm; atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); while (*swap_map > 1 && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; cond_resched(); swcount = *swap_map; if (swcount <= 1) ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; set_start_mm = 0; } spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); mmput(start_mm); start_mm = new_start_mm; } if (retval) { unlock_page(page); page_cache_release(page); break; } /* * How could swap count reach 0x7fff when the maximum * pid is 0x7fff, and there's no way to repeat a swap * page within an mm (except in shmem, where it's the * shared object which takes the reference count)? * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. * * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ if (*swap_map == SWAP_MAP_MAX) { spin_lock(&swap_lock); *swap_map = 1; spin_unlock(&swap_lock); reset_overflow = 1; } /* * If a reference remains (rare), we would like to leave * the page in the swap cache; but try_to_unmap could * then re-duplicate the entry once we drop page lock, * so we might loop indefinitely; also, that page could * not be swapped out to other storage meanwhile. So: * delete from cache even if there's another reference, * after ensuring that the data has been saved to disk - * since if the reference remains (rarer), it will be * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. * * Note shmem_unuse already deleted a swappage from * the swap cache, unless the move to filepage failed: * in which case it left swappage in cache, lowered its * swap count to pass quickly through the loops above, * and now we must reincrement count to try again later. */ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; swap_writepage(page, &wbc); lock_page(page); wait_on_page_writeback(page); } if (PageSwapCache(page)) { if (shmem) swap_duplicate(entry); else delete_from_swap_cache(page); } /* * So we could skip searching mms once swap count went * to 1, we did not mark any present ptes as dirty: must * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); unlock_page(page); page_cache_release(page); /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); }
static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; loff_t size; int ret; sb_start_pagefault(inode->i_sb); /* Update file times before taking page lock */ file_update_time(vma->vm_file); ret = gfs2_rs_alloc(ip); if (ret) return ret; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; unlock_page(page); } goto out_unlock; } ret = gfs2_rindex_update(sdp); if (ret) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); if (ret) goto out_quota_unlock; rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) goto out_trans_fail; lock_page(page); ret = -EINVAL; size = i_size_read(inode); last_index = (size - 1) >> PAGE_CACHE_SHIFT; /* Check page index against inode size */ if (size == 0 || (page->index > last_index)) goto out_trans_end; ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. */ if (!PageUptodate(page) || page->mapping != inode->i_mapping) goto out_trans_end; /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) ret = gfs2_allocate_page_backing(page); out_trans_end: if (ret) unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_on_page_writeback(page); } sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); }
/** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Truncate the page cache at a set offset, removing the pages that are beyond * that offset (and zeroing out partial pages). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * When looking at page->index outside the page lock we need to be careful to * copy it into a local to avoid races (it could change at any time). * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Called under (and serialised by) inode->i_sem. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t next; int i; if (mapping->nrpages == 0) return; pagevec_init(&pvec, 0); next = start; while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; if (page_index > next) next = page_index; next++; if (TestSetPageLocked(page)) continue; if (PageWriteback(page)) { unlock_page(page); continue; } truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); cond_resched(); } if (partial) { struct page *page = find_lock_page(mapping, start - 1); if (page) { wait_on_page_writeback(page); truncate_partial_page(page, partial); unlock_page(page); page_cache_release(page); } } next = start; for ( ; ; ) { cond_resched(); if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { if (next == start) break; next = start; continue; } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; lock_page(page); wait_on_page_writeback(page); if (page->index > next) next = page->index; next++; truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); } }