int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block) { struct buffer_head tmp; unsigned long long start_blk; long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; int ret = 0, past_eof = 0, whole_file = 0; if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) return ret; start_blk = logical_to_blk(inode, start); length = (long long)min_t(u64, len, i_size_read(inode)); if (length < len) whole_file = 1; map_len = length; do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ memset(&tmp, 0, sizeof(struct buffer_head)); tmp.b_size = map_len; ret = get_block(inode, start_blk, &tmp, 0); if (ret) break; /* HOLE */ if (!buffer_mapped(&tmp)) { length -= blk_to_logical(inode, 1); start_blk++; /* * we want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && blk_to_logical(inode, start_blk) >= blk_to_logical(inode, 0)+i_size_read(inode)) past_eof = 1; /* * first hole after going past the EOF, this is our * last extent */ if (past_eof && size) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } /* if we have holes up to/past EOF then we're done */ if (length <= 0 || past_eof) break; } else { /* * we have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * * This is for the case where say we want to map all the * way up to the second to the last block in a file, but * the last block is a hole, making the second to last * block FIEMAP_EXTENT_LAST. In this case we want to * see if there is a hole after the second to last block * so we can mark it properly. If we found data after * we exceeded the length we were requesting, then we * are good to go, just add the extent to the fieinfo * and break */ if (length <= 0 && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } /* * if size != 0 then we know we already have an extent * to add, so add it. */ if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, tmp.b_blocknr); size = tmp.b_size; flags = FIEMAP_EXTENT_MERGED; length -= tmp.b_size; start_blk += logical_to_blk(inode, size); /* * If we are past the EOF, then we need to make sure as * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ if (!past_eof && logical+size >= blk_to_logical(inode, 0)+i_size_read(inode)) past_eof = 1; } cond_resched(); } while (1); /* if ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; }
/** * vxfs_read_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong * * Description: * We are called on the first mount of a filesystem to read the * superblock into memory and do some basic setup. * * Returns: * The superblock on success, else %NULL. * * Locking: * We are under @sbp->s_lock. */ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) { struct vxfs_sb_info *infp; struct vxfs_sb *rsbp; struct buffer_head *bp = NULL; u_long bsize; struct inode *root; int ret = -EINVAL; sbp->s_flags |= MS_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); return -ENOMEM; } bsize = sb_min_blocksize(sbp, BLOCK_SIZE); if (!bsize) { printk(KERN_WARNING "vxfs: unable to set blocksize\n"); goto out; } bp = sb_bread(sbp, 1); if (!bp || !buffer_mapped(bp)) { if (!silent) { printk(KERN_WARNING "vxfs: unable to read disk superblock\n"); } goto out; } rsbp = (struct vxfs_sb *)bp->b_data; if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { if (!silent) printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); goto out; } if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", rsbp->vs_version); goto out; } #ifdef DIAGNOSTIC printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version); printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize); #endif sbp->s_magic = rsbp->vs_magic; sbp->s_fs_info = infp; infp->vsi_raw = rsbp; infp->vsi_bp = bp; infp->vsi_oltext = rsbp->vs_oltext[0]; infp->vsi_oltsize = rsbp->vs_oltsize; if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) { printk(KERN_WARNING "vxfs: unable to set final block size\n"); goto out; } if (vxfs_read_olt(sbp, bsize)) { printk(KERN_WARNING "vxfs: unable to read olt\n"); goto out; } if (vxfs_read_fshead(sbp)) { printk(KERN_WARNING "vxfs: unable to read fshead\n"); goto out; } sbp->s_op = &vxfs_super_ops; root = vxfs_iget(sbp, VXFS_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } sbp->s_root = d_alloc_root(root); if (!sbp->s_root) { iput(root); printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); goto out_free_ilist; } return 0; out_free_ilist: vxfs_put_fake_inode(infp->vsi_fship); vxfs_put_fake_inode(infp->vsi_ilist); vxfs_put_fake_inode(infp->vsi_stilist); out: brelse(bp); kfree(infp); return ret; }
int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block) { struct buffer_head tmp; unsigned int start_blk; long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; int ret = 0; if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) return ret; start_blk = logical_to_blk(inode, start); length = (long long)min_t(u64, len, i_size_read(inode)); map_len = length; do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ memset(&tmp, 0, sizeof(struct buffer_head)); tmp.b_size = map_len; ret = get_block(inode, start_blk, &tmp, 0); if (ret) break; /* HOLE */ if (!buffer_mapped(&tmp)) { /* * first hole after going past the EOF, this is our * last extent */ if (length <= 0) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } length -= blk_to_logical(inode, 1); /* if we have holes up to/past EOF then we're done */ if (length <= 0) break; start_blk++; } else { if (length <= 0 && size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, tmp.b_blocknr); size = tmp.b_size; flags = FIEMAP_EXTENT_MERGED; length -= tmp.b_size; start_blk += logical_to_blk(inode, size); /* * if we are past the EOF we need to loop again to see * if there is a hole so we can mark this extent as the * last one, and if not keep mapping things until we * find a hole, or we run out of slots in the extent * array */ if (length <= 0) continue; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } cond_resched(); } while (1); /* if ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; }
/* * Modify inode page cache in such way: * have - blocks with b_blocknr equal to oldb...oldb+count-1 * get - blocks with b_blocknr equal to newb...newb+count-1 * also we suppose that oldb...oldb+count-1 blocks * situated at the end of file. * * We can come here from ufs_writepage or ufs_prepare_write, * locked_page is argument of these functions, so we already lock it. */ static void ufs_change_blocknr(struct inode *inode, unsigned int beg, unsigned int count, unsigned int oldb, unsigned int newb, struct page *locked_page) { const unsigned mask = (1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1; struct address_space * const mapping = inode->i_mapping; pgoff_t index, cur_index; unsigned end, pos, j; struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n", inode->i_ino, count, oldb, newb); BUG_ON(!locked_page); BUG_ON(!PageLocked(locked_page)); cur_index = locked_page->index; for (end = count + beg; beg < end; beg = (beg | mask) + 1) { index = beg >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { page = ufs_get_locked_page(mapping, index); if (!page || IS_ERR(page)) /* it was truncated or EIO */ continue; } else page = locked_page; head = page_buffers(page); bh = head; pos = beg & mask; for (j = 0; j < pos; ++j) bh = bh->b_this_page; j = 0; do { if (buffer_mapped(bh)) { pos = bh->b_blocknr - oldb; if (pos < count) { UFSD(" change from %llu to %llu\n", (unsigned long long)pos + oldb, (unsigned long long)pos + newb); bh->b_blocknr = newb + pos; unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); mark_buffer_dirty(bh); ++j; } } bh = bh->b_this_page; } while (bh != head); if (j) set_page_dirty(page); if (likely(cur_index != index)) ufs_put_locked_page(page); } UFSD("EXIT\n"); }
/** * write_mft_record_nolock - write out a mapped (extent) mft record * @ni: ntfs inode describing the mapped (extent) mft record * @m: mapped (extent) mft record to write * @sync: if true, wait for i/o completion * * Write the mapped (extent) mft record @m described by the (regular or extent) * ntfs inode @ni to backing store. If the mft record @m has a counterpart in * the mft mirror, that is also updated. * * On success, clean the mft record and return 0. On error, leave the mft * record dirty and return -errno. The caller should call make_bad_inode() on * the base inode to ensure no more access happens to this inode. We do not do * it here as the caller may want to finish writing other extent mft records * first to minimize on-disk metadata inconsistencies. * * NOTE: We always perform synchronous i/o and ignore the @sync parameter. * However, if the mft record has a counterpart in the mft mirror and @sync is * true, we write the mft record, wait for i/o completion, and only then write * the mft mirror copy. This ensures that if the system crashes either the mft * or the mft mirror will contain a self-consistent mft record @m. If @sync is * false on the other hand, we start i/o on both and then wait for completion * on them. This provides a speedup but no longer guarantees that you will end * up with a self-consistent mft record in the case of a crash but if you asked * for asynchronous writing you probably do not care about that anyway. * * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just * schedule i/o via ->writepage or do it via kntfsd or whatever. */ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page = ni->page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; unsigned int block_start, block_end, m_start, m_end; int i_bhs, nr_bhs, err = 0; ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); BUG_ON(NInoAttr(ni)); BUG_ON(!max_bhs); BUG_ON(!PageLocked(page)); /* * If the ntfs_inode is clean no need to do anything. If it is dirty, * mark it as clean now so that it can be redirtied later on if needed. * There is no danger of races since the caller is holding the locks * for the mft record @m and the page it is in. */ if (!NInoTestClearDirty(ni)) goto done; /* Make sure we have mapped buffers. */ if (!page_has_buffers(page)) { no_buffers_err_out: ntfs_error(vol->sb, "Writing mft records without existing " "buffers is not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; goto err_out; } bh = head = page_buffers(page); if (!bh) goto no_buffers_err_out; nr_bhs = 0; block_start = 0; m_start = ni->page_ofs; m_end = m_start + vol->mft_record_size; do { block_end = block_start + blocksize; /* * If the buffer is outside the mft record, just skip it, * clearing it if it is dirty to make sure it is not written * out. It should never be marked dirty but better be safe. */ if ((block_end <= m_start) || (block_start >= m_end)) { if (buffer_dirty(bh)) { ntfs_warning(vol->sb, "Clearing dirty mft " "record page buffer. %s", ntfs_please_email); clear_buffer_dirty(bh); } continue; } if (!buffer_mapped(bh)) { ntfs_error(vol->sb, "Writing mft records without " "existing mapped buffers is not " "implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } if (!buffer_uptodate(bh)) { ntfs_error(vol->sb, "Writing mft records without " "existing uptodate buffers is not " "implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } BUG_ON(!nr_bhs && (m_start != block_start)); BUG_ON(nr_bhs >= max_bhs); bhs[nr_bhs++] = bh; BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); } while (block_start = block_end, (bh = bh->b_this_page) != head); if (unlikely(err)) goto cleanup_out; /* Apply the mst protection fixups. */ err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); if (err) { ntfs_error(vol->sb, "Failed to apply mst fixups!"); goto cleanup_out; } flush_dcache_mft_record_page(ni); /* Lock buffers and start synchronous write i/o on them. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; if (unlikely(test_set_buffer_locked(tbh))) BUG(); BUG_ON(!buffer_uptodate(tbh)); if (buffer_dirty(tbh)) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) sync_mft_mirror(ni, m, sync); /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; wait_on_buffer(tbh); if (unlikely(!buffer_uptodate(tbh))) { err = -EIO; /* * Set the buffer uptodate so the page & buffer states * don't become out of sync. */ if (PageUptodate(page)) set_buffer_uptodate(tbh); } } /* If @sync, now synchronize the mft mirror. */ if (sync && ni->mft_no < vol->mftmirr_size) sync_mft_mirror(ni, m, sync); /* Remove the mst protection fixups again. */ post_write_mst_fixup((NTFS_RECORD*)m); flush_dcache_mft_record_page(ni); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ ntfs_error(vol->sb, "I/O error while writing mft record " "0x%lx! Marking base inode as bad. You " "should unmount the volume and run chkdsk.", ni->mft_no); goto err_out; } done: ntfs_debug("Done."); return 0; cleanup_out: /* Clean the buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) clear_buffer_dirty(bhs[i_bhs]); err_out: /* * Current state: all buffers are clean, unlocked, and uptodate. * The caller should mark the base inode as bad so that no more i/o * happens. ->clear_inode() will still be invoked so all extent inodes * and other allocated memory will be freed. */ if (err == -ENOMEM) { ntfs_error(vol->sb, "Not enough memory to write mft record. " "Redirtying so the write is retried later."); mark_mft_record_dirty(ni); err = 0; } return err; }
/* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. */ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, struct buffer_head *map_bh, unsigned long *first_logical_block, get_block_t get_block) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; int length; int fully_mapped = 1; unsigned nblocks; unsigned relative_block; /*page是缓冲区页 : 是块设备页或者是非连续块*/ if (page_has_buffers(page)) goto confused; block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ nblocks = map_bh->b_size >> blkbits; /*如果bh已经被映射到磁盘了 此时map_bh.b_state = 0 此段代码不会执行*/ if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && block_in_file < (*first_logical_block + nblocks)) { unsigned map_offset = block_in_file - *first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr + map_offset + relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this page. */ map_bh->b_page = page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (get_block(inode, block_in_file, map_bh, 0)) goto confused; *first_logical_block = block_in_file; } /*没有映射 对应文件洞*/ if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_page copies the data * we just collected from get_block into the page's buffers * so readpage doesn't have to repeat the get_block call */ /*如果块缓存区是最新的 将其数据直接copy到page*/ if (buffer_uptodate(map_bh)) { map_buffer_to_page(page, map_bh, page_block); goto confused; } /* 遇到一个文件洞 但之后的块又映射了,这时应该将洞之前的块处理掉 */ if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole 从有空洞变成没有空洞*/ /* Contiguous blocks? */ /*get_block获取块号存放在map_bh的b_blocknr中*/ if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_page) break; /*page中连续的块存放在blocks数组中*/ blocks[page_block] = map_bh->b_blocknr+relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /*存在连续的文件空洞*/ if (first_hole != blocks_per_page) { /*将空洞的page部分清零*/ zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); goto out; } } else if (fully_mapped) { SetPageMappedToDisk(page); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) bio = mpage_bio_submit(READ, bio); alloc_new: if (bio == NULL) { /*blocks[0]<<(blkbits-9)表示该块对应的磁盘扇区号*/ bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); if (bio == NULL) goto confused; }
static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); }
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, sector_t pblocknr, struct buffer_head **pbh, int newblk) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); if (unlikely(!bh)) return -ENOMEM; err = -EEXIST; /* internal code */ if (newblk) { if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || buffer_dirty(bh))) { brelse(bh); BUG(); } memset(bh->b_data, 0, 1 << inode->i_blkbits); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); goto found; } if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); /* blocknr is a virtual block number */ err = nilfs_dat_translate(dat, blocknr, &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; } } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ goto found; } set_buffer_mapped(bh); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ err = 0; found: *pbh = bh; out_locked: unlock_page(bh->b_page); page_cache_release(bh->b_page); return err; }
static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ sb_start_pagefault(inode->i_sb); lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); ret = -EFAULT; /* make the VM retry the fault */ goto out; } /* * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) goto mapped; if (page_has_buffers(page)) { struct buffer_head *bh, *head; int fully_mapped = 1; bh = head = page_buffers(page); do { if (!buffer_mapped(bh)) { fully_mapped = 0; break; } } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { SetPageMappedToDisk(page); goto mapped; } } unlock_page(page); /* * fill hole blocks */ ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) goto out; file_update_time(vma->vm_file); ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); }
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct buffer_head map_bh; sector_t start_blk, last_blk; loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = 0; bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) return ret; mutex_lock(&inode->i_mutex); if (len >= isize) { whole_file = true; len = isize; } if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk++; if (!past_eof && blk_to_logical(inode, start_blk) >= isize) past_eof = 1; if (past_eof && size) { flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); } else if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); size = 0; } /* if we have holes up to/past EOF then we're done */ if (start_blk > last_blk || past_eof || ret) goto out; } else { if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); goto out; } /* * if size != 0 then we know we already have an extent * to add, so add it. */ if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) goto out; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, map_bh.b_blocknr); size = map_bh.b_size; flags = 0; if (buffer_unwritten(&map_bh)) flags = FIEMAP_EXTENT_UNWRITTEN; start_blk += logical_to_blk(inode, size); /* * If we are past the EOF, then we need to make sure as * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ if (!past_eof && logical + size >= isize) past_eof = true; } cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; else goto next; out: if (ret == 1) ret = 0; mutex_unlock(&inode->i_mutex); return ret; }
/* * nilfs_gccache_submit_read_data() - add data buffer and submit read request * @inode - gc inode * @blkoff - dummy offset treated as the key for the page cache * @pbn - physical block number of the block * @vbn - virtual block number of the block, 0 for non-virtual block * @out_bh - indirect pointer to a buffer_head struct to receive the results * * Description: nilfs_gccache_submit_read_data() registers the data buffer * specified by @pbn to the GC pagecache with the key @blkoff. * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. * * Return Value: On success, 0 is returned. On Error, one of the following * negative error code is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOENT - The block specified with @pbn does not exist. */ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { struct buffer_head *bh; int err; #if defined(YANQIN) struct radix_tree_root *rtree = &(NILFS_I(inode)->i_gc_blocks); struct nilfs_gc_block_info *gbi; #endif bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); if (unlikely(!bh)) return -ENOMEM; if (buffer_uptodate(bh)) goto out; if (pbn == 0) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ brelse(bh); goto failed; } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); goto out; } if (!buffer_mapped(bh)) { bh->b_bdev = inode->i_sb->s_bdev; set_buffer_mapped(bh); } bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ, bh); if (vbn) bh->b_blocknr = vbn; out: err = 0; *out_bh = bh; #if defined(YANQIN) set_bit(BH_PrivateStart, &bh->b_state); gbi = kmalloc(sizeof(struct nilfs_gc_block_info), GFP_NOFS); if (!gbi) return -ENOMEM; gbi->old_pblocknr = pbn; gbi->new_pblocknr = 0; /* To be assigned later */ err = radix_tree_insert(rtree, vbn, gbi); #endif failed: unlock_page(bh->b_page); page_cache_release(bh->b_page); return err; }
static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; struct address_space *mapping = page->mapping; struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_unmapped = blocks_per_page; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; int length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ page_block = 0; do { BUG_ON(buffer_locked(bh)); if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by * __set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; if (first_unmapped == blocks_per_page) first_unmapped = page_block; continue; } if (first_unmapped != blocks_per_page) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { if (bh->b_blocknr != blocks[page_block-1] + 1) goto confused; } blocks[page_block++] = bh->b_blocknr; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; boundary_bdev = bh->b_bdev; } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); if (first_unmapped) goto page_is_mapped; /* * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also * using mpage_readpages then this can rarely happen. */ goto confused; } /* * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) unmap_underlying_metadata(map_bh.b_bdev, map_bh.b_blocknr); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; } if (page_block) { if (map_bh.b_blocknr != blocks[page_block-1] + 1) goto confused; } blocks[page_block++] = map_bh.b_blocknr; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) break; block_in_file++; } BUG_ON(page_block == 0); first_unmapped = page_block; page_is_mapped: end_index = i_size >> PAGE_SHIFT; if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not * written out to the file." */ unsigned offset = i_size & (PAGE_SIZE - 1); if (page->index > end_index || !offset) goto confused; zero_user_segment(page, offset, PAGE_SIZE); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); alloc_new: if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), page, wbc)) { clean_buffers(page, first_unmapped); goto out; } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; wbc_init_bio(wbc, bio); } /* * Must try to add the page before marking the buffer clean or * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); goto alloc_new; } clean_buffers(page, first_unmapped); BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { mpd->last_block_in_bio = blocks[blocks_per_page - 1]; } goto out; confused: if (bio) bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); } else { ret = -EAGAIN; goto out; } /* * The caller has a ref on the inode, so *mapping is stable */ mapping_set_error(mapping, ret); out: mpd->bio = bio; return ret; }
/* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. */ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, struct buffer_head *map_bh, unsigned long *first_logical_block, struct compressed_bio **cb, get_block_t get_block) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; //SET TO 1 const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; //Increments to 1 unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; int fully_mapped = 1; unsigned nblocks; unsigned relative_block; int err; /* blkbits = 12 | MAX_BUF_PER_PAGE = 8 */ if (page_has_buffers(page)) goto confused; /* block_in_file : page->index * last_block : last page->index of requested nr_pages * last_block_in_file : always index of last_page_of_file */ block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ /* nblocks : Initially 0 | Later mapped to 1 extent so is mostly 16 */ nblocks = map_bh->b_size >> blkbits; printk(KERN_INFO "\ncurrent_page : %Lu | nblocks_initial : %u", block_in_file, nblocks); if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && block_in_file < (*first_logical_block + nblocks)) { unsigned map_offset = block_in_file - *first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr + map_offset + relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this page. */ map_bh->b_page = page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block - block_in_file) << blkbits; /* use of get_block => ***needs buffer_head map_bh * bdev = map_bh->b_dev * physical = map_bh->b_blocknr * nblocks = map_bh->b_size (no of logical blocks in extent) * compress_count = map_bh->b_private * first_logical_block */ if (get_block(inode, block_in_file, map_bh, 0)) //BLOCK_MAPPER goto confused; *first_logical_block = block_in_file; } /* generally is mapped.. so FALSE */ if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_page copies the data * we just collected from get_block into the page's buffers * so readpage doesn't have to repeat the get_block call */ //NEXT 3 => FALSE if (buffer_uptodate(map_bh)) { printk("\nIn map_buffer_to_page()"); map_buffer_to_page(page, map_bh, page_block); goto confused; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) goto confused; nblocks = map_bh->b_size >> blkbits; printk(KERN_INFO "\nnblocks_mapped : %u", nblocks); //MAIN PART if (!*cb) { *cb = kzalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!*cb) { /* ERROR */ BUG_ON(1); err = -ENOMEM; } err = compressed_bio_init(*cb, inode, *first_logical_block, *(unsigned *)map_bh->b_private, nblocks << PAGE_CACHE_SHIFT, 0);//compressed_len = 0 if (err) { /* ERROR = -ENOMEM */ err = -ENOMEM; } kfree((unsigned *)map_bh->b_private); map_bh->b_private = NULL; } for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr+relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_page) { zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); goto out; } } else if (fully_mapped) { //TRUE...REQ? SetPageMappedToDisk(page); } if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && cleancache_get_page(page) == 0) { //FALSE printk(KERN_INFO "\nSet_Page_Uptodate"); SetPageUptodate(page); goto confused; } /* * This page will go to BIO. Do we need to send this BIO off first? */ /* if (bio && (*last_block_in_bio != blocks[0] - 1)) */ /* bio = mpage_bio_submit(READ, bio); */ /* alloc_new: */ /* if (bio == NULL) { */ /* bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), */ /* min_t(int, nr_pages, bio_get_nr_vecs(bdev)), */ /* GFP_KERNEL); */ /* if (bio == NULL) */ /* goto confused; */ /* } */ /* length = first_hole << blkbits; */ /* if (bio_add_page(bio, page, length, 0) < length) { */ /* bio = mpage_bio_submit(READ, bio); */ /* goto alloc_new; */ /* } */ /* relative_block = block_in_file - *first_logical_block; */ /* nblocks = map_bh->b_size >> blkbits; */ /* if ((buffer_boundary(map_bh) && relative_block == nblocks) || */ /* (first_hole != blocks_per_page)) */ /* bio = mpage_bio_submit(READ, bio); */ /* else */ /* *last_block_in_bio = blocks[blocks_per_page - 1]; */ out: return bio; confused: printk(KERN_INFO "\nCONFUSED !"); if (bio) bio = mpage_bio_submit(READ, bio); if (!PageUptodate(page)) block_read_full_page(page, get_block); else unlock_page(page); goto out; }
int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { struct buffer_head map_bh; sector_t start_blk, last_blk; loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) return ret; /* * Either the i_mutex or other appropriate locking needs to be held * since we expect isize to not change at all through the duration of * this call. */ if (len >= isize) { whole_file = true; len = isize; } /* * Some filesystems can't deal with being asked to map less than * blocksize, so make sure our len is at least block length. */ if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk++; /* * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); } else if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); size = 0; } /* if we have holes up to/past EOF then we're done */ if (start_blk > last_blk || past_eof || ret) break; } else { /* * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * * This is for the case where say we want to map all the * way up to the second to the last block in a file, but * the last block is a hole, making the second to last * block FIEMAP_EXTENT_LAST. In this case we want to * see if there is a hole after the second to last block * so we can mark it properly. If we found data after * we exceeded the length we were requesting, then we * are good to go, just add the extent to the fieinfo * and break */ if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } /* * if size != 0 then we know we already have an extent * to add, so add it. */ if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, map_bh.b_blocknr); size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; start_blk += logical_to_blk(inode, size); /* * If we are past the EOF, then we need to make sure as * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ if (!past_eof && logical + size >= isize) past_eof = true; } cond_resched(); } while (1); /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; }
static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); vto = kmap_atomic(to); copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); dax_unmap_atomic(bdev, &dax); return 0; } #define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) static int dax_radix_entry(struct address_space *mapping, pgoff_t index, sector_t sector, bool pmd_entry, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; pgoff_t pmd_index = DAX_PMD_INDEX(index); int type, error = 0; void *entry; WARN_ON_ONCE(pmd_entry && !dirty); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); entry = radix_tree_lookup(page_tree, pmd_index); if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { index = pmd_index; goto dirty; } entry = radix_tree_lookup(page_tree, index); if (entry) { type = RADIX_DAX_TYPE(entry); if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { error = -EIO; goto unlock; } if (!pmd_entry || type == RADIX_DAX_PMD) goto dirty; /* * We only insert dirty PMD entries into the radix tree. This * means we don't need to worry about removing a dirty PTE * entry and inserting a clean PMD entry, thus reducing the * range we would flush with a follow-up fsync/msync call. */ radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; } if (sector == NO_SECTOR) { /* * This can happen during correct operation if our pfn_mkwrite * fault raced against a hole punch operation. If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by * the time we are called, but the call will still happen. We * will return all the way up to wp_pfn_shared(), where the * pte_same() check will fail, eventually causing page fault * to be retried by the CPU. */ goto unlock; } error = radix_tree_insert(page_tree, index, RADIX_DAX_ENTRY(sector, pmd_entry)); if (error) goto unlock; mapping->nrexceptional++; dirty: if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); return error; } static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; int ret = 0; spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; /* another fsync thread may have already written back this entry */ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { ret = -EIO; goto unlock; } dax.sector = RADIX_DAX_SECTOR(entry); dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); if (ret < 0) return ret; if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } wb_cache_pmem(dax.addr, dax.size); spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); return ret; unlock: spin_unlock_irq(&mapping->tree_lock); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, loff_t end) { struct inode *inode = mapping->host; struct block_device *bdev = inode->i_sb->s_bdev; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; start_index = start >> PAGE_CACHE_SHIFT; end_index = end >> PAGE_CACHE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); entry = radix_tree_lookup(&mapping->page_tree, pmd_index); rcu_read_unlock(); /* see if the start of our range is covered by a PMD entry */ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, pvec.pages, indices); if (pvec.nr == 0) break; for (i = 0; i < pvec.nr; i++) { if (indices[i] > end_index) { done = true; break; } ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) return ret; } } wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; pgoff_t size; int error; i_mmap_lock_read(mapping); /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the * file. We can't tell the filesystem to free it because we can't * take i_mutex here. In the worst case, the file still has blocks * allocated past the end of the file. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { error = -EIO; goto out; } if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, vmf->flags & FAULT_FLAG_WRITE); if (error) goto out; error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); return error; } /** * __dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * @complete_unwritten: The filesystem method used to convert unwritten blocks * to written so the data written to them is exposed. This is required for * required by write faults for filesystems that will return unwritten * extent mappings from @get_block, but it is optional for reads as * dax_insert_mapping() will always zero unwritten blocks. If the fs does * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; int error; int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { page_cache_release(page); return VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { /* * We have a struct page covering a hole in the file * from a read fault and we've raced with a truncate */ error = -EIO; goto unlock_page; } } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) goto unlock_page; } else { return dax_load_hole(mapping, page, vmf); } } if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) goto unlock_page; vmf->page = page; if (!page) { i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); error = -EIO; goto out; } } return VM_FAULT_LOCKED; } /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, PAGE_CACHE_SIZE, 0); delete_from_page_cache(page); unlock_page(page); page_cache_release(page); page = NULL; } /* * If we successfully insert the new mapping over an unwritten extent, * we need to ensure we convert the unwritten extent. If there is an * error inserting the mapping, the filesystem needs to leave it as * unwritten to prevent exposure of the stale underlying data to * userspace, but we still need to call the completion function so * the private resources on the mapping buffer can be released. We * indicate what the callback should do via the uptodate variable, same * as for normal BH based IO completions. */ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) complete_unwritten(&bh, !error); else WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; unlock_page: if (page) { unlock_page(page); page_cache_release(page); } goto out; }
/* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. */ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, struct buffer_head *map_bh, unsigned long *first_logical_block, get_block_t get_block) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; int length; int fully_mapped = 1; unsigned nblocks; unsigned relative_block; if (page_has_buffers(page)) goto confused; block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ nblocks = map_bh->b_size >> blkbits; if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && block_in_file < (*first_logical_block + nblocks)) { unsigned map_offset = block_in_file - *first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr + map_offset + relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this page. */ map_bh->b_page = page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (get_block(inode, block_in_file, map_bh, 0)) goto confused; *first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; clear_buffer_mapped(map_bh); continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_page copies the data * we just collected from get_block into the page's buffers * so readpage doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { map_buffer_to_page(page, map_bh, page_block); goto confused; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr+relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_page) { char *kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + (first_hole << blkbits), 0, PAGE_CACHE_SIZE - (first_hole << blkbits)); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); goto out; } } else if (fully_mapped) { SetPageMappedToDisk(page); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) bio = mpage_bio_submit(READ, bio); alloc_new: if (bio == NULL) { bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); if (bio == NULL) goto confused; }
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct buffer_head bh; unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; sector_t block; int error, result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) return VM_FAULT_FALLBACK; /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) { split_huge_pmd(vma, pmd, address); dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; } /* If the PMD would extend outside the VMA */ if (pmd_addr < vma->vm_start) { dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; } if ((pmd_addr + PMD_SIZE) > vma->vm_end) { dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(NULL, address, "offset + huge page size > file size"); return VM_FAULT_FALLBACK; } memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; if (!buffer_mapped(&bh) && write) { if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; } bdev = bh.b_bdev; /* * If the filesystem isn't willing to tell us the length of a hole, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ if (alloc) { loff_t lstart = pgoff << PAGE_SHIFT; loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ truncate_pagecache_range(inode, lstart, lend); } i_mmap_lock_read(mapping); /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't * take i_mutex here, so just leave them hanging; they'll be freed * when the file is deleted. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) { result = VM_FAULT_SIGBUS; goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(&bh, address, "offset + huge page size > file size"); goto fallback; } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: <zero> sect: %llx\n", __func__, current->comm, address, (unsigned long long) to_sector(&bh, inode)); entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { struct blk_dax_ctl dax = { .sector = to_sector(&bh, inode), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { dax_pmd_dbg(&bh, address, "pfn unaligned"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (!pfn_t_devmap(dax.pfn)) { dax_unmap_atomic(bdev, &dax); dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } if (buffer_unwritten(&bh) || buffer_new(&bh)) { clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } dax_unmap_atomic(bdev, &dax); /* * For PTE faults we insert a radix tree entry for reads, and * leave it clean. Then on the first write we dirty the radix * tree entry via the dax_pfn_mkwrite() path. This sequence * allows the dax_pfn_mkwrite() call to be simpler and avoid a * call into get_block() to translate the pgoff to a sector in * order to be able to create a new radix tree entry. * * The PMD path doesn't have an equivalent to * dax_pfn_mkwrite(), though, so for a read followed by a * write we traverse all the way through __dax_pmd_fault() * twice. This means we can just skip inserting a radix tree * entry completely on the initial read and just wait until * the write to insert a dirty entry. */ if (write) { error = dax_radix_entry(mapping, pgoff, dax.sector, true, true); if (error) { dax_pmd_dbg(&bh, address, "PMD radix insertion failed"); goto fallback; } } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, pfn_t_to_pfn(dax.pfn), (unsigned long long) dax.sector); result |= vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); } out: i_mmap_unlock_read(mapping); if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); return result; fallback: count_vm_event(THP_FAULT_FALLBACK); result = VM_FAULT_FALLBACK; goto out; } EXPORT_SYMBOL_GPL(__dax_pmd_fault); /** * dax_pmd_fault - handle a PMD fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * * When a page fault occurs, filesystems may call this helper in their * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; if (flags & FAULT_FLAG_WRITE) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } result = __dax_pmd_fault(vma, address, pmd, flags, get_block, complete_unwritten); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); return result; }
/* * Modify inode page cache in such way: * have - blocks with b_blocknr equal to oldb...oldb+count-1 * get - blocks with b_blocknr equal to newb...newb+count-1 * also we suppose that oldb...oldb+count-1 blocks * situated at the end of file. * * We can come here from ufs_writepage or ufs_prepare_write, * locked_page is argument of these functions, so we already lock it. */ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, sector_t newb, struct page *locked_page) { const unsigned blks_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; struct address_space * const mapping = inode->i_mapping; pgoff_t index, cur_index, last_index; unsigned pos, j, lblock; sector_t end, i; struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); BUG_ON(!locked_page); BUG_ON(!PageLocked(locked_page)); cur_index = locked_page->index; end = count + beg; last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits); for (i = beg; i < end; i = (i | mask) + 1) { index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { page = ufs_get_locked_page(mapping, index); if (!page)/* it was truncated */ continue; if (IS_ERR(page)) {/* or EIO */ ufs_error(inode->i_sb, __func__, "read of page %llu failed\n", (unsigned long long)index); continue; } } else page = locked_page; head = page_buffers(page); bh = head; pos = i & mask; for (j = 0; j < pos; ++j) bh = bh->b_this_page; if (unlikely(index == last_index)) lblock = end & mask; else lblock = blks_per_page; do { if (j >= lblock) break; pos = (i - beg) + j; if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); if (!buffer_uptodate(bh)) { ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ufs_error(inode->i_sb, __func__, "read of block failed\n"); break; } } UFSD(" change from %llu to %llu, pos %u\n", (unsigned long long)(pos + oldb), (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; } while (bh != head); if (likely(cur_index != index)) ufs_put_locked_page(page); } UFSD("EXIT\n"); }
/** * sync_mft_mirror - synchronize an mft record to the mft mirror * @ni: ntfs inode whose mft record to synchronize * @m: mapped, mst protected (extent) mft record to synchronize * @sync: if true, wait for i/o completion * * Write the mapped, mst protected (extent) mft record @m described by the * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr). * * On success return 0. On error return -errno and set the volume errors flag * in the ntfs_volume to which @ni belongs. * * NOTE: We always perform synchronous i/o and ignore the @sync parameter. * * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just * schedule i/o via ->writepage or do it via kntfsd or whatever. */ static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; u8 *kmirr; unsigned int block_start, block_end, m_start, m_end; int i_bhs, nr_bhs, err = 0; ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); BUG_ON(!max_bhs); if (unlikely(!vol->mftmirr_ino)) { /* This could happen during umount... */ err = sync_mft_mirror_umount(ni, m); if (likely(!err)) return err; goto err_out; } /* Get the page containing the mirror copy of the mft record @m. */ page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >> (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); if (unlikely(IS_ERR(page))) { ntfs_error(vol->sb, "Failed to map mft mirror page."); err = PTR_ERR(page); goto err_out; } /* * Exclusion against other writers. This should never be a problem * since the page in which the mft record @m resides is also locked and * hence any other writers would be held up there but it is better to * make sure no one is writing from elsewhere. */ lock_page(page); /* The address in the page of the mirror copy of the mft record @m. */ kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK); /* Copy the mst protected mft record to the mirror. */ memcpy(kmirr, m, vol->mft_record_size); /* Make sure we have mapped buffers. */ if (!page_has_buffers(page)) { no_buffers_err_out: ntfs_error(vol->sb, "Writing mft mirror records without " "existing buffers is not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; goto unlock_err_out; } bh = head = page_buffers(page); if (!bh) goto no_buffers_err_out; nr_bhs = 0; block_start = 0; m_start = kmirr - (u8*)page_address(page); m_end = m_start + vol->mft_record_size; do { block_end = block_start + blocksize; /* * If the buffer is outside the mft record, just skip it, * clearing it if it is dirty to make sure it is not written * out. It should never be marked dirty but better be safe. */ if ((block_end <= m_start) || (block_start >= m_end)) { if (buffer_dirty(bh)) { ntfs_warning(vol->sb, "Clearing dirty mft " "record page buffer. %s", ntfs_please_email); clear_buffer_dirty(bh); } continue; } if (!buffer_mapped(bh)) { ntfs_error(vol->sb, "Writing mft mirror records " "without existing mapped buffers is " "not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } if (!buffer_uptodate(bh)) { ntfs_error(vol->sb, "Writing mft mirror records " "without existing uptodate buffers is " "not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } BUG_ON(!nr_bhs && (m_start != block_start)); BUG_ON(nr_bhs >= max_bhs); bhs[nr_bhs++] = bh; BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); } while (block_start = block_end, (bh = bh->b_this_page) != head); if (likely(!err)) { /* Lock buffers and start synchronous write i/o on them. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; if (unlikely(test_set_buffer_locked(tbh))) BUG(); BUG_ON(!buffer_uptodate(tbh)); if (buffer_dirty(tbh)) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, tbh); } /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; wait_on_buffer(tbh); if (unlikely(!buffer_uptodate(tbh))) { err = -EIO; /* * Set the buffer uptodate so the page & buffer * states don't become out of sync. */ if (PageUptodate(page)) set_buffer_uptodate(tbh); } } } else /* if (unlikely(err)) */ { /* Clean the buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) clear_buffer_dirty(bhs[i_bhs]); } unlock_err_out: /* Current state: all buffers are clean, unlocked, and uptodate. */ /* Remove the mst protection fixups again. */ post_write_mst_fixup((NTFS_RECORD*)kmirr); flush_dcache_page(page); unlock_page(page); ntfs_unmap_page(page); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ ntfs_error(vol->sb, "I/O error while writing mft mirror " "record 0x%lx! You should unmount the volume " "and run chkdsk or ntfsfix.", ni->mft_no); goto err_out; } ntfs_debug("Done."); return 0; err_out: ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i). " "Volume will be left marked dirty on umount. Run " "ntfsfix on the partition after umounting to correct " "this.", -err); /* We don't want to clear the dirty bit on umount. */ NVolSetErrors(vol); return err; }