Example #1
int __generic_block_fiemap(struct inode *inode,
                           struct fiemap_extent_info *fieinfo, u64 start,
                           u64 len, get_block_t *get_block)
{
    struct buffer_head tmp;
    unsigned long long start_blk;
    long long length = 0, map_len = 0;
    u64 logical = 0, phys = 0, size = 0;
    u32 flags = FIEMAP_EXTENT_MERGED;
    int ret = 0, past_eof = 0, whole_file = 0;

    if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
        return ret;

    start_blk = logical_to_blk(inode, start);

    length = (long long)min_t(u64, len, i_size_read(inode));
    if (length < len)
        whole_file = 1;

    map_len = length;

    do {
        /*
         * we set b_size to the total size we want so it will map as
         * many contiguous blocks as possible at once
         */
        memset(&tmp, 0, sizeof(struct buffer_head));
        tmp.b_size = map_len;

        ret = get_block(inode, start_blk, &tmp, 0);
        if (ret)
            break;

        /* HOLE */
        if (!buffer_mapped(&tmp)) {
            length -= blk_to_logical(inode, 1);
            start_blk++;

            /*
             * we want to handle the case where there is an
             * allocated block at the front of the file, and then
             * nothing but holes up to the end of the file properly,
             * to make sure that extent at the front gets properly
             * marked with FIEMAP_EXTENT_LAST
             */
            if (!past_eof &&
                    blk_to_logical(inode, start_blk) >=
                    blk_to_logical(inode, 0)+i_size_read(inode))
                past_eof = 1;

            /*
             * first hole after going past the EOF, this is our
             * last extent
             */
            if (past_eof && size) {
                flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
                ret = fiemap_fill_next_extent(fieinfo, logical,
                                              phys, size,
                                              flags);
                break;
            }

            /* if we have holes up to/past EOF then we're done */
            if (length <= 0 || past_eof)
                break;
        } else {
            /*
             * we have gone over the length of what we wanted to
             * map, and it wasn't the entire file, so add the extent
             * we got last time and exit.
             *
             * This is for the case where say we want to map all the
             * way up to the second to the last block in a file, but
             * the last block is a hole, making the second to last
             * block FIEMAP_EXTENT_LAST.  In this case we want to
             * see if there is a hole after the second to last block
             * so we can mark it properly.  If we found data after
             * we exceeded the length we were requesting, then we
             * are good to go, just add the extent to the fieinfo
             * and break
             */
            if (length <= 0 && !whole_file) {
                ret = fiemap_fill_next_extent(fieinfo, logical,
                                              phys, size,
                                              flags);
                break;
            }

            /*
             * if size != 0 then we know we already have an extent
             * to add, so add it.
             */
            if (size) {
                ret = fiemap_fill_next_extent(fieinfo, logical,
                                              phys, size,
                                              flags);
                if (ret)
                    break;
            }

            logical = blk_to_logical(inode, start_blk);
            phys = blk_to_logical(inode, tmp.b_blocknr);
            size = tmp.b_size;
            flags = FIEMAP_EXTENT_MERGED;

            length -= tmp.b_size;
            start_blk += logical_to_blk(inode, size);

            /*
             * If we are past the EOF, then we need to make sure as
             * soon as we find a hole that the last extent we found
             * is marked with FIEMAP_EXTENT_LAST
             */
            if (!past_eof &&
                    logical+size >=
                    blk_to_logical(inode, 0)+i_size_read(inode))
                past_eof = 1;
        }
        cond_resched();
    } while (1);

    /* if ret is 1 then we just hit the end of the extent array */
    if (ret == 1)
        ret = 0;

    return ret;
}
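A filesystem usually reaches __generic_block_fiemap() through the locked wrapper generic_block_fiemap() from its ->fiemap inode operation. A minimal sketch of such a handler follows; the myfs_* names are placeholders assumed for illustration, not part of the example above.

#include <linux/fs.h>

/* hypothetical get_block routine provided by the filesystem */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

/* sketch of a ->fiemap handler: generic_block_fiemap() takes i_mutex and
 * then walks the file with __generic_block_fiemap() as shown above */
static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}

static const struct inode_operations myfs_file_inode_operations = {
	.fiemap	= myfs_fiemap,
};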
Example #2
/**
 * vxfs_fill_super - read superblock into memory and initialize filesystem
 * @sbp:		VFS superblock (to fill)
 * @dp:			fs private mount data
 * @silent:		do not complain loudly when something is wrong
 *
 * Description:
 *   We are called on the first mount of a filesystem to read the
 *   superblock into memory and do some basic setup.
 *
 * Returns:
 *   Zero on success, else a negative error code.
 *
 * Locking:
 *   We are under @sbp->s_lock.
 */
static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
{
    struct vxfs_sb_info	*infp;
    struct vxfs_sb		*rsbp;
    struct buffer_head	*bp = NULL;
    u_long			bsize;
    struct inode *root;
    int ret = -EINVAL;

    sbp->s_flags |= MS_RDONLY;

    infp = kzalloc(sizeof(*infp), GFP_KERNEL);
    if (!infp) {
        printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
        return -ENOMEM;
    }

    bsize = sb_min_blocksize(sbp, BLOCK_SIZE);
    if (!bsize) {
        printk(KERN_WARNING "vxfs: unable to set blocksize\n");
        goto out;
    }

    bp = sb_bread(sbp, 1);
    if (!bp || !buffer_mapped(bp)) {
        if (!silent) {
            printk(KERN_WARNING
                   "vxfs: unable to read disk superblock\n");
        }
        goto out;
    }

    rsbp = (struct vxfs_sb *)bp->b_data;
    if (rsbp->vs_magic != VXFS_SUPER_MAGIC) {
        if (!silent)
            printk(KERN_NOTICE "vxfs: WRONG superblock magic\n");
        goto out;
    }

    if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) {
        printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n",
               rsbp->vs_version);
        goto out;
    }

#ifdef DIAGNOSTIC
    printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version);
    printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize);
#endif

    sbp->s_magic = rsbp->vs_magic;
    sbp->s_fs_info = infp;

    infp->vsi_raw = rsbp;
    infp->vsi_bp = bp;
    infp->vsi_oltext = rsbp->vs_oltext[0];
    infp->vsi_oltsize = rsbp->vs_oltsize;

    if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) {
        printk(KERN_WARNING "vxfs: unable to set final block size\n");
        goto out;
    }

    if (vxfs_read_olt(sbp, bsize)) {
        printk(KERN_WARNING "vxfs: unable to read olt\n");
        goto out;
    }

    if (vxfs_read_fshead(sbp)) {
        printk(KERN_WARNING "vxfs: unable to read fshead\n");
        goto out;
    }

    sbp->s_op = &vxfs_super_ops;
    root = vxfs_iget(sbp, VXFS_ROOT_INO);
    if (IS_ERR(root)) {
        ret = PTR_ERR(root);
        goto out;
    }
    sbp->s_root = d_alloc_root(root);
    if (!sbp->s_root) {
        iput(root);
        printk(KERN_WARNING "vxfs: unable to get root dentry.\n");
        goto out_free_ilist;
    }

    return 0;

out_free_ilist:
    vxfs_put_fake_inode(infp->vsi_fship);
    vxfs_put_fake_inode(infp->vsi_ilist);
    vxfs_put_fake_inode(infp->vsi_stilist);
out:
    brelse(bp);
    kfree(infp);
    return ret;
}
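For the kernel era this example comes from (get_sb_bdev()/d_alloc_root()), the fill_super routine is reached from mount roughly as sketched below; the vxfs_get_sb glue and file_system_type initializer are reconstructed from the common pattern of that era, not taken from the example above.

#include <linux/fs.h>
#include <linux/module.h>

/* sketch: mount(2) ends up calling vxfs_fill_super() via get_sb_bdev() */
static int vxfs_get_sb(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, mnt);
}

static struct file_system_type vxfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "vxfs",
	.get_sb		= vxfs_get_sb,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};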
Example #3
int __generic_block_fiemap(struct inode *inode,
			   struct fiemap_extent_info *fieinfo, u64 start,
			   u64 len, get_block_t *get_block)
{
	struct buffer_head tmp;
	unsigned int start_blk;
	long long length = 0, map_len = 0;
	u64 logical = 0, phys = 0, size = 0;
	u32 flags = FIEMAP_EXTENT_MERGED;
	int ret = 0;

	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
		return ret;

	start_blk = logical_to_blk(inode, start);

	length = (long long)min_t(u64, len, i_size_read(inode));
	map_len = length;

	do {
		/*
		 * we set b_size to the total size we want so it will map as
		 * many contiguous blocks as possible at once
		 */
		memset(&tmp, 0, sizeof(struct buffer_head));
		tmp.b_size = map_len;

		ret = get_block(inode, start_blk, &tmp, 0);
		if (ret)
			break;

		/* HOLE */
		if (!buffer_mapped(&tmp)) {
			/*
			 * first hole after going past the EOF, this is our
			 * last extent
			 */
			if (length <= 0) {
				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				break;
			}

			length -= blk_to_logical(inode, 1);

			/* if we have holes up to/past EOF then we're done */
			if (length <= 0)
				break;

			start_blk++;
		} else {
			if (length <= 0 && size) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				if (ret)
					break;
			}

			logical = blk_to_logical(inode, start_blk);
			phys = blk_to_logical(inode, tmp.b_blocknr);
			size = tmp.b_size;
			flags = FIEMAP_EXTENT_MERGED;

			length -= tmp.b_size;
			start_blk += logical_to_blk(inode, size);

			/*
			 * if we are past the EOF we need to loop again to see
			 * if there is a hole so we can mark this extent as the
			 * last one, and if not keep mapping things until we
			 * find a hole, or we run out of slots in the extent
			 * array
			 */
			if (length <= 0)
				continue;

			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
						      size, flags);
			if (ret)
				break;
		}
		cond_resched();
	} while (1);

	/* if ret is 1 then we just hit the end of the extent array */
	if (ret == 1)
		ret = 0;

	return ret;
}
Example #4
/*
 * Modify inode page cache in such way:
 * have - blocks with b_blocknr equal to oldb...oldb+count-1
 * get - blocks with b_blocknr equal to newb...newb+count-1
 * also we suppose that oldb...oldb+count-1 blocks
 * situated at the end of file.
 *
 * We can come here from ufs_writepage or ufs_prepare_write,
 * locked_page is argument of these functions, so we already lock it.
 */
static void ufs_change_blocknr(struct inode *inode, unsigned int beg,
			       unsigned int count, unsigned int oldb,
			       unsigned int newb, struct page *locked_page)
{
	const unsigned mask = (1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1;
	struct address_space * const mapping = inode->i_mapping;
	pgoff_t index, cur_index;
	unsigned end, pos, j;
	struct page *page;
	struct buffer_head *head, *bh;

	UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
	      inode->i_ino, count, oldb, newb);

	BUG_ON(!locked_page);
	BUG_ON(!PageLocked(locked_page));

	cur_index = locked_page->index;

	for (end = count + beg; beg < end; beg = (beg | mask) + 1) {
		index = beg >> (PAGE_CACHE_SHIFT - inode->i_blkbits);

		if (likely(cur_index != index)) {
			page = ufs_get_locked_page(mapping, index);
			if (!page || IS_ERR(page)) /* it was truncated or EIO */
				continue;
		} else
			page = locked_page;

		head = page_buffers(page);
		bh = head;
		pos = beg & mask;
		for (j = 0; j < pos; ++j)
			bh = bh->b_this_page;
		j = 0;
		do {
			if (buffer_mapped(bh)) {
				pos = bh->b_blocknr - oldb;
				if (pos < count) {
					UFSD(" change from %llu to %llu\n",
					     (unsigned long long)pos + oldb,
					     (unsigned long long)pos + newb);
					bh->b_blocknr = newb + pos;
					unmap_underlying_metadata(bh->b_bdev,
								  bh->b_blocknr);
					mark_buffer_dirty(bh);
					++j;
				}
			}

			bh = bh->b_this_page;
		} while (bh != head);

		if (j)
			set_page_dirty(page);

		if (likely(cur_index != index))
			ufs_put_locked_page(page);
 	}
	UFSD("EXIT\n");
}
/**
 * write_mft_record_nolock - write out a mapped (extent) mft record
 * @ni:		ntfs inode describing the mapped (extent) mft record
 * @m:		mapped (extent) mft record to write
 * @sync:	if true, wait for i/o completion
 *
 * Write the mapped (extent) mft record @m described by the (regular or extent)
 * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
 * the mft mirror, that is also updated.
 *
 * On success, clean the mft record and return 0.  On error, leave the mft
 * record dirty and return -errno.  The caller should call make_bad_inode() on
 * the base inode to ensure no more access happens to this inode.  We do not do
 * it here as the caller may want to finish writing other extent mft records
 * first to minimize on-disk metadata inconsistencies.
 *
 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
 * However, if the mft record has a counterpart in the mft mirror and @sync is
 * true, we write the mft record, wait for i/o completion, and only then write
 * the mft mirror copy.  This ensures that if the system crashes either the mft
 * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
 * false on the other hand, we start i/o on both and then wait for completion
 * on them.  This provides a speedup but no longer guarantees that you will end
 * up with a self-consistent mft record in the case of a crash but if you asked
 * for asynchronous writing you probably do not care about that anyway.
 *
 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
 * schedule i/o via ->writepage or do it via kntfsd or whatever.
 */
int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
	ntfs_volume *vol = ni->vol;
	struct page *page = ni->page;
	unsigned int blocksize = vol->sb->s_blocksize;
	int max_bhs = vol->mft_record_size / blocksize;
	struct buffer_head *bhs[max_bhs];
	struct buffer_head *bh, *head;
	unsigned int block_start, block_end, m_start, m_end;
	int i_bhs, nr_bhs, err = 0;

	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
	BUG_ON(NInoAttr(ni));
	BUG_ON(!max_bhs);
	BUG_ON(!PageLocked(page));
	/*
	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
	 * mark it as clean now so that it can be redirtied later on if needed.
	 * There is no danger of races since the caller is holding the locks
	 * for the mft record @m and the page it is in.
	 */
	if (!NInoTestClearDirty(ni))
		goto done;
	/* Make sure we have mapped buffers. */
	if (!page_has_buffers(page)) {
no_buffers_err_out:
		ntfs_error(vol->sb, "Writing mft records without existing "
				"buffers is not implemented yet.  %s",
				ntfs_please_email);
		err = -EOPNOTSUPP;
		goto err_out;
	}
	bh = head = page_buffers(page);
	if (!bh)
		goto no_buffers_err_out;
	nr_bhs = 0;
	block_start = 0;
	m_start = ni->page_ofs;
	m_end = m_start + vol->mft_record_size;
	do {
		block_end = block_start + blocksize;
		/*
		 * If the buffer is outside the mft record, just skip it,
		 * clearing it if it is dirty to make sure it is not written
		 * out.  It should never be marked dirty but better be safe.
		 */
		if ((block_end <= m_start) || (block_start >= m_end)) {
			if (buffer_dirty(bh)) {
				ntfs_warning(vol->sb, "Clearing dirty mft "
						"record page buffer.  %s",
						ntfs_please_email);
				clear_buffer_dirty(bh);
			}
			continue;
		}
		if (!buffer_mapped(bh)) {
			ntfs_error(vol->sb, "Writing mft records without "
					"existing mapped buffers is not "
					"implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		if (!buffer_uptodate(bh)) {
			ntfs_error(vol->sb, "Writing mft records without "
					"existing uptodate buffers is not "
					"implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		BUG_ON(!nr_bhs && (m_start != block_start));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
	} while (block_start = block_end, (bh = bh->b_this_page) != head);
	if (unlikely(err))
		goto cleanup_out;
	/* Apply the mst protection fixups. */
	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
	if (err) {
		ntfs_error(vol->sb, "Failed to apply mst fixups!");
		goto cleanup_out;
	}
	flush_dcache_mft_record_page(ni);
	/* Lock buffers and start synchronous write i/o on them. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		if (unlikely(test_set_buffer_locked(tbh)))
			BUG();
		BUG_ON(!buffer_uptodate(tbh));
		if (buffer_dirty(tbh))
			clear_buffer_dirty(tbh);
		get_bh(tbh);
		tbh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, tbh);
	}
	/* Synchronize the mft mirror now if not @sync. */
	if (!sync && ni->mft_no < vol->mftmirr_size)
		sync_mft_mirror(ni, m, sync);
	/* Wait on i/o completion of buffers. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		wait_on_buffer(tbh);
		if (unlikely(!buffer_uptodate(tbh))) {
			err = -EIO;
			/*
			 * Set the buffer uptodate so the page & buffer states
			 * don't become out of sync.
			 */
			if (PageUptodate(page))
				set_buffer_uptodate(tbh);
		}
	}
	/* If @sync, now synchronize the mft mirror. */
	if (sync && ni->mft_no < vol->mftmirr_size)
		sync_mft_mirror(ni, m, sync);
	/* Remove the mst protection fixups again. */
	post_write_mst_fixup((NTFS_RECORD*)m);
	flush_dcache_mft_record_page(ni);
	if (unlikely(err)) {
		/* I/O error during writing.  This is really bad! */
		ntfs_error(vol->sb, "I/O error while writing mft record "
				"0x%lx!  Marking base inode as bad.  You "
				"should unmount the volume and run chkdsk.",
				ni->mft_no);
		goto err_out;
	}
done:
	ntfs_debug("Done.");
	return 0;
cleanup_out:
	/* Clean the buffers. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
		clear_buffer_dirty(bhs[i_bhs]);
err_out:
	/*
	 * Current state: all buffers are clean, unlocked, and uptodate.
	 * The caller should mark the base inode as bad so that no more i/o
	 * happens.  ->clear_inode() will still be invoked so all extent inodes
	 * and other allocated memory will be freed.
	 */
	if (err == -ENOMEM) {
		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
				"Redirtying so the write is retried later.");
		mark_mft_record_dirty(ni);
		err = 0;
	}
	return err;
}
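The _nolock suffix means the caller must already hold the lock on the page containing the mft record. A sketch of how a locking wrapper around it might look; the wrapper name and body are an assumption, not part of the example above.

#include <linux/pagemap.h>

/* hedged sketch: lock the mft record page, write it out, unlock */
static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
	struct page *page = ni->page;
	int err;

	BUG_ON(!page);
	lock_page(page);
	err = write_mft_record_nolock(ni, m, sync);
	unlock_page(page);
	return err;
}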
Example #6
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	/* the page already has buffers: it is a block-device page or its blocks are not contiguous */
	if (page_has_buffers(page))
		goto confused;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	/* if the bh was already mapped to disk; with map_bh.b_state == 0 this block does not execute */
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			*first_logical_block = block_in_file;
		}

		/* not mapped: this corresponds to a file hole */
		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		/* if the block buffer is up to date, copy its data directly into the page */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}
	
		/* we saw a file hole but a later block is mapped again; the blocks before the hole must be dealt with first */
		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole: went from a hole back to mapped blocks */

		/* Contiguous blocks? */
		/* get_block stored the block number in map_bh->b_blocknr */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			/* the page's contiguous blocks are collected in the blocks[] array */
			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/* there is a run of file holes in the page */
	if (first_hole != blocks_per_page) {
		/* zero the part of the page that covers the hole */
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);

alloc_new:
	if (bio == NULL) {
		/* blocks[0] << (blkbits - 9) is the disk sector number of this block */
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}
Example #7
static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}
Example #8
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
			      sector_t pblocknr, struct buffer_head **pbh,
			      int newblk)
{
	struct buffer_head *bh;
	struct inode *inode = NILFS_BTNC_I(btnc);
	int err;

	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
	if (unlikely(!bh))
		return -ENOMEM;

	err = -EEXIST; /* internal code */
	if (newblk) {
		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
			     buffer_dirty(bh))) {
			brelse(bh);
			BUG();
		}
		memset(bh->b_data, 0, 1 << inode->i_blkbits);
		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
		bh->b_blocknr = blocknr;
		set_buffer_mapped(bh);
		set_buffer_uptodate(bh);
		goto found;
	}

	if (buffer_uptodate(bh) || buffer_dirty(bh))
		goto found;

	if (pblocknr == 0) {
		pblocknr = blocknr;
		if (inode->i_ino != NILFS_DAT_INO) {
			struct inode *dat =
				nilfs_dat_inode(NILFS_I_NILFS(inode));

			/* blocknr is a virtual block number */
			err = nilfs_dat_translate(dat, blocknr, &pblocknr);
			if (unlikely(err)) {
				brelse(bh);
				goto out_locked;
			}
		}
	}
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		err = -EEXIST; /* internal code */
		goto found;
	}
	set_buffer_mapped(bh);
	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
	bh->b_blocknr = pblocknr; /* set block address for read */
	bh->b_end_io = end_buffer_read_sync;
	get_bh(bh);
	submit_bh(READ, bh);
	bh->b_blocknr = blocknr; /* set back to the given block address */
	err = 0;
found:
	*pbh = bh;

out_locked:
	unlock_page(bh->b_page);
	page_cache_release(bh->b_page);
	return err;
}
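nilfs_btnode_submit_block() only queues the read (or reports -EEXIST when the buffer is already valid); a caller still has to wait for I/O completion. A sketch of that follow-up step, using a hypothetical myfs_btnode_get_block() helper name.

#include <linux/buffer_head.h>

/* hedged sketch: submit the node read, then wait and verify the buffer */
static int myfs_btnode_get_block(struct address_space *btnc, __u64 blocknr,
				 sector_t pblocknr, struct buffer_head **pbh)
{
	int err;

	err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, 0);
	if (err == -EEXIST)	/* internal code: buffer already up to date */
		return 0;
	if (err)
		return err;

	wait_on_buffer(*pbh);
	if (!buffer_uptodate(*pbh)) {
		brelse(*pbh);
		return -EIO;
	}
	return 0;
}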
Example #9
static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	struct nilfs_transaction_info ti;
	int ret = 0;

	if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
		return VM_FAULT_SIGBUS; /* -ENOSPC */

	sb_start_pagefault(inode->i_sb);
	lock_page(page);
	if (page->mapping != inode->i_mapping ||
	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
		unlock_page(page);
		ret = -EFAULT;	/* make the VM retry the fault */
		goto out;
	}

	/*
	 * check to see if the page is mapped already (no holes)
	 */
	if (PageMappedToDisk(page))
		goto mapped;

	if (page_has_buffers(page)) {
		struct buffer_head *bh, *head;
		int fully_mapped = 1;

		bh = head = page_buffers(page);
		do {
			if (!buffer_mapped(bh)) {
				fully_mapped = 0;
				break;
			}
		} while (bh = bh->b_this_page, bh != head);

		if (fully_mapped) {
			SetPageMappedToDisk(page);
			goto mapped;
		}
	}
	unlock_page(page);

	/*
	 * fill hole blocks
	 */
	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
	/* never returns -ENOMEM, but may return -ENOSPC */
	if (unlikely(ret))
		goto out;

	file_update_time(vma->vm_file);
	ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
	if (ret) {
		nilfs_transaction_abort(inode->i_sb);
		goto out;
	}
	nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
	nilfs_transaction_commit(inode->i_sb);

 mapped:
	wait_for_stable_page(page);
 out:
	sb_end_pagefault(inode->i_sb);
	return block_page_mkwrite_return(ret);
}
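nilfs_page_mkwrite() is reached through the vm_operations_struct that the file's ->mmap method installs. A sketch of that wiring with myfs_* placeholder names; only nilfs_page_mkwrite comes from the example above.

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,	/* generic read-fault path */
	.page_mkwrite	= nilfs_page_mkwrite,	/* write faults fill holes first */
};

/* sketch of the ->mmap method that installs the operations above */
static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &myfs_file_vm_ops;
	return 0;
}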
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 len)
{
	struct buffer_head map_bh;
	sector_t start_blk, last_blk;
	loff_t isize = i_size_read(inode);
	u64 logical = 0, phys = 0, size = 0;
	u32 flags = 0;
	bool past_eof = false, whole_file = false;
	int ret = 0;

	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	if (len >= isize) {
		whole_file = true;
		len = isize;
	}

	if (logical_to_blk(inode, len) == 0)
		len = blk_to_logical(inode, 1);

	start_blk = logical_to_blk(inode, start);
	last_blk = logical_to_blk(inode, start + len - 1);
next:
	memset(&map_bh, 0, sizeof(struct buffer_head));
	map_bh.b_size = len;

	ret = get_data_block(inode, start_blk, &map_bh, 0,
					F2FS_GET_BLOCK_FIEMAP);
	if (ret)
		goto out;

	/* HOLE */
	if (!buffer_mapped(&map_bh)) {
		start_blk++;

		if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
			past_eof = 1;

		if (past_eof && size) {
			flags |= FIEMAP_EXTENT_LAST;
			ret = fiemap_fill_next_extent(fieinfo, logical,
					phys, size, flags);
		} else if (size) {
			ret = fiemap_fill_next_extent(fieinfo, logical,
					phys, size, flags);
			size = 0;
		}

		/* if we have holes up to/past EOF then we're done */
		if (start_blk > last_blk || past_eof || ret)
			goto out;
	} else {
		if (start_blk > last_blk && !whole_file) {
			ret = fiemap_fill_next_extent(fieinfo, logical,
					phys, size, flags);
			goto out;
		}

		/*
		 * if size != 0 then we know we already have an extent
		 * to add, so add it.
		 */
		if (size) {
			ret = fiemap_fill_next_extent(fieinfo, logical,
					phys, size, flags);
			if (ret)
				goto out;
		}

		logical = blk_to_logical(inode, start_blk);
		phys = blk_to_logical(inode, map_bh.b_blocknr);
		size = map_bh.b_size;
		flags = 0;
		if (buffer_unwritten(&map_bh))
			flags = FIEMAP_EXTENT_UNWRITTEN;

		start_blk += logical_to_blk(inode, size);

		/*
		 * If we are past the EOF, then we need to make sure as
		 * soon as we find a hole that the last extent we found
		 * is marked with FIEMAP_EXTENT_LAST
		 */
		if (!past_eof && logical + size >= isize)
			past_eof = true;
	}
	cond_resched();
	if (fatal_signal_pending(current))
		ret = -EINTR;
	else
		goto next;
out:
	if (ret == 1)
		ret = 0;

	mutex_unlock(&inode->i_mutex);
	return ret;
}
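Handlers such as f2fs_fiemap() or the generic helpers above are ultimately driven by the FS_IOC_FIEMAP ioctl. A small userspace sketch that exercises them by dumping a file's extents; it is written from scratch for illustration and is not part of the kernel example.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* room for 32 extents; a real tool would loop, advancing fm_start */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu phys %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	close(fd);
	free(fm);
	return 0;
}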
Example #11
/*
 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
 * @inode - gc inode
 * @blkoff - dummy offset treated as the key for the page cache
 * @pbn - physical block number of the block
 * @vbn - virtual block number of the block, 0 for non-virtual block
 * @out_bh - indirect pointer to a buffer_head struct to receive the results
 *
 * Description: nilfs_gccache_submit_read_data() registers the data buffer
 * specified by @pbn to the GC pagecache with the key @blkoff.
 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
 *
 * Return Value: On success, 0 is returned. On error, one of the following
 * negative error codes is returned.
 *
 * %-EIO - I/O error.
 *
 * %-ENOMEM - Insufficient amount of memory available.
 *
 * %-ENOENT - The block specified with @pbn does not exist.
 */
int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
				   sector_t pbn, __u64 vbn,
				   struct buffer_head **out_bh)
{
	struct buffer_head *bh;
	int err;
#if defined(YANQIN)
  struct radix_tree_root *rtree = &(NILFS_I(inode)->i_gc_blocks);
  struct nilfs_gc_block_info *gbi;
#endif

	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
	if (unlikely(!bh))
		return -ENOMEM;

	if (buffer_uptodate(bh))
		goto out;

	if (pbn == 0) {
		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;

		err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
		if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
			brelse(bh);
			goto failed;
		}
	}

	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		goto out;
	}

	if (!buffer_mapped(bh)) {
		bh->b_bdev = inode->i_sb->s_bdev;
		set_buffer_mapped(bh);
	}
	bh->b_blocknr = pbn;
	bh->b_end_io = end_buffer_read_sync;
	get_bh(bh);
	submit_bh(READ, bh);
	if (vbn)
		bh->b_blocknr = vbn;
 out:
	err = 0;
	*out_bh = bh;
#if defined(YANQIN)
  set_bit(BH_PrivateStart, &bh->b_state);
  gbi = kmalloc(sizeof(struct nilfs_gc_block_info), GFP_NOFS);
  if (!gbi) return -ENOMEM;
  gbi->old_pblocknr = pbn;
  gbi->new_pblocknr = 0; /* To be assigned later */
  err = radix_tree_insert(rtree, vbn, gbi);
#endif

 failed:
	unlock_page(bh->b_page);
	page_cache_release(bh->b_page);
	return err;
}
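The function above only submits the read; before the GC code can relocate the block it still has to wait for the I/O and confirm the buffer became up to date. A hedged sketch of that follow-up using only standard buffer_head primitives; the helper name is hypothetical.

#include <linux/buffer_head.h>

static int myfs_gccache_wait(struct buffer_head *bh)
{
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
		brelse(bh);
		return -EIO;	/* read failed; caller must give up on this block */
	}
	return 0;
}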
Example #12
File: mpage.c  Project: Vhacker1995/linux
static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
                             void *data)
{
    struct mpage_data *mpd = data;
    struct bio *bio = mpd->bio;
    struct address_space *mapping = page->mapping;
    struct inode *inode = page->mapping->host;
    const unsigned blkbits = inode->i_blkbits;
    unsigned long end_index;
    const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
    sector_t last_block;
    sector_t block_in_file;
    sector_t blocks[MAX_BUF_PER_PAGE];
    unsigned page_block;
    unsigned first_unmapped = blocks_per_page;
    struct block_device *bdev = NULL;
    int boundary = 0;
    sector_t boundary_block = 0;
    struct block_device *boundary_bdev = NULL;
    int length;
    struct buffer_head map_bh;
    loff_t i_size = i_size_read(inode);
    int ret = 0;
    int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : 0);

    if (page_has_buffers(page)) {
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;

        /* If they're all mapped and dirty, do it */
        page_block = 0;
        do {
            BUG_ON(buffer_locked(bh));
            if (!buffer_mapped(bh)) {
                /*
                 * unmapped dirty buffers are created by
                 * __set_page_dirty_buffers -> mmapped data
                 */
                if (buffer_dirty(bh))
                    goto confused;
                if (first_unmapped == blocks_per_page)
                    first_unmapped = page_block;
                continue;
            }

            if (first_unmapped != blocks_per_page)
                goto confused;	/* hole -> non-hole */

            if (!buffer_dirty(bh) || !buffer_uptodate(bh))
                goto confused;
            if (page_block) {
                if (bh->b_blocknr != blocks[page_block-1] + 1)
                    goto confused;
            }
            blocks[page_block++] = bh->b_blocknr;
            boundary = buffer_boundary(bh);
            if (boundary) {
                boundary_block = bh->b_blocknr;
                boundary_bdev = bh->b_bdev;
            }
            bdev = bh->b_bdev;
        } while ((bh = bh->b_this_page) != head);

        if (first_unmapped)
            goto page_is_mapped;

        /*
         * Page has buffers, but they are all unmapped. The page was
         * created by pagein or read over a hole which was handled by
         * block_read_full_page().  If this address_space is also
         * using mpage_readpages then this can rarely happen.
         */
        goto confused;
    }

    /*
     * The page has no buffers: map it to disk
     */
    BUG_ON(!PageUptodate(page));
    block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
    last_block = (i_size - 1) >> blkbits;
    map_bh.b_page = page;
    for (page_block = 0; page_block < blocks_per_page; ) {

        map_bh.b_state = 0;
        map_bh.b_size = 1 << blkbits;
        if (mpd->get_block(inode, block_in_file, &map_bh, 1))
            goto confused;
        if (buffer_new(&map_bh))
            unmap_underlying_metadata(map_bh.b_bdev,
                                      map_bh.b_blocknr);
        if (buffer_boundary(&map_bh)) {
            boundary_block = map_bh.b_blocknr;
            boundary_bdev = map_bh.b_bdev;
        }
        if (page_block) {
            if (map_bh.b_blocknr != blocks[page_block-1] + 1)
                goto confused;
        }
        blocks[page_block++] = map_bh.b_blocknr;
        boundary = buffer_boundary(&map_bh);
        bdev = map_bh.b_bdev;
        if (block_in_file == last_block)
            break;
        block_in_file++;
    }
    BUG_ON(page_block == 0);

    first_unmapped = page_block;

page_is_mapped:
    end_index = i_size >> PAGE_SHIFT;
    if (page->index >= end_index) {
        /*
         * The page straddles i_size.  It must be zeroed out on each
         * and every writepage invocation because it may be mmapped.
         * "A file is mapped in multiples of the page size.  For a file
         * that is not a multiple of the page size, the remaining memory
         * is zeroed when mapped, and writes to that region are not
         * written out to the file."
         */
        unsigned offset = i_size & (PAGE_SIZE - 1);

        if (page->index > end_index || !offset)
            goto confused;
        zero_user_segment(page, offset, PAGE_SIZE);
    }

    /*
     * This page will go to BIO.  Do we need to send this BIO off first?
     */
    if (bio && mpd->last_block_in_bio != blocks[0] - 1)
        bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);

alloc_new:
    if (bio == NULL) {
        if (first_unmapped == blocks_per_page) {
            if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
                                 page, wbc)) {
                clean_buffers(page, first_unmapped);
                goto out;
            }
        }
        bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
                          BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
        if (bio == NULL)
            goto confused;

        wbc_init_bio(wbc, bio);
    }

    /*
     * Must try to add the page before marking the buffer clean or
     * the confused fail path above (OOM) will be very confused when
     * it finds all bh marked clean (i.e. it will not write anything)
     */
    wbc_account_io(wbc, page, PAGE_SIZE);
    length = first_unmapped << blkbits;
    if (bio_add_page(bio, page, length, 0) < length) {
        bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
        goto alloc_new;
    }

    clean_buffers(page, first_unmapped);

    BUG_ON(PageWriteback(page));
    set_page_writeback(page);
    unlock_page(page);
    if (boundary || (first_unmapped != blocks_per_page)) {
        bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
        if (boundary_block) {
            write_boundary_block(boundary_bdev,
                                 boundary_block, 1 << blkbits);
        }
    } else {
        mpd->last_block_in_bio = blocks[blocks_per_page - 1];
    }
    goto out;

confused:
    if (bio)
        bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);

    if (mpd->use_writepage) {
        ret = mapping->a_ops->writepage(page, wbc);
    } else {
        ret = -EAGAIN;
        goto out;
    }
    /*
     * The caller has a ref on the inode, so *mapping is stable
     */
    mapping_set_error(mapping, ret);
out:
    mpd->bio = bio;
    return ret;
}
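__mpage_writepage() is written as a write_cache_pages() callback: the struct mpage_data carries the open bio and get_block pointer across pages, and whatever bio is left after the walk is submitted at the end. A sketch of that driver loop in the style of mpage_writepages(), using a placeholder myfs_writepages name.

/* hedged sketch of the driver loop around __mpage_writepage() */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc,
			   get_block_t get_block)
{
	struct mpage_data mpd = {
		.bio = NULL,
		.last_block_in_bio = 0,
		.get_block = get_block,
		.use_writepage = 1,
	};
	int ret;

	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio) {
		/* submit the bio still under construction for the last pages */
		int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);

		mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
	}
	return ret;
}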
Example #13
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		  unsigned long *first_logical_block, struct compressed_bio **cb, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; //SET TO 1
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;                                         //Increments to 1
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;
	int err;
	/* blkbits = 12 | MAX_BUF_PER_PAGE = 8 */
	
	if (page_has_buffers(page))
		goto confused;

	/* block_in_file : page->index
	 * last_block    : last page->index of requested nr_pages
 	 * last_block_in_file  : always index of last_page_of_file
	 */
	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	
	/* nblocks : Initially 0 | Later mapped to 1 extent so is mostly 16 */
	nblocks = map_bh->b_size >> blkbits;
	printk(KERN_INFO "\ncurrent_page : %Lu | nblocks_initial : %u", block_in_file, nblocks);

	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;

			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}
	
	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block - block_in_file) << blkbits;
			/* use of get_block => ***needs buffer_head map_bh 
			 * bdev     = map_bh->b_dev
			 * physical = map_bh->b_blocknr
			 * nblocks  = map_bh->b_size (no of logical blocks in extent)
			 * compress_count = map_bh->b_private
			 * first_logical_block
			 */
			if (get_block(inode, block_in_file, map_bh, 0)) //BLOCK_MAPPER
				goto confused;
			*first_logical_block = block_in_file;
		}
		/* generally is mapped.. so FALSE */
		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		
		//NEXT 3 => FALSE
		if (buffer_uptodate(map_bh)) {
			printk("\nIn map_buffer_to_page()");
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}
	
		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;

		nblocks = map_bh->b_size >> blkbits;
		printk(KERN_INFO "\nnblocks_mapped : %u", nblocks);
		//MAIN PART
		if (!*cb) {
			*cb = kzalloc(sizeof(struct compressed_bio), GFP_NOFS);
			if (!*cb) {
				/* ERROR */
				BUG_ON(1);
				err = -ENOMEM;
			}
			err = compressed_bio_init(*cb, inode, *first_logical_block, *(unsigned *)map_bh->b_private,
						  nblocks << PAGE_CACHE_SHIFT, 0);//compressed_len = 0
			if (err) {
				/* ERROR = -ENOMEM */
				err = -ENOMEM;
			}
			kfree((unsigned *)map_bh->b_private);
			map_bh->b_private = NULL;
		}
		
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;

			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
                //TRUE...REQ?
		SetPageMappedToDisk(page);
	}
	
	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
	    cleancache_get_page(page) == 0) {
		//FALSE
		printk(KERN_INFO "\nSet_Page_Uptodate");
		SetPageUptodate(page);
		goto confused;
	}
	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
/* 	if (bio && (*last_block_in_bio != blocks[0] - 1)) */
/* 		bio = mpage_bio_submit(READ, bio); */

/* alloc_new: */
/* 	if (bio == NULL) { */
/* 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), */
/* 			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)), */
/* 				GFP_KERNEL); */
/* 		if (bio == NULL) */
/* 			goto confused; */
/* 	} */

/* 	length = first_hole << blkbits; */
/* 	if (bio_add_page(bio, page, length, 0) < length) { */
/* 		bio = mpage_bio_submit(READ, bio); */
/* 		goto alloc_new; */
/* 	} */

/* 	relative_block = block_in_file - *first_logical_block; */
/* 	nblocks = map_bh->b_size >> blkbits; */
/* 	if ((buffer_boundary(map_bh) && relative_block == nblocks) || */
/* 	    (first_hole != blocks_per_page)) */
/* 		bio = mpage_bio_submit(READ, bio); */
/* 	else */
/* 		*last_block_in_bio = blocks[blocks_per_page - 1]; */
	
out:
	return bio;

confused:
	printk(KERN_INFO "\nCONFUSED !");
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	if (!PageUptodate(page))
	        block_read_full_page(page, get_block);
	else
		unlock_page(page);
	goto out;
}
Example #14
int __generic_block_fiemap(struct inode *inode,
			   struct fiemap_extent_info *fieinfo, loff_t start,
			   loff_t len, get_block_t *get_block)
{
	struct buffer_head map_bh;
	sector_t start_blk, last_blk;
	loff_t isize = i_size_read(inode);
	u64 logical = 0, phys = 0, size = 0;
	u32 flags = FIEMAP_EXTENT_MERGED;
	bool past_eof = false, whole_file = false;
	int ret = 0;

	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/*
	 * Either the i_mutex or other appropriate locking needs to be held
	 * since we expect isize to not change at all through the duration of
	 * this call.
	 */
	if (len >= isize) {
		whole_file = true;
		len = isize;
	}

	/*
	 * Some filesystems can't deal with being asked to map less than
	 * blocksize, so make sure our len is at least block length.
	 */
	if (logical_to_blk(inode, len) == 0)
		len = blk_to_logical(inode, 1);

	start_blk = logical_to_blk(inode, start);
	last_blk = logical_to_blk(inode, start + len - 1);

	do {
		/*
		 * we set b_size to the total size we want so it will map as
		 * many contiguous blocks as possible at once
		 */
		memset(&map_bh, 0, sizeof(struct buffer_head));
		map_bh.b_size = len;

		ret = get_block(inode, start_blk, &map_bh, 0);
		if (ret)
			break;

		/* HOLE */
		if (!buffer_mapped(&map_bh)) {
			start_blk++;

			/*
			 * We want to handle the case where there is an
			 * allocated block at the front of the file, and then
			 * nothing but holes up to the end of the file properly,
			 * to make sure that extent at the front gets properly
			 * marked with FIEMAP_EXTENT_LAST
			 */
			if (!past_eof &&
			    blk_to_logical(inode, start_blk) >= isize)
				past_eof = 1;

			/*
			 * First hole after going past the EOF, this is our
			 * last extent
			 */
			if (past_eof && size) {
				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
			} else if (size) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size, flags);
				size = 0;
			}

			/* if we have holes up to/past EOF then we're done */
			if (start_blk > last_blk || past_eof || ret)
				break;
		} else {
			/*
			 * We have gone over the length of what we wanted to
			 * map, and it wasn't the entire file, so add the extent
			 * we got last time and exit.
			 *
			 * This is for the case where say we want to map all the
			 * way up to the second to the last block in a file, but
			 * the last block is a hole, making the second to last
			 * block FIEMAP_EXTENT_LAST.  In this case we want to
			 * see if there is a hole after the second to last block
			 * so we can mark it properly.  If we found data after
			 * we exceeded the length we were requesting, then we
			 * are good to go, just add the extent to the fieinfo
			 * and break
			 */
			if (start_blk > last_blk && !whole_file) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				break;
			}

			/*
			 * if size != 0 then we know we already have an extent
			 * to add, so add it.
			 */
			if (size) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				if (ret)
					break;
			}

			logical = blk_to_logical(inode, start_blk);
			phys = blk_to_logical(inode, map_bh.b_blocknr);
			size = map_bh.b_size;
			flags = FIEMAP_EXTENT_MERGED;

			start_blk += logical_to_blk(inode, size);

			/*
			 * If we are past the EOF, then we need to make sure as
			 * soon as we find a hole that the last extent we found
			 * is marked with FIEMAP_EXTENT_LAST
			 */
			if (!past_eof && logical + size >= isize)
				past_eof = true;
		}
		cond_resched();
	} while (1);

	/* If ret is 1 then we just hit the end of the extent array */
	if (ret == 1)
		ret = 0;

	return ret;
}
Example #15
static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	struct inode *inode = mapping->host;
	struct block_device *bdev = inode->i_sb->s_bdev;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	start_index = start >> PAGE_CACHE_SHIFT;
	end_index = end >> PAGE_CACHE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed. This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
 *	not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent. If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released. We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	goto out;
}
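
The kernel-doc above says filesystems call __dax_fault() from their fault handler after taking whatever locks the fault needs.  Below is a minimal illustrative sketch, not part of the listing, of how a filesystem that never returns unwritten extents might wrap it, following the same pattern as the dax_pmd_fault() wrapper shown later; fs_dax_fault(), fs_get_block() and fs_dax_vm_ops are hypothetical names.

static int fs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		/* write faults need pagefault protection and an mtime update */
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	/* pass NULL: this hypothetical fs never maps unwritten extents */
	ret = __dax_fault(vma, vmf, fs_get_block, NULL);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return ret;
}

static const struct vm_operations_struct fs_dax_vm_ops = {
	.fault = fs_dax_fault,
};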
Example #16
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs the largest possible bios, submitting them for IO
 * when the blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	if (page_has_buffers(page))
		goto confused;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			*first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			clear_buffer_mapped(map_bh);
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}
	
		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		char *kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + (first_hole << blkbits), 0,
				PAGE_CACHE_SIZE - (first_hole << blkbits));
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		if (first_hole == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);

alloc_new:
	if (bio == NULL) {
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}
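
The comment at the top of do_mpage_readpage() notes that a buffer_head is passed back and forth between calls: the caller keeps map_bh and first_logical_block alive across pages so that one get_block() result can map blocks on several consecutive pages.  A condensed illustrative sketch of such a calling loop, roughly the shape of mpage_readpages() with prefetching and error handling trimmed:

static int read_pages_sketch(struct address_space *mapping,
			     struct list_head *pages, unsigned nr_pages,
			     get_block_t get_block)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;
	unsigned long first_logical_block = 0;
	unsigned page_idx;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping, page->index,
					   GFP_KERNEL))
			/* map_bh carries any still-valid mapping forward */
			bio = do_mpage_readpage(bio, page,
					nr_pages - page_idx,
					&last_block_in_bio, &map_bh,
					&first_logical_block, get_block);
		page_cache_release(page);
	}
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	return 0;
}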
Example #17
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
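
As with the PTE path, a filesystem would normally expose this through its vm_operations_struct.  An illustrative hookup, extending the hypothetical fs_dax_vm_ops sketch shown after __dax_fault() above (fs_get_block() remains a hypothetical get_block_t):

static int fs_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			    pmd_t *pmd, unsigned int flags)
{
	/* dax_pmd_fault() takes sb_start_pagefault() for write faults itself */
	return dax_pmd_fault(vma, addr, pmd, flags, fs_get_block, NULL);
}

static const struct vm_operations_struct fs_dax_vm_ops = {
	.fault		= fs_dax_fault,
	.pmd_fault	= fs_dax_pmd_fault,
};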
Example #18
/*
 * Modify the inode page cache in the following way:
 * we have - blocks with b_blocknr equal to oldb...oldb+count-1
 * we want - blocks with b_blocknr equal to newb...newb+count-1
 * We also assume that the blocks oldb...oldb+count-1 are situated
 * at the end of the file.
 *
 * We can get here from ufs_writepage or ufs_prepare_write;
 * locked_page is an argument of those functions, so it is already locked.
 */
static void ufs_change_blocknr(struct inode *inode, sector_t beg,
			       unsigned int count, sector_t oldb,
			       sector_t newb, struct page *locked_page)
{
	const unsigned blks_per_page =
		1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	const unsigned mask = blks_per_page - 1;
	struct address_space * const mapping = inode->i_mapping;
	pgoff_t index, cur_index, last_index;
	unsigned pos, j, lblock;
	sector_t end, i;
	struct page *page;
	struct buffer_head *head, *bh;

	UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
	      inode->i_ino, count,
	     (unsigned long long)oldb, (unsigned long long)newb);

	BUG_ON(!locked_page);
	BUG_ON(!PageLocked(locked_page));

	cur_index = locked_page->index;
	end = count + beg;
	last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	for (i = beg; i < end; i = (i | mask) + 1) {
		index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);

		if (likely(cur_index != index)) {
			page = ufs_get_locked_page(mapping, index);
			if (!page)/* it was truncated */
				continue;
			if (IS_ERR(page)) {/* or EIO */
				ufs_error(inode->i_sb, __func__,
					  "read of page %llu failed\n",
					  (unsigned long long)index);
				continue;
			}
		} else
			page = locked_page;

		head = page_buffers(page);
		bh = head;
		pos = i & mask;
		for (j = 0; j < pos; ++j)
			bh = bh->b_this_page;


		if (unlikely(index == last_index))
			lblock = end & mask;
		else
			lblock = blks_per_page;

		do {
			if (j >= lblock)
				break;
			pos = (i - beg) + j;

			if (!buffer_mapped(bh))
				map_bh(bh, inode->i_sb, oldb + pos);
			if (!buffer_uptodate(bh)) {
				ll_rw_block(READ, 1, &bh);
				wait_on_buffer(bh);
				if (!buffer_uptodate(bh)) {
					ufs_error(inode->i_sb, __func__,
						  "read of block failed\n");
					break;
				}
			}

			UFSD(" change from %llu to %llu, pos %u\n",
			     (unsigned long long)(pos + oldb),
			     (unsigned long long)(pos + newb), pos);

			bh->b_blocknr = newb + pos;
			unmap_underlying_metadata(bh->b_bdev,
						  bh->b_blocknr);
			mark_buffer_dirty(bh);
			++j;
			bh = bh->b_this_page;
		} while (bh != head);

		if (likely(cur_index != index))
			ufs_put_locked_page(page);
 	}
	UFSD("EXIT\n");
}
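
The page and buffer lookups above use the standard arithmetic for splitting a file-relative block number into a page index and a buffer position within that page.  A small illustrative helper, not part of the listing, assuming the page already has buffers attached:

static struct buffer_head *blk_to_bh(struct page *page, sector_t blk,
				     unsigned blkbits)
{
	const unsigned blks_per_page = 1 << (PAGE_CACHE_SHIFT - blkbits);
	const unsigned mask = blks_per_page - 1;
	/* the page holding this block has index blk >> (PAGE_CACHE_SHIFT - blkbits) */
	struct buffer_head *bh = page_buffers(page);
	unsigned pos = blk & mask;

	/* walk the circular buffer list to the block's slot in the page */
	while (pos--)
		bh = bh->b_this_page;
	return bh;
}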
Example #19
/**
 * sync_mft_mirror - synchronize an mft record to the mft mirror
 * @ni:		ntfs inode whose mft record to synchronize
 * @m:		mapped, mst protected (extent) mft record to synchronize
 * @sync:	if true, wait for i/o completion
 *
 * Write the mapped, mst protected (extent) mft record @m described by the
 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
 *
 * On success return 0.  On error return -errno and set the volume errors flag
 * in the ntfs_volume to which @ni belongs.
 *
 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
 *
 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
 * schedule i/o via ->writepage or do it via kntfsd or whatever.
 */
static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
	ntfs_volume *vol = ni->vol;
	struct page *page;
	unsigned int blocksize = vol->sb->s_blocksize;
	int max_bhs = vol->mft_record_size / blocksize;
	struct buffer_head *bhs[max_bhs];
	struct buffer_head *bh, *head;
	u8 *kmirr;
	unsigned int block_start, block_end, m_start, m_end;
	int i_bhs, nr_bhs, err = 0;

	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
	BUG_ON(!max_bhs);
	if (unlikely(!vol->mftmirr_ino)) {
		/* This could happen during umount... */
		err = sync_mft_mirror_umount(ni, m);
		if (likely(!err))
			return err;
		goto err_out;
	}
	/* Get the page containing the mirror copy of the mft record @m. */
	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >>
			(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
	if (unlikely(IS_ERR(page))) {
		ntfs_error(vol->sb, "Failed to map mft mirror page.");
		err = PTR_ERR(page);
		goto err_out;
	}
	/*
	 * Exclusion against other writers.  This should never be a problem
	 * since the page in which the mft record @m resides is also locked,
	 * and hence any other writers would be held up there, but it is
	 * better to make sure no one is writing from elsewhere.
	 */
	lock_page(page);
	/* The address in the page of the mirror copy of the mft record @m. */
	kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits)
			& ~PAGE_CACHE_MASK);
	/* Copy the mst protected mft record to the mirror. */
	memcpy(kmirr, m, vol->mft_record_size);
	/* Make sure we have mapped buffers. */
	if (!page_has_buffers(page)) {
no_buffers_err_out:
		ntfs_error(vol->sb, "Writing mft mirror records without "
				"existing buffers is not implemented yet.  %s",
				ntfs_please_email);
		err = -EOPNOTSUPP;
		goto unlock_err_out;
	}
	bh = head = page_buffers(page);
	if (!bh)
		goto no_buffers_err_out;
	nr_bhs = 0;
	block_start = 0;
	m_start = kmirr - (u8*)page_address(page);
	m_end = m_start + vol->mft_record_size;
	do {
		block_end = block_start + blocksize;
		/*
		 * If the buffer is outside the mft record, just skip it,
		 * clearing it if it is dirty to make sure it is not written
		 * out.  It should never be marked dirty but better be safe.
		 */
		if ((block_end <= m_start) || (block_start >= m_end)) {
			if (buffer_dirty(bh)) {
				ntfs_warning(vol->sb, "Clearing dirty mft "
						"record page buffer.  %s",
						ntfs_please_email);
				clear_buffer_dirty(bh);
			}
			continue;
		}
		if (!buffer_mapped(bh)) {
			ntfs_error(vol->sb, "Writing mft mirror records "
					"without existing mapped buffers is "
					"not implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		if (!buffer_uptodate(bh)) {
			ntfs_error(vol->sb, "Writing mft mirror records "
					"without existing uptodate buffers is "
					"not implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		BUG_ON(!nr_bhs && (m_start != block_start));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
	} while (block_start = block_end, (bh = bh->b_this_page) != head);
	if (likely(!err)) {
		/* Lock buffers and start synchronous write i/o on them. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
			struct buffer_head *tbh = bhs[i_bhs];

			if (unlikely(test_set_buffer_locked(tbh)))
				BUG();
			BUG_ON(!buffer_uptodate(tbh));
			if (buffer_dirty(tbh))
				clear_buffer_dirty(tbh);
			get_bh(tbh);
			tbh->b_end_io = end_buffer_write_sync;
			submit_bh(WRITE, tbh);
		}
		/* Wait on i/o completion of buffers. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
			struct buffer_head *tbh = bhs[i_bhs];

			wait_on_buffer(tbh);
			if (unlikely(!buffer_uptodate(tbh))) {
				err = -EIO;
				/*
				 * Set the buffer uptodate so the page & buffer
				 * states don't become out of sync.
				 */
				if (PageUptodate(page))
					set_buffer_uptodate(tbh);
			}
		}
	} else /* if (unlikely(err)) */ {
		/* Clean the buffers. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
			clear_buffer_dirty(bhs[i_bhs]);
	}
unlock_err_out:
	/* Current state: all buffers are clean, unlocked, and uptodate. */
	/* Remove the mst protection fixups again. */
	post_write_mst_fixup((NTFS_RECORD*)kmirr);
	flush_dcache_page(page);
	unlock_page(page);
	ntfs_unmap_page(page);
	if (unlikely(err)) {
		/* I/O error during writing.  This is really bad! */
		ntfs_error(vol->sb, "I/O error while writing mft mirror "
				"record 0x%lx!  You should unmount the volume "
				"and run chkdsk or ntfsfix.", ni->mft_no);
		goto err_out;
	}
	ntfs_debug("Done.");
	return 0;
err_out:
	ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i).  "
			"Volume will be left marked dirty on umount.  Run "
			"ntfsfix on the partition after umounting to correct "
			"this.", -err);
	/* We don't want to clear the dirty bit on umount. */
	NVolSetErrors(vol);
	return err;
}
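
Stripped of the NTFS-specific checks, the listing above uses the classic synchronous buffer-head write pattern: lock each buffer, clear its dirty bit, submit it with end_buffer_write_sync() as the completion handler, then wait on all of them.  A reduced illustrative sketch of just that pattern (not NTFS-specific; error handling trimmed to reporting -EIO):

static int write_bhs_sync(struct buffer_head **bhs, int nr_bhs)
{
	int i, err = 0;

	for (i = 0; i < nr_bhs; i++) {
		struct buffer_head *bh = bhs[i];

		lock_buffer(bh);
		clear_buffer_dirty(bh);
		get_bh(bh);		/* reference dropped by b_end_io */
		bh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, bh);
	}
	/* wait for i/o completion on every buffer before reporting status */
	for (i = 0; i < nr_bhs; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}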