예제 #1
0
/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static int
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret;

	trace_xfs_filemap_fault(ip, pe_size, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
	} else {
		if (write_fault)
			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
		else
			ret = filemap_fault(vmf);
	}
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
예제 #2
0
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_aio_read(iocb, to);
	else
		ret = xfs_file_buffered_aio_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}
예제 #3
0
/*
 * pfn_mkwrite was originally inteneded to ensure we capture time stamp
 * updates on write faults. In reality, it's need to serialise against
 * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
 * to ensure we serialise the fault barrier in place.
 */
static int
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{

	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret = VM_FAULT_NOPAGE;
	loff_t			size;

	trace_xfs_filemap_pfn_mkwrite(ip);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* check if the faulting page hasn't raced with truncate */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		ret = VM_FAULT_SIGBUS;
	else if (IS_DAX(inode))
		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);
	return ret;

}
예제 #4
0
/*
 * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
 * both read and write faults. Hence we need to handle both cases. There is no
 * ->huge_mkwrite callout for huge pages, so we have a single function here to
 * handle both cases here. @flags carries the information on the type of fault
 * occuring.
 */
STATIC int
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret;

	if (!IS_DAX(inode))
		return VM_FAULT_FALLBACK;

	trace_xfs_filemap_huge_fault(ip);

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}
예제 #5
0
/*
 * mmap()d file has taken write protection fault and is being made writable. We
 * can set the page state up correctly for a writable page, which means we can
 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
 * mapping.
 */
STATIC int
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	int			ret;

	trace_xfs_filemap_page_mkwrite(XFS_I(inode));

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (IS_DAX(inode)) {
		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
	} else {
		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
		ret = block_page_mkwrite_return(ret);
	}

	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);

	return ret;
}
예제 #6
0
파일: xfs_ioctl.c 프로젝트: asmalldev/linux
/*
 * If we are changing DAX flags, we have to ensure the file is clean and any
 * cached objects in the address space are invalidated and removed. This
 * requires us to lock out other IO and page faults similar to a truncate
 * operation. The locks need to be held until the transaction has been committed
 * so that the cache invalidation is atomic with respect to the DAX flag
 * manipulation.
 */
static int
xfs_ioctl_setattr_dax_invalidate(
	struct xfs_inode	*ip,
	struct fsxattr		*fa,
	int			*join_flags)
{
	struct inode		*inode = VFS_I(ip);
	int			error;

	*join_flags = 0;

	/*
	 * It is only valid to set the DAX flag on regular files and
	 * directories on filesystems where the block size is equal to the page
	 * size. On directories it serves as an inherit hint.
	 */
	if (fa->fsx_xflags & FS_XFLAG_DAX) {
		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
			return -EINVAL;
		if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
			return -EINVAL;
	}

	/* If the DAX state is not changing, we have nothing to do here. */
	if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
		return 0;
	if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
		return 0;

	/* lock, flush and invalidate mapping in preparation for flag change */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
	error = filemap_write_and_wait(inode->i_mapping);
	if (error)
		goto out_unlock;
	error = invalidate_inode_pages2(inode->i_mapping);
	if (error)
		goto out_unlock;

	*join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
	return 0;

out_unlock:
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
	return error;

}
예제 #7
0
파일: file.c 프로젝트: Anjali05/linux
static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
#ifdef CONFIG_FS_DAX
	if (IS_DAX(iocb->ki_filp->f_mapping->host))
		return ext2_dax_write_iter(iocb, from);
#endif
	return generic_file_write_iter(iocb, from);
}
예제 #8
0
static int
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			(vmf->flags & FAULT_FLAG_WRITE));
}
예제 #9
0
파일: file.c 프로젝트: Anjali05/linux
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &ext2_dax_vm_ops;
	return 0;
}
예제 #10
0
파일: xfs_file.c 프로젝트: Lyude/linux
STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	/*
	 * We don't support synchronous mappings for non-DAX files. At least
	 * until someone comes with a sensible use case.
	 */
	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
		return -EOPNOTSUPP;

	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}
예제 #11
0
파일: file.c 프로젝트: 020gzh/linux
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &ext2_dax_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
예제 #12
0
STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
예제 #13
0
static int
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, pe_size,
			(vmf->flags & FAULT_FLAG_WRITE));
}
예제 #14
0
STATIC int
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	int			ret;

	trace_xfs_filemap_fault(XFS_I(inode));

	/* DAX can shortcut the normal fault path on write faults! */
	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
		return xfs_filemap_page_mkwrite(vmf);

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode))
		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
	else
		ret = filemap_fault(vmf);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	return ret;
}
예제 #15
0
/*
 * xfs_iozero clears the specified range supplied via the page cache (except in
 * the DAX case). Writes through the page cache will allocate blocks over holes,
 * though the callers usually map the holes first and avoid them. If a block is
 * not completely zeroed, then it will be read from disk before being partially
 * zeroed.
 *
 * In the DAX case, we can just directly write to the underlying pages. This
 * will not allocate blocks, but will avoid holes and unwritten extents and so
 * not do unnecessary work.
 */
int
xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
	size_t			count)	/* size of data to zero		*/
{
	struct page		*page;
	struct address_space	*mapping;
	int			status = 0;


	mapping = VFS_I(ip)->i_mapping;
	do {
		unsigned offset, bytes;
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		if (IS_DAX(VFS_I(ip))) {
			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
						     xfs_get_blocks_direct);
			if (status)
				break;
		} else {
			status = pagecache_write_begin(NULL, mapping, pos, bytes,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
			if (status)
				break;

			zero_user(page, offset, bytes);

			status = pagecache_write_end(NULL, mapping, pos, bytes,
						bytes, page, fsdata);
			WARN_ON(status <= 0); /* can't return less than zero! */
			status = 0;
		}
		pos += bytes;
		count -= bytes;
	} while (count);

	return status;
}
예제 #16
0
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_write(iocb, from);
	else if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret == -EREMCHG)
			goto buffered;
	} else {
buffered:
		ret = xfs_file_buffered_aio_write(iocb, from);
	}

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
예제 #17
0
STATIC ssize_t
xfs_file_splice_read(
	struct file		*infilp,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			count,
	unsigned int		flags)
{
	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(ip->i_mount, xs_read_calls);

	if (infilp->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);

	/*
	 * DAX inodes cannot ues the page cache for splice, so we have to push
	 * them through the VFS IO path. This means it goes through
	 * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
	 * cannot lock the splice operation at this level for DAX inodes.
	 */
	if (IS_DAX(VFS_I(ip))) {
		ret = default_file_splice_read(infilp, ppos, pipe, count,
					       flags);
		goto out;
	}

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
out:
	if (ret > 0)
		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
	return ret;
}
예제 #18
0
파일: xfs_file.c 프로젝트: Lyude/linux
/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	vm_fault_t		ret;

	trace_xfs_filemap_fault(ip, pe_size, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		pfn_t pfn;

		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	} else {
		if (write_fault)
			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
		else
			ret = filemap_fault(vmf);
	}
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
예제 #19
0
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file_inode(filp);
	struct super_block *sb = inode->i_sb;
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int flags;

	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);

	switch (cmd) {
	case FS_IOC_GETFSMAP:
		return ext4_ioc_getfsmap(sb, (void __user *)arg);
	case EXT4_IOC_GETFLAGS:
		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
		return put_user(flags, (int __user *) arg);
	case EXT4_IOC_SETFLAGS: {
		int err;

		if (!inode_owner_or_capable(inode))
			return -EACCES;

		if (get_user(flags, (int __user *) arg))
			return -EFAULT;

		if (flags & ~EXT4_FL_USER_VISIBLE)
			return -EOPNOTSUPP;
		/*
		 * chattr(1) grabs flags via GETFLAGS, modifies the result and
		 * passes that to SETFLAGS. So we cannot easily make SETFLAGS
		 * more restrictive than just silently masking off visible but
		 * not settable flags as we always did.
		 */
		flags &= EXT4_FL_USER_MODIFIABLE;
		if (ext4_mask_flags(inode->i_mode, flags) != flags)
			return -EOPNOTSUPP;

		err = mnt_want_write_file(filp);
		if (err)
			return err;

		inode_lock(inode);
		err = ext4_ioctl_setflags(inode, flags);
		inode_unlock(inode);
		mnt_drop_write_file(filp);
		return err;
	}
	case EXT4_IOC_GETVERSION:
	case EXT4_IOC_GETVERSION_OLD:
		return put_user(inode->i_generation, (int __user *) arg);
	case EXT4_IOC_SETVERSION:
	case EXT4_IOC_SETVERSION_OLD: {
		handle_t *handle;
		struct ext4_iloc iloc;
		__u32 generation;
		int err;

		if (!inode_owner_or_capable(inode))
			return -EPERM;

		if (ext4_has_metadata_csum(inode->i_sb)) {
			ext4_warning(sb, "Setting inode version is not "
				     "supported with metadata_csum enabled.");
			return -ENOTTY;
		}

		err = mnt_want_write_file(filp);
		if (err)
			return err;
		if (get_user(generation, (int __user *) arg)) {
			err = -EFAULT;
			goto setversion_out;
		}

		inode_lock(inode);
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
		if (IS_ERR(handle)) {
			err = PTR_ERR(handle);
			goto unlock_out;
		}
		err = ext4_reserve_inode_write(handle, inode, &iloc);
		if (err == 0) {
			inode->i_ctime = current_time(inode);
			inode->i_generation = generation;
			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
		}
		ext4_journal_stop(handle);

unlock_out:
		inode_unlock(inode);
setversion_out:
		mnt_drop_write_file(filp);
		return err;
	}
	case EXT4_IOC_GROUP_EXTEND: {
		ext4_fsblk_t n_blocks_count;
		int err, err2=0;

		err = ext4_resize_begin(sb);
		if (err)
			return err;

		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
			err = -EFAULT;
			goto group_extend_out;
		}

		if (ext4_has_feature_bigalloc(sb)) {
			ext4_msg(sb, KERN_ERR,
				 "Online resizing not supported with bigalloc");
			err = -EOPNOTSUPP;
			goto group_extend_out;
		}

		err = mnt_want_write_file(filp);
		if (err)
			goto group_extend_out;

		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
		if (EXT4_SB(sb)->s_journal) {
			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
		}
		if (err == 0)
			err = err2;
		mnt_drop_write_file(filp);
group_extend_out:
		ext4_resize_end(sb);
		return err;
	}

	case EXT4_IOC_MOVE_EXT: {
		struct move_extent me;
		struct fd donor;
		int err;

		if (!(filp->f_mode & FMODE_READ) ||
		    !(filp->f_mode & FMODE_WRITE))
			return -EBADF;

		if (copy_from_user(&me,
			(struct move_extent __user *)arg, sizeof(me)))
			return -EFAULT;
		me.moved_len = 0;

		donor = fdget(me.donor_fd);
		if (!donor.file)
			return -EBADF;

		if (!(donor.file->f_mode & FMODE_WRITE)) {
			err = -EBADF;
			goto mext_out;
		}

		if (ext4_has_feature_bigalloc(sb)) {
			ext4_msg(sb, KERN_ERR,
				 "Online defrag not supported with bigalloc");
			err = -EOPNOTSUPP;
			goto mext_out;
		} else if (IS_DAX(inode)) {
			ext4_msg(sb, KERN_ERR,
				 "Online defrag not supported with DAX");
			err = -EOPNOTSUPP;
			goto mext_out;
		}

		err = mnt_want_write_file(filp);
		if (err)
			goto mext_out;

		err = ext4_move_extents(filp, donor.file, me.orig_start,
					me.donor_start, me.len, &me.moved_len);
		mnt_drop_write_file(filp);

		if (copy_to_user((struct move_extent __user *)arg,
				 &me, sizeof(me)))
			err = -EFAULT;
mext_out:
		fdput(donor);
		return err;
	}

	case EXT4_IOC_GROUP_ADD: {
		struct ext4_new_group_data input;

		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
				sizeof(input)))
			return -EFAULT;

		return ext4_ioctl_group_add(filp, &input);
	}

	case EXT4_IOC_MIGRATE:
	{
		int err;
		if (!inode_owner_or_capable(inode))
			return -EACCES;

		err = mnt_want_write_file(filp);
		if (err)
			return err;
		/*
		 * inode_mutex prevent write and truncate on the file.
		 * Read still goes through. We take i_data_sem in
		 * ext4_ext_swap_inode_data before we switch the
		 * inode format to prevent read.
		 */
		inode_lock((inode));
		err = ext4_ext_migrate(inode);
		inode_unlock((inode));
		mnt_drop_write_file(filp);
		return err;
	}

	case EXT4_IOC_ALLOC_DA_BLKS:
	{
		int err;
		if (!inode_owner_or_capable(inode))
			return -EACCES;

		err = mnt_want_write_file(filp);
		if (err)
			return err;
		err = ext4_alloc_da_blocks(inode);
		mnt_drop_write_file(filp);
		return err;
	}

	case EXT4_IOC_SWAP_BOOT:
	{
		int err;
		if (!(filp->f_mode & FMODE_WRITE))
			return -EBADF;
		err = mnt_want_write_file(filp);
		if (err)
			return err;
		err = swap_inode_boot_loader(sb, inode);
		mnt_drop_write_file(filp);
		return err;
	}

	case EXT4_IOC_RESIZE_FS: {
		ext4_fsblk_t n_blocks_count;
		int err = 0, err2 = 0;
		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;

		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
				   sizeof(__u64))) {
			return -EFAULT;
		}

		err = ext4_resize_begin(sb);
		if (err)
			return err;

		err = mnt_want_write_file(filp);
		if (err)
			goto resizefs_out;

		err = ext4_resize_fs(sb, n_blocks_count);
		if (EXT4_SB(sb)->s_journal) {
			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
		}
		if (err == 0)
			err = err2;
		mnt_drop_write_file(filp);
		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
		    ext4_has_group_desc_csum(sb) &&
		    test_opt(sb, INIT_INODE_TABLE))
			err = ext4_register_li_request(sb, o_group);

resizefs_out:
		ext4_resize_end(sb);
		return err;
	}

	case FITRIM:
	{
		struct request_queue *q = bdev_get_queue(sb->s_bdev);
		struct fstrim_range range;
		int ret = 0;

		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;

		if (!blk_queue_discard(q))
			return -EOPNOTSUPP;

		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
		    sizeof(range)))
			return -EFAULT;

		range.minlen = max((unsigned int)range.minlen,
				   q->limits.discard_granularity);
		ret = ext4_trim_fs(sb, &range);
		if (ret < 0)
			return ret;

		if (copy_to_user((struct fstrim_range __user *)arg, &range,
		    sizeof(range)))
			return -EFAULT;

		return 0;
	}
	case EXT4_IOC_PRECACHE_EXTENTS:
		return ext4_ext_precache(inode);

	case EXT4_IOC_SET_ENCRYPTION_POLICY:
		if (!ext4_has_feature_encrypt(sb))
			return -EOPNOTSUPP;
		return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);

	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
		int err, err2;
		struct ext4_sb_info *sbi = EXT4_SB(sb);
		handle_t *handle;

		if (!ext4_has_feature_encrypt(sb))
			return -EOPNOTSUPP;
		if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
			err = mnt_want_write_file(filp);
			if (err)
				return err;
			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
			if (IS_ERR(handle)) {
				err = PTR_ERR(handle);
				goto pwsalt_err_exit;
			}
			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
			if (err)
				goto pwsalt_err_journal;
			generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
			err = ext4_handle_dirty_metadata(handle, NULL,
							 sbi->s_sbh);
		pwsalt_err_journal:
			err2 = ext4_journal_stop(handle);
			if (err2 && !err)
				err = err2;
		pwsalt_err_exit:
			mnt_drop_write_file(filp);
			if (err)
				return err;
		}
		if (copy_to_user((void __user *) arg,
				 sbi->s_es->s_encrypt_pw_salt, 16))
			return -EFAULT;
		return 0;
#else
		return -EOPNOTSUPP;
#endif
	}
	case EXT4_IOC_GET_ENCRYPTION_POLICY:
		return fscrypt_ioctl_get_policy(filp, (void __user *)arg);

	case EXT4_IOC_FSGETXATTR:
	{
		struct fsxattr fa;

		memset(&fa, 0, sizeof(struct fsxattr));
		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);

		if (ext4_has_feature_project(inode->i_sb)) {
			fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
				EXT4_I(inode)->i_projid);
		}

		if (copy_to_user((struct fsxattr __user *)arg,
				 &fa, sizeof(fa)))
			return -EFAULT;
		return 0;
	}
	case EXT4_IOC_FSSETXATTR:
	{
		struct fsxattr fa;
		int err;

		if (copy_from_user(&fa, (struct fsxattr __user *)arg,
				   sizeof(fa)))
			return -EFAULT;

		/* Make sure caller has proper permission */
		if (!inode_owner_or_capable(inode))
			return -EACCES;

		if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
			return -EOPNOTSUPP;

		flags = ext4_xflags_to_iflags(fa.fsx_xflags);
		if (ext4_mask_flags(inode->i_mode, flags) != flags)
			return -EOPNOTSUPP;

		err = mnt_want_write_file(filp);
		if (err)
			return err;

		inode_lock(inode);
		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
			 (flags & EXT4_FL_XFLAG_VISIBLE);
		err = ext4_ioctl_setflags(inode, flags);
		inode_unlock(inode);
		mnt_drop_write_file(filp);
		if (err)
			return err;

		err = ext4_ioctl_setproject(filp, fa.fsx_projid);
		if (err)
			return err;

		return 0;
	}
	case EXT4_IOC_SHUTDOWN:
		return ext4_shutdown(sb, arg);
	default:
		return -ENOTTY;
	}
}
예제 #20
0
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = iov_iter_count(to);
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	loff_t			pos = iocb->ki_pos;

	XFS_STATS_INC(mp, xs_read_calls);

	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
		ioflags |= XFS_IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		/* DIO must be aligned to device logical sector size */
		if ((pos | size) & target->bt_logical_sectormask) {
			if (pos == i_size_read(inode))
				return 0;
			return -EINVAL;
		}
	}

	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock for direct
	 * IO, we effectively serialise all new concurrent read IO to this file
	 * and block it behind IO that is currently in progress because IO in
	 * progress holds the IO lock shared. We only need to hold the lock
	 * exclusive to blow away the page cache, so only take lock exclusively
	 * if the page cache needs invalidation. This allows the normal direct
	 * IO case of no page cache pages to proceeed concurrently without
	 * serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		/*
		 * The generic dio code only flushes the range of the particular
		 * I/O. Because we take an exclusive lock here, this whole
		 * sequence is considerably more expensive for us. This has a
		 * noticeable performance impact for any file with cached pages,
		 * even when outside of the range of the particular I/O.
		 *
		 * Hence, amortize the cost of the lock against a full file
		 * flush and reduce the chances of repeated iolock cycles going
		 * forward.
		 */
		if (inode->i_mapping->nrpages) {
			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}

			/*
			 * Invalidate whole pages. This can return an error if
			 * we fail to invalidate a page, but this should never
			 * happen on XFS. Warn if it does fail.
			 */
			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
			WARN_ON_ONCE(ret);
			ret = 0;
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(ip, size, pos, ioflags);

	ret = generic_file_read_iter(iocb, to);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);

	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}
예제 #21
0
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			end;
	struct iov_iter		data;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if (!IS_DAX(inode) &&
	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
		return -EINVAL;

	/* "unaligned" here means not aligned to a filesystem block */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask))
		unaligned_io = 1;

	/*
	 * We don't need to take an exclusive lock unless there page cache needs
	 * to be invalidated or unaligned IO is being executed. We don't need to
	 * consider the EOF extension case here because
	 * xfs_file_aio_write_checks() will relock the inode as necessary for
	 * EOF zeroing cases and fill out the new inode size as appropriate.
	 */
	if (unaligned_io || mapping->nrpages)
		iolock = XFS_IOLOCK_EXCL;
	else
		iolock = XFS_IOLOCK_SHARED;
	xfs_rw_ilock(ip, iolock);

	/*
	 * Recheck if there are cached pages that need invalidate after we got
	 * the iolock to protect against other threads adding new pages while
	 * we were waiting for the iolock.
	 */
	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);
	end = iocb->ki_pos + count - 1;

	/*
	 * See xfs_file_read_iter() for why we do a full-file flush here.
	 */
	if (mapping->nrpages) {
		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
		if (ret)
			goto out;
		/*
		 * Invalidate whole pages. This can return an error if we fail
		 * to invalidate a page, but this should never happen on XFS.
		 * Warn if it does fail.
		 */
		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to flush cached pages
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);

	data = *from;
	ret = mapping->a_ops->direct_IO(iocb, &data);

	/* see generic_file_direct_write() for why this is necessary */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      iocb->ki_pos >> PAGE_SHIFT,
					      end >> PAGE_SHIFT);
	}
예제 #22
0
파일: xfs_iops.c 프로젝트: Chong-Li/cse522
/*
 * Truncate file.  Must have write permission and not be a directory.
 */
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags = 0;
	bool			did_zeroing = false;

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return -EROFS;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	error = inode_change_ok(inode, iattr);
	if (error)
		return error;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
	ASSERT(S_ISREG(ip->i_d.di_mode));
	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
		if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
			return 0;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	/*
	 * File data changes must be complete before we start the transaction to
	 * modify the inode.  This needs to be done before joining the inode to
	 * the transaction because the inode cannot be unlocked once it is a
	 * part of the transaction.
	 *
	 * Start with zeroing any data block beyond EOF that we may expose on
	 * file extension.
	 */
	if (newsize > oldsize) {
		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
		if (error)
			return error;
	}

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem. Note that this includes any block zeroing we did above;
	 * otherwise those blocks may not be zeroed after a crash.
	 */
	if (newsize > ip->i_d.di_size &&
	    (oldsize != ip->i_d.di_size || did_zeroing)) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      ip->i_d.di_size, newsize);
		if (error)
			return error;
	}

	/* Now wait for all direct I/O to complete. */
	inode_dio_wait(inode);

	/*
	 * We've already locked out new page faults, so now we can safely remove
	 * pages from the page cache knowing they won't get refaulted until we
	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
	 * complete. The truncate_setsize() call also cleans partial EOF page
	 * PTEs on extending truncates and hence ensures sub-page block size
	 * filesystems are correctly handled, too.
	 *
	 * We have to do all the page cache truncate work outside the
	 * transaction context as the "lock" order is page lock->log space
	 * reservation as defined by extent allocation in the writeback path.
	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
	 * having already truncated the in-memory version of the file (i.e. made
	 * user visible changes). There's not much we can do about this, except
	 * to hope that the caller sees ENOMEM and retries the truncate
	 * operation.
	 */
	if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
	else
		error = block_truncate_page(inode->i_mapping, newsize,
					    xfs_get_blocks);
	if (error)
		return error;
	truncate_setsize(inode, newsize);

	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
	if (error)
		goto out_trans_cancel;

	lock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize &&
	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_d.di_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_cancel;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);

		/* A truncate down always removes post-EOF blocks. */
		xfs_inode_clear_eofblocks_tag(ip);
	}

	if (iattr->ia_valid & ATTR_MODE)
		xfs_setattr_mode(ip, iattr);
	if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
		xfs_setattr_time(ip, iattr);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(mp, xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
예제 #23
0
/*
 * Prepare two files for range cloning.  Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken.  This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
 * checked that the bytes beyond EOF physically match. Hence we cannot use the
 * EOF block in the source dedupe range because it's not a complete block match,
 * hence can introduce a corruption into the file that has it's block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
 * "block aligned" for the purposes of cloning entire files.  However, if the
 * source file range includes the EOF block and it lands within the existing EOF
 * of the destination file, then we can expose stale data from beyond the source
 * file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have check
 * these cases ourselves. For dedupe, we can simply round the length to dedupe
 * down to the previous whole block and ignore the partial EOF block. While this
 * means we can't dedupe the last block of a file, this is an acceptible
 * tradeoff for simplicity on implementation.
 *
 * For cloning, we want to share the partial EOF block if it is also the new EOF
 * block of the destination file. If the partial EOF block lies inside the
 * existing destination EOF, then we have to abort the clone to avoid exposing
 * stale data in the destination file. Hence we reject these clone attempts with
 * -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	bool			same_inode = (inode_in == inode_out);
	ssize_t			ret;

	/* Lock both files against IO */
	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
	if (ret)
		return ret;
	if (same_inode)
		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
				XFS_MMAPLOCK_EXCL);

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data for now. */
	if (IS_DAX(inode_in) || IS_DAX(inode_out))
		goto out_unlock;

	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
			len, remap_flags);
	if (ret < 0 || *len == 0)
		goto out_unlock;

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out. In that case, we need to extend the flush and unmap to cover
	 * from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	return 1;
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	return ret;
}
예제 #24
0
/*
 * Link a range of blocks from one file to another.
 */
int
xfs_reflink_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	u64			len,
	bool			is_dedupe)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	bool			same_inode = (inode_in == inode_out);
	xfs_fileoff_t		sfsbno, dfsbno;
	xfs_filblks_t		fsblen;
	xfs_extlen_t		cowextsize;
	ssize_t			ret;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return -EOPNOTSUPP;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* Lock both files against IO */
	lock_two_nondirectories(inode_in, inode_out);
	if (same_inode)
		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
	else
		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data for now. */
	if (IS_DAX(inode_in) || IS_DAX(inode_out))
		goto out_unlock;

	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
			&len, is_dedupe);
	if (ret <= 0)
		goto out_unlock;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
	fsblen = XFS_B_TO_FSB(mp, len);
	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
			pos_out + len);
	if (ret)
		goto out_unlock;

	/* Zap any page cache for the destination file's range. */
	truncate_inode_pages_range(&inode_out->i_data, pos_out,
				   PAGE_ALIGN(pos_out + len) - 1);

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_d.di_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			is_dedupe);

out_unlock:
	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
	if (!same_inode)
		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
	unlock_two_nondirectories(inode_in, inode_out);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return ret;
}
예제 #25
0
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = iov_iter_count(to);
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	loff_t			pos = iocb->ki_pos;

	XFS_STATS_INC(xs_read_calls);

	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
		ioflags |= XFS_IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		/* DIO must be aligned to device logical sector size */
		if ((pos | size) & target->bt_logical_sectormask) {
			if (pos == i_size_read(inode))
				return 0;
			return -EINVAL;
		}
	}

	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock
	 * for direct IO, we effectively serialise all new concurrent
	 * read IO to this file and block it behind IO that is currently in
	 * progress because IO in progress holds the IO lock shared. We only
	 * need to hold the lock exclusive to blow away the page cache, so
	 * only take lock exclusively if the page cache needs invalidation.
	 * This allows the normal direct IO case of no page cache pages to
	 * proceeed concurrently without serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		if (inode->i_mapping->nrpages) {
			ret = filemap_write_and_wait_range(
							VFS_I(ip)->i_mapping,
							pos, pos + size - 1);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}

			/*
			 * Invalidate whole pages. This can return an error if
			 * we fail to invalidate a page, but this should never
			 * happen on XFS. Warn if it does fail.
			 */
			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
					pos >> PAGE_CACHE_SHIFT,
					(pos + size - 1) >> PAGE_CACHE_SHIFT);
			WARN_ON_ONCE(ret);
			ret = 0;
		}