Example #1
/*
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written. Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed. For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 */
static int
xfs_vm_write_end(
        struct file             *file,
        struct address_space    *mapping,
        loff_t                  pos,
        unsigned                len,
        unsigned                copied,
        struct page             *page,
        void                    *fsdata)
{
        int                     ret;

        ASSERT(len <= PAGE_CACHE_SIZE);

        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
        if (unlikely(ret < len)) {
                struct inode    *inode = mapping->host;
                size_t          isize = i_size_read(inode);
                loff_t          to = pos + len;

                if (to > isize) {
                        /* only kill blocks in this write beyond EOF */
                        if (pos > isize)
                                isize = pos;
                        xfs_vm_kill_delalloc_range(inode, isize, to);
                        truncate_pagecache_range(inode, isize, to);
                }
        }
        return ret;
}
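
A standalone sketch of the range arithmetic above. It is illustrative only: punch_range() is a hypothetical stand-in for the xfs_vm_kill_delalloc_range()/truncate_pagecache_range() pair, and the point is that only [max(pos, isize), pos + len) gets discarded, so earlier successful writes beyond EOF survive.

/*
 * Illustrative userspace sketch; punch_range() is hypothetical and
 * stands in for killing delalloc blocks + truncating the page cache.
 */
#include <stdio.h>

typedef long long loff_demo_t;

static void punch_range(loff_demo_t start, loff_demo_t end)
{
	printf("punch delalloc/page cache in [%lld, %lld)\n", start, end);
}

static void write_end_cleanup(loff_demo_t pos, loff_demo_t len,
			      loff_demo_t isize)
{
	loff_demo_t to = pos + len;

	if (to <= isize)
		return;		/* fully within EOF: nothing to kill */
	if (pos > isize)
		isize = pos;	/* only kill blocks from *this* write */
	punch_range(isize, to);
}

int main(void)
{
	write_end_cleanup(100, 50, 120);	/* punches [120, 150) */
	write_end_cleanup(200, 50, 120);	/* punches [200, 250) */
	return 0;
}
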
int
xfs_zero_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint			granularity;
	xfs_off_t		start_boundary;
	xfs_off_t		end_boundary;
	int			error;

	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);

	/*
	 * Round the range of extents we are going to convert inwards.  If the
	 * offset is aligned, then it doesn't get changed so we zero from the
	 * start of the block offset points to.
	 */
	start_boundary = round_up(offset, granularity);
	end_boundary = round_down(offset + len, granularity);

	ASSERT(start_boundary >= offset);
	ASSERT(end_boundary <= offset + len);

	if (start_boundary < end_boundary - 1) {
		/* punch out the page cache over the conversion range */
		truncate_pagecache_range(VFS_I(ip), start_boundary,
					 end_boundary - 1);
		/* convert the blocks */
		error = xfs_alloc_file_space(ip, start_boundary,
					end_boundary - start_boundary - 1,
					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
		if (error)
			goto out;

		/* We've handled the interior of the range, now for the edges */
		if (start_boundary != offset) {
			error = xfs_iozero(ip, offset, start_boundary - offset);
			if (error)
				goto out;
		}

		if (end_boundary != offset + len)
			error = xfs_iozero(ip, end_boundary,
					   offset + len - end_boundary);

	} else {
		/*
		 * It's either a sub-granularity range or the range spanned lies
		 * partially across two adjacent blocks.
		 */
		error = xfs_iozero(ip, offset, len);
	}

out:
	return error;
}
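
The inward rounding can be exercised in isolation. A minimal sketch, assuming the granularity is a power of two (it is: the maximum of the filesystem block size and the page size):

/* Sketch of the inward rounding above; power-of-two granularity assumed. */
#include <assert.h>
#include <stdio.h>

static unsigned long long round_up_pow2(unsigned long long x,
					unsigned long long g)
{
	return (x + g - 1) & ~(g - 1);
}

static unsigned long long round_down_pow2(unsigned long long x,
					  unsigned long long g)
{
	return x & ~(g - 1);
}

int main(void)
{
	unsigned long long offset = 3000, len = 20000, granularity = 4096;
	unsigned long long start = round_up_pow2(offset, granularity);
	unsigned long long end = round_down_pow2(offset + len, granularity);

	assert(start >= offset && end <= offset + len);
	/* whole blocks in [start, end) get converted/preallocated; the
	 * sub-granularity edges must be zeroed byte-wise */
	printf("convert [%llu, %llu), zero [%llu, %llu) and [%llu, %llu)\n",
	       start, end, offset, start, end, offset + len);
	return 0;
}
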
Example #3
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}
int
xfs_swap_extents(
	xfs_inode_t	*ip,	/* target inode */
	xfs_inode_t	*tip,	/* tmp inode */
	xfs_swapext_t	*sxp)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_trans_t	*tp;
	xfs_bstat_t	*sbp = &sxp->sx_stat;
	xfs_ifork_t	*tempifp, *ifp, *tifp;
	int		src_log_flags, target_log_flags;
	int		error = 0;
	int		aforkblks = 0;
	int		taforkblks = 0;
	__uint64_t	tmp;

	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
	if (!tempifp) {
		error = XFS_ERROR(ENOMEM);
		goto out;
	}

	/*
	 * we have to do two separate lock calls here to keep lockdep
	 * happy. If we try to get all the locks in one call, lockdep will
	 * report false positives when we drop the ILOCK and regain them
	 * below.
	 */
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/* Verify that both files have the same format */
	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
	if (error)
		goto out_unlock;
	truncate_pagecache_range(VFS_I(tip), 0, -1);

	/* Verify O_DIRECT for ftmp */
	if (VN_CACHED(VFS_I(tip)) != 0) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_d.di_size ||
	    sxp->sx_length != tip->i_d.di_size) {
		error = XFS_ERROR(EFAULT);
		goto out_unlock;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_unlock;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to assure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	/* We need to fail if the file is memory mapped.  Once we have tossed
	 * all existing pages, the page fault will have no option
	 * but to go to the filesystem for pages. By making the page fault call
	 * vop_read (or write in the case of autogrow) they block on the iolock
	 * until we have switched the extents.
	 */
	if (VN_MAPPED(VFS_I(ip))) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);

	/*
	 * There is a race condition here since we gave up the
	 * ilock.  However, the data fork will not change since
	 * we have the iolock (locked for truncation too) so we
	 * are safe.  We don't really care if non-io related
	 * fields change.
	 */
	truncate_pagecache_range(VFS_I(ip), 0, -1);

	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
	if (error) {
		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		goto out;
	}
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/*
	 * Count the number of extended attribute blocks
	 */
	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
		if (error)
			goto out_trans_cancel;
	}
	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
			&taforkblks);
		if (error)
			goto out_trans_cancel;
	}

	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	/*
	 * Before we've swapped the forks, lets set the owners of the forks
	 * appropriately. We have to do this as we are demand paging the btree
	 * buffers, and so the validation done on read will expect the owner
	 * field to be correctly set. Once we change the owners, we can swap the
	 * inode forks.
	 *
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;
	if (ip->i_d.di_version == 3 &&
	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		target_log_flags |= XFS_ILOG_DOWNER;
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
					      tip->i_ino, NULL);
		if (error)
			goto out_trans_cancel;
	}

	if (tip->i_d.di_version == 3 &&
	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		src_log_flags |= XFS_ILOG_DOWNER;
		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
					      ip->i_ino, NULL);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	ifp = &ip->i_df;
	tifp = &tip->i_df;
	*tempifp = *ifp;	/* struct copy */
	*ifp = *tifp;		/* struct copy */
	*tifp = *tempifp;	/* struct copy */

	/*
	 * Fix the on-disk inode values
	 */
	tmp = (__uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	tmp = (__uint64_t) ip->i_d.di_nextents;
	ip->i_d.di_nextents = tip->i_d.di_nextents;
	tip->i_d.di_nextents = tmp;

	tmp = (__uint64_t) ip->i_d.di_format;
	ip->i_d.di_format = tip->i_d.di_format;
	tip->i_d.di_format = tmp;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			ifp->if_u1.if_extents =
				ifp->if_u2.if_inline_ext;
		}
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(ip->i_d.di_version < 3 ||
		       (src_log_flags & XFS_ILOG_DOWNER));
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			tifp->if_u1.if_extents =
				tifp->if_u2.if_inline_ext;
		}
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		ASSERT(tip->i_d.di_version < 3 ||
		       (target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	xfs_trans_log_inode(tp, ip,  src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, 0);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);
out:
	kmem_free(tempifp);
	return error;

out_unlock:
	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	goto out;

out_trans_cancel:
	xfs_trans_cancel(tp, 0);
	goto out_unlock;
}
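
The fork swap at the heart of xfs_swap_extents() is just three struct copies through a temporary; the kernel heap-allocates that temporary (the kmem_alloc() above), presumably to keep the stack frame small. A generic sketch of the same pattern:

/* Generic sketch of swapping two structs through a temporary. */
#include <stdio.h>

struct fork_demo {		/* stand-in for xfs_ifork_t */
	int nextents;
	void *root;
};

static void swap_forks(struct fork_demo *a, struct fork_demo *b)
{
	struct fork_demo tmp;

	tmp = *a;	/* struct copy */
	*a = *b;	/* struct copy */
	*b = tmp;	/* struct copy */
}

int main(void)
{
	struct fork_demo x = { 3, NULL }, y = { 7, NULL };

	swap_forks(&x, &y);
	printf("x.nextents=%d y.nextents=%d\n", x.nextents, y.nextents);
	return 0;
}
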
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	int			committed;
	int			done;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_extlen_t		mod=0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	xfs_off_t		rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;

	mp = ip->i_mount;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = XFS_IS_REALTIME_INODE(ip);
	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/* wait for the completion of any pending DIOs */
	inode_dio_wait(VFS_I(ip));

	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
	ioffset = offset & ~(rounding - 1);
	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
					      ioffset, -1);
	if (error)
		goto out;
	truncate_pagecache_range(VFS_I(ip), ioffset, -1);

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 * If it's a realtime file & can't use unwritten extents then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
		nimap = 1;
		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
					&imap, &nimap, 0);
		if (error)
			goto out;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		nimap = 1;
		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
					&imap, &nimap, 0);
		if (error)
			goto out;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {

		/*
		 * allocate and setup the transaction. Allow this
		 * transaction to dip into the reserve blocks to ensure
		 * the freeing of the space succeeds at ENOSPC.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		tp->t_flags |= XFS_TRANS_RESERVE;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota(tp, mp,
				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
				resblks, 0, XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		xfs_bmap_init(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, startoffset_fsb,
				  endoffset_fsb - startoffset_fsb,
				  0, 2, &firstfsb, &free_list, &done);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 out:
	return error;

 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	goto out;
}
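
The freeing loop above runs one transaction per iteration until xfs_bunmapi() sets *done. A minimal sketch of that loop shape, with fake_bunmapi() as a hypothetical stand-in that can only unmap a bounded amount per call:

/* Sketch of the "free until done or error" loop shape. */
#include <stdbool.h>
#include <stdio.h>

static int fake_bunmapi(long *start, long end, bool *done)
{
	*start += 2;			/* bounded work per transaction */
	if (*start >= end) {
		*start = end;
		*done = true;
	}
	return 0;			/* 0 == success */
}

int main(void)
{
	long start = 0, end = 7;
	bool done = false;
	int error = 0;

	while (!error && !done) {
		/* per iteration: allocate + reserve transaction, lock inode */
		error = fake_bunmapi(&start, end, &done);
		printf("freed up to %ld (done=%d)\n", start, done);
		/* ... commit the transaction, unlock the inode */
	}
	return error;
}
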
Example #6
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address
 * @pmd: The PMD entry to be installed
 * @flags: Fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: Callback used to convert unwritten extents after a
 *	write fault
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
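
The PTE-fallback tests at the top of __dax_pmd_fault() are plain address arithmetic. A minimal sketch, assuming the common x86-64 2 MiB PMD size and omitting the file-size checks:

/* Sketch of the PMD-coverage checks; 2 MiB PMD assumed. */
#include <stdbool.h>
#include <stdio.h>

#define PMD_SIZE_DEMO	(2UL << 20)
#define PMD_MASK_DEMO	(~(PMD_SIZE_DEMO - 1))

static bool pmd_fault_possible(unsigned long address,
			       unsigned long vm_start, unsigned long vm_end)
{
	unsigned long pmd_addr = address & PMD_MASK_DEMO;

	if (pmd_addr < vm_start)			/* start unaligned */
		return false;
	if (pmd_addr + PMD_SIZE_DEMO > vm_end)		/* end unaligned */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", pmd_fault_possible(0x200000, 0x200000, 0x600000));
	printf("%d\n", pmd_fault_possible(0x3ff000, 0x3ff000, 0x600000));
	return 0;
}
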
Example #7
static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
								int mode)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index, pg_start, pg_end;
	loff_t new_size = i_size_read(inode);
	loff_t off_start, off_end;
	int ret = 0;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	ret = inode_newsize_ok(inode, (len + offset));
	if (ret)
		return ret;

	f2fs_balance_fs(sbi);

	if (f2fs_has_inline_data(inode)) {
		ret = f2fs_convert_inline_inode(inode);
		if (ret)
			return ret;
	}

	ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
	if (ret)
		return ret;

	truncate_pagecache_range(inode, offset, offset + len - 1);

	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;

	off_start = offset & (PAGE_CACHE_SIZE - 1);
	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);

	if (pg_start == pg_end) {
		ret = fill_zero(inode, pg_start, off_start,
						off_end - off_start);
		if (ret)
			return ret;

		new_size = max_t(loff_t, new_size, offset + len);
	} else {
		if (off_start) {
			ret = fill_zero(inode, pg_start++, off_start,
						PAGE_CACHE_SIZE - off_start);
			if (ret)
				return ret;

			new_size = max_t(loff_t, new_size,
						pg_start << PAGE_CACHE_SHIFT);
		}

		for (index = pg_start; index < pg_end; index++) {
			struct dnode_of_data dn;
			struct page *ipage;

			f2fs_lock_op(sbi);

			ipage = get_node_page(sbi, inode->i_ino);
			if (IS_ERR(ipage)) {
				ret = PTR_ERR(ipage);
				f2fs_unlock_op(sbi);
				goto out;
			}

			set_new_dnode(&dn, inode, ipage, NULL, 0);
			ret = f2fs_reserve_block(&dn, index);
			if (ret) {
				f2fs_unlock_op(sbi);
				goto out;
			}

			if (dn.data_blkaddr != NEW_ADDR) {
				invalidate_blocks(sbi, dn.data_blkaddr);

				dn.data_blkaddr = NEW_ADDR;
				set_data_blkaddr(&dn);

				dn.data_blkaddr = NULL_ADDR;
				f2fs_update_extent_cache(&dn);
			}
			f2fs_put_dnode(&dn);
			f2fs_unlock_op(sbi);

			new_size = max_t(loff_t, new_size,
					(index + 1) << PAGE_CACHE_SHIFT);
		}

		if (off_end) {
			ret = fill_zero(inode, pg_end, 0, off_end);
			if (ret)
				goto out;

			new_size = max_t(loff_t, new_size, offset + len);
		}
	}

out:
	if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
		i_size_write(inode, new_size);
		mark_inode_dirty(inode);
		update_inode_page(inode);
	}

	return ret;
}
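
The page-aligned head/body/tail split used above can be demonstrated standalone. A sketch assuming 4 KiB pages (what PAGE_CACHE_SIZE is on most configurations):

/* Sketch of splitting [offset, offset + len) into head, whole pages, tail. */
#include <stdio.h>

#define PG_SHIFT	12
#define PG_SIZE		(1ULL << PG_SHIFT)

int main(void)
{
	unsigned long long offset = 5000, len = 20000;
	unsigned long long pg_start = offset >> PG_SHIFT;
	unsigned long long pg_end = (offset + len) >> PG_SHIFT;
	unsigned long long off_start = offset & (PG_SIZE - 1);
	unsigned long long off_end = (offset + len) & (PG_SIZE - 1);

	if (pg_start == pg_end) {
		printf("zero page %llu, bytes [%llu, %llu)\n",
		       pg_start, off_start, off_end);
	} else {
		if (off_start)
			printf("zero head: page %llu from byte %llu\n",
			       pg_start, off_start);
		printf("reset whole pages [%llu, %llu)\n",
		       pg_start + !!off_start, pg_end);
		if (off_end)
			printf("zero tail: page %llu up to byte %llu\n",
			       pg_end, off_end);
	}
	return 0;
}
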
Example #8
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos,
	size_t			ocount)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	size_t			count = ocount;
	int			unaligned_io = 0;
	int			iolock;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((pos | count) & target->bt_logical_sectormask)
		return -XFS_ERROR(EINVAL);

	/* "unaligned" here means not aligned to a filesystem block */
	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
		unaligned_io = 1;

	/*
	 * We don't need to take an exclusive lock unless the page cache needs
	 * to be invalidated or unaligned IO is being executed. We don't need to
	 * consider the EOF extension case here because
	 * xfs_file_aio_write_checks() will relock the inode as necessary for
	 * EOF zeroing cases and fill out the new inode size as appropriate.
	 */
	if (unaligned_io || mapping->nrpages)
		iolock = XFS_IOLOCK_EXCL;
	else
		iolock = XFS_IOLOCK_SHARED;
	xfs_rw_ilock(ip, iolock);

	/*
	 * Recheck if there are cached pages that need invalidate after we got
	 * the iolock to protect against other threads adding new pages while
	 * we were waiting for the iolock.
	 */
	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
	if (ret)
		goto out;

	if (mapping->nrpages) {
		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						    pos, -1);
		if (ret)
			goto out;
		truncate_pagecache_range(VFS_I(ip), pos, -1);
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to flush cached pages
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
	ret = generic_file_direct_write(iocb, iovp,
			&nr_segs, pos, &iocb->ki_pos, count, ocount);

out:
	xfs_rw_iunlock(ip, iolock);

	/* No fallback to buffered IO on errors for XFS. */
	ASSERT(ret < 0 || ret == count);
	return ret;
}
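
There is no atomic shared-to-exclusive upgrade for the IOLOCK, hence the drop/relock/re-check sequence above. A userspace sketch of the same pattern with a POSIX rwlock (all names illustrative; build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static int nrpages = 1;			/* stand-in for mapping->nrpages */

static void dio_write_locked(void)
{
	bool exclusive = false;

	pthread_rwlock_rdlock(&iolock);
	if (nrpages) {
		/* page cache must be invalidated: upgrade the lock */
		pthread_rwlock_unlock(&iolock);
		pthread_rwlock_wrlock(&iolock);
		exclusive = true;
	}
	/* re-check: another thread may have raced in during the gap */
	if (nrpages) {
		printf("flush and invalidate page cache\n");
		nrpages = 0;
	}
	printf("issue direct IO under %s lock\n",
	       exclusive ? "exclusive" : "shared");
	pthread_rwlock_unlock(&iolock);
}

int main(void)
{
	dio_write_locked();
	return 0;
}
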
Example #9
STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = 0;
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
	if (ret < 0)
		return ret;

	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		/* DIO must be aligned to device logical sector size */
		if ((pos | size) & target->bt_logical_sectormask) {
			if (pos == i_size_read(inode))
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}

	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock
	 * for direct IO, we effectively serialise all new concurrent
	 * read IO to this file and block it behind IO that is currently in
	 * progress because IO in progress holds the IO lock shared. We only
	 * need to hold the lock exclusive to blow away the page cache, so
	 * only take lock exclusively if the page cache needs invalidation.
	 * This allows the normal direct IO case of no page cache pages to
	 * proceed concurrently without serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		if (inode->i_mapping->nrpages) {
			ret = -filemap_write_and_wait_range(
							VFS_I(ip)->i_mapping,
							pos, -1);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}
			truncate_pagecache_range(VFS_I(ip), pos, -1);
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(ip, size, pos, ioflags);

	ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}
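
The sector-alignment test is a single mask operation because the logical sector size is a power of two; OR-ing position and size checks both at once. A sketch:

/* Sketch of the DIO alignment test above. */
#include <stdbool.h>
#include <stdio.h>

static bool dio_aligned(unsigned long long pos, unsigned long long count,
			unsigned int sector_size)
{
	unsigned long long mask = sector_size - 1;	/* power of two */

	return ((pos | count) & mask) == 0;
}

int main(void)
{
	printf("%d\n", dio_aligned(4096, 8192, 512));	/* 1: aligned */
	printf("%d\n", dio_aligned(100, 512, 512));	/* 0: bad offset */
	return 0;
}
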
Example #10
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
				 loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(filep);
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs42_falloc_args args = {
		.falloc_fh	= NFS_FH(inode),
		.falloc_offset	= offset,
		.falloc_length	= len,
		.falloc_bitmask	= server->cache_consistency_bitmask,
	};
	struct nfs42_falloc_res res = {
		.falloc_server	= server,
	};
	int status;

	msg->rpc_argp = &args;
	msg->rpc_resp = &res;

	status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
	if (status)
		return status;

	res.falloc_fattr = nfs_alloc_fattr();
	if (!res.falloc_fattr)
		return -ENOMEM;

	status = nfs4_call_sync(server->client, server, msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == 0)
		status = nfs_post_op_update_inode(inode, res.falloc_fattr);

	kfree(res.falloc_fattr);
	return status;
}

static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
				loff_t offset, loff_t len)
{
	struct nfs_server *server = NFS_SERVER(file_inode(filep));
	struct nfs4_exception exception = { };
	int err;

	do {
		err = _nfs42_proc_fallocate(msg, filep, offset, len);
		if (err == -ENOTSUPP)
			return -EOPNOTSUPP;
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	return err;
}

int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
		return -EOPNOTSUPP;

	mutex_lock(&inode->i_mutex);

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;

	mutex_unlock(&inode->i_mutex);
	return err;
}

int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
		return -EOPNOTSUPP;

	nfs_wb_all(inode);
	mutex_lock(&inode->i_mutex);

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == 0)
		truncate_pagecache_range(inode, offset, (offset + len) - 1);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;

	mutex_unlock(&inode->i_mutex);
	return err;
}
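
For reference, the userspace path that ends up in nfs42_proc_deallocate() on an NFSv4.2 mount is fallocate(2) with FALLOC_FL_PUNCH_HOLE. A minimal demo (file name and sizes are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 1 << 20) < 0)
		return 1;
	/* PUNCH_HOLE must be combined with KEEP_SIZE */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 8192) < 0) {
		perror("fallocate");
		return 1;
	}
	close(fd);
	return 0;
}
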

static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
	struct inode *inode = file_inode(filep);
	struct nfs42_seek_args args = {
		.sa_fh		= NFS_FH(inode),
		.sa_offset	= offset,
		.sa_what	= (whence == SEEK_HOLE) ?
					NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
	};
	struct nfs42_seek_res res;
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
		.rpc_argp = &args,
		.rpc_resp = &res,
	};
	struct nfs_server *server = NFS_SERVER(inode);
	int status;

	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
		return -ENOTSUPP;

	status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
	if (status)
		return status;

	nfs_wb_all(inode);
	status = nfs4_call_sync(server->client, server, &msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == -ENOTSUPP)
		server->caps &= ~NFS_CAP_SEEK;
	if (status)
		return status;

	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}

loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
	struct nfs_server *server = NFS_SERVER(file_inode(filep));
	struct nfs4_exception exception = { };
	loff_t err;

	do {
		err = _nfs42_proc_llseek(filep, offset, whence);
		if (err >= 0)
			break;
		if (err == -ENOTSUPP)
			return -EOPNOTSUPP;
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	return err;
}
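
The userspace interface to the SEEK operation is lseek(2) with SEEK_HOLE or SEEK_DATA. A minimal demo (file name is arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "demo.dat", O_RDONLY);
	off_t hole;

	if (fd < 0)
		return 1;
	hole = lseek(fd, 0, SEEK_HOLE);
	if (hole < 0)
		perror("lseek");	/* e.g. the server lacks SEEK */
	else
		printf("first hole at byte %lld\n", (long long)hole);
	close(fd);
	return 0;
}
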


static void
nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct nfs_server *server = NFS_SERVER(data->args.inode);

	nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
			     &data->res.seq_res, task);
}

static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;

	if (!nfs4_sequence_done(task, &data->res.seq_res))
		return;

	switch (task->tk_status) {
	case 0:
		break;
	case -ENOTSUPP:
	case -EOPNOTSUPP:
		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
	default:
		dprintk("%s server returns %d\n", __func__, task->tk_status);
	}
}

static void
nfs42_layoutstat_release(void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct nfs_server *nfss = NFS_SERVER(data->args.inode);

	if (nfss->pnfs_curr_ld->cleanup_layoutstats)
		nfss->pnfs_curr_ld->cleanup_layoutstats(data);

	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
	smp_mb__before_atomic();
	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
	smp_mb__after_atomic();
	nfs_iput_and_deactive(data->inode);
	kfree(data->args.devinfo);
	kfree(data);
}

static const struct rpc_call_ops nfs42_layoutstat_ops = {
	.rpc_call_prepare = nfs42_layoutstat_prepare,
	.rpc_call_done = nfs42_layoutstat_done,
	.rpc_release = nfs42_layoutstat_release,
};

int nfs42_proc_layoutstats_generic(struct nfs_server *server,
				   struct nfs42_layoutstat_data *data)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
		.rpc_argp = &data->args,
		.rpc_resp = &data->res,
	};
	struct rpc_task_setup task_setup = {
		.rpc_client = server->client,
		.rpc_message = &msg,
		.callback_ops = &nfs42_layoutstat_ops,
		.callback_data = data,
		.flags = RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	data->inode = nfs_igrab_and_active(data->args.inode);
	if (!data->inode) {
		nfs42_layoutstat_release(data);
		return -EAGAIN;
	}
	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
	task = rpc_run_task(&task_setup);
	if (IS_ERR(task))
		return PTR_ERR(task);
	return 0;
}

static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
			     struct file *dst_f, loff_t src_offset,
			     loff_t dst_offset, loff_t count)
{
	struct inode *src_inode = file_inode(src_f);
	struct inode *dst_inode = file_inode(dst_f);
	struct nfs_server *server = NFS_SERVER(dst_inode);
	struct nfs42_clone_args args = {
		.src_fh = NFS_FH(src_inode),
		.dst_fh = NFS_FH(dst_inode),
		.src_offset = src_offset,
		.dst_offset = dst_offset,
		.count = count,
		.dst_bitmask = server->cache_consistency_bitmask,
	};
	struct nfs42_clone_res res = {
		.server	= server,
	};
	int status;

	msg->rpc_argp = &args;
	msg->rpc_resp = &res;

	status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
	if (status)
		return status;

	status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
	if (status)
		return status;

	res.dst_fattr = nfs_alloc_fattr();
	if (!res.dst_fattr)
		return -ENOMEM;

	status = nfs4_call_sync(server->client, server, msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == 0)
		status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);

	kfree(res.dst_fattr);
	return status;
}

int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
		     loff_t src_offset, loff_t dst_offset, loff_t count)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
	};
	struct inode *inode = file_inode(src_f);
	struct nfs_server *server = NFS_SERVER(file_inode(src_f));
	struct nfs4_exception exception = { };
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_CLONE))
		return -EOPNOTSUPP;

	do {
		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
					dst_offset, count);
		if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
			NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
			return -EOPNOTSUPP;
		}
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	return err;
}
Example #11
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
		struct nfs_lock_context *lock, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(filep);
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs42_falloc_args args = {
		.falloc_fh	= NFS_FH(inode),
		.falloc_offset	= offset,
		.falloc_length	= len,
		.falloc_bitmask	= server->cache_consistency_bitmask,
	};
	struct nfs42_falloc_res res = {
		.falloc_server	= server,
	};
	int status;

	msg->rpc_argp = &args;
	msg->rpc_resp = &res;

	status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
			lock, FMODE_WRITE);
	if (status)
		return status;

	res.falloc_fattr = nfs_alloc_fattr();
	if (!res.falloc_fattr)
		return -ENOMEM;

	status = nfs4_call_sync(server->client, server, msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == 0)
		status = nfs_post_op_update_inode(inode, res.falloc_fattr);

	kfree(res.falloc_fattr);
	return status;
}

static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
				loff_t offset, loff_t len)
{
	struct nfs_server *server = NFS_SERVER(file_inode(filep));
	struct nfs4_exception exception = { };
	struct nfs_lock_context *lock;
	int err;

	lock = nfs_get_lock_context(nfs_file_open_context(filep));
	if (IS_ERR(lock))
		return PTR_ERR(lock);

	exception.inode = file_inode(filep);
	exception.state = lock->open_context->state;

	do {
		err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
		if (err == -ENOTSUPP) {
			err = -EOPNOTSUPP;
			break;
		}
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	nfs_put_lock_context(lock);
	return err;
}

int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;

	inode_unlock(inode);
	return err;
}

int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
		return -EOPNOTSUPP;

	inode_lock(inode);
	err = nfs_sync_inode(inode);
	if (err)
		goto out_unlock;

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == 0)
		truncate_pagecache_range(inode, offset, (offset + len) - 1);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
out_unlock:
	inode_unlock(inode);
	return err;
}

static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
				struct nfs_lock_context *src_lock,
				struct file *dst, loff_t pos_dst,
				struct nfs_lock_context *dst_lock,
				size_t count)
{
	struct nfs42_copy_args args = {
		.src_fh		= NFS_FH(file_inode(src)),
		.src_pos	= pos_src,
		.dst_fh		= NFS_FH(file_inode(dst)),
		.dst_pos	= pos_dst,
		.count		= count,
	};
	struct nfs42_copy_res res;
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
		.rpc_argp = &args,
		.rpc_resp = &res,
	};
	struct inode *dst_inode = file_inode(dst);
	struct nfs_server *server = NFS_SERVER(dst_inode);
	int status;

	status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
				     src_lock, FMODE_READ);
	if (status)
		return status;

	status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
			pos_src, pos_src + (loff_t)count - 1);
	if (status)
		return status;

	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
				     dst_lock, FMODE_WRITE);
	if (status)
		return status;

	status = nfs_sync_inode(dst_inode);
	if (status)
		return status;

	status = nfs4_call_sync(server->client, server, &msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == -ENOTSUPP)
		server->caps &= ~NFS_CAP_COPY;
	if (status)
		return status;

	if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
		status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
		if (status)
			return status;
	}

	truncate_pagecache_range(dst_inode, pos_dst,
				 pos_dst + res.write_res.count);

	return res.write_res.count;
}

ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
			struct file *dst, loff_t pos_dst,
			size_t count)
{
	struct nfs_server *server = NFS_SERVER(file_inode(dst));
	struct nfs_lock_context *src_lock;
	struct nfs_lock_context *dst_lock;
	struct nfs4_exception src_exception = { };
	struct nfs4_exception dst_exception = { };
	ssize_t err, err2;

	if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
		return -EOPNOTSUPP;

	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
	if (IS_ERR(src_lock))
		return PTR_ERR(src_lock);

	src_exception.inode = file_inode(src);
	src_exception.state = src_lock->open_context->state;

	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
	if (IS_ERR(dst_lock)) {
		err = PTR_ERR(dst_lock);
		goto out_put_src_lock;
	}

	dst_exception.inode = file_inode(dst);
	dst_exception.state = dst_lock->open_context->state;

	do {
		inode_lock(file_inode(dst));
		err = _nfs42_proc_copy(src, pos_src, src_lock,
				       dst, pos_dst, dst_lock, count);
		inode_unlock(file_inode(dst));

		if (err == -ENOTSUPP) {
			err = -EOPNOTSUPP;
			break;
		}

		err2 = nfs4_handle_exception(server, err, &src_exception);
		err  = nfs4_handle_exception(server, err, &dst_exception);
		if (!err)
			err = err2;
	} while (src_exception.retry || dst_exception.retry);

	nfs_put_lock_context(dst_lock);
out_put_src_lock:
	nfs_put_lock_context(src_lock);
	return err;
}
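
Userspace reaches this COPY path through copy_file_range(2) (glibc 2.27+). A minimal demo (file names are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int src = open("src.dat", O_RDONLY);
	int dst = open("dst.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	loff_t off_in = 0, off_out = 0;
	ssize_t n;

	if (src < 0 || dst < 0)
		return 1;
	n = copy_file_range(src, &off_in, dst, &off_out, 65536, 0);
	if (n < 0)
		perror("copy_file_range");
	else
		printf("copied %zd bytes\n", n);
	close(src);
	close(dst);
	return 0;
}
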

static loff_t _nfs42_proc_llseek(struct file *filep,
		struct nfs_lock_context *lock, loff_t offset, int whence)
{
	struct inode *inode = file_inode(filep);
	struct nfs42_seek_args args = {
		.sa_fh		= NFS_FH(inode),
		.sa_offset	= offset,
		.sa_what	= (whence == SEEK_HOLE) ?
					NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
	};
	struct nfs42_seek_res res;
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
		.rpc_argp = &args,
		.rpc_resp = &res,
	};
	struct nfs_server *server = NFS_SERVER(inode);
	int status;

	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
		return -ENOTSUPP;

	status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
			lock, FMODE_READ);
	if (status)
		return status;

	status = nfs_filemap_write_and_wait_range(inode->i_mapping,
			offset, LLONG_MAX);
	if (status)
		return status;

	status = nfs4_call_sync(server->client, server, &msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == -ENOTSUPP)
		server->caps &= ~NFS_CAP_SEEK;
	if (status)
		return status;

	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}

loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
	struct nfs_server *server = NFS_SERVER(file_inode(filep));
	struct nfs4_exception exception = { };
	struct nfs_lock_context *lock;
	loff_t err;

	lock = nfs_get_lock_context(nfs_file_open_context(filep));
	if (IS_ERR(lock))
		return PTR_ERR(lock);

	exception.inode = file_inode(filep);
	exception.state = lock->open_context->state;

	do {
		err = _nfs42_proc_llseek(filep, lock, offset, whence);
		if (err >= 0)
			break;
		if (err == -ENOTSUPP) {
			err = -EOPNOTSUPP;
			break;
		}
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	nfs_put_lock_context(lock);
	return err;
}


static void
nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct inode *inode = data->inode;
	struct nfs_server *server = NFS_SERVER(inode);
	struct pnfs_layout_hdr *lo;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (!pnfs_layout_is_valid(lo)) {
		spin_unlock(&inode->i_lock);
		rpc_exit(task, 0);
		return;
	}
	nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
	spin_unlock(&inode->i_lock);
	nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
			     &data->res.seq_res, task);
}

static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct inode *inode = data->inode;
	struct pnfs_layout_hdr *lo;

	if (!nfs4_sequence_done(task, &data->res.seq_res))
		return;

	switch (task->tk_status) {
	case 0:
		break;
	case -NFS4ERR_EXPIRED:
	case -NFS4ERR_ADMIN_REVOKED:
	case -NFS4ERR_DELEG_REVOKED:
	case -NFS4ERR_STALE_STATEID:
	case -NFS4ERR_BAD_STATEID:
		spin_lock(&inode->i_lock);
		lo = NFS_I(inode)->layout;
		if (pnfs_layout_is_valid(lo) &&
		    nfs4_stateid_match(&data->args.stateid,
					     &lo->plh_stateid)) {
			LIST_HEAD(head);

			/*
			 * Mark the bad layout state as invalid, then retry
			 * with the current stateid.
			 */
			pnfs_mark_layout_stateid_invalid(lo, &head);
			spin_unlock(&inode->i_lock);
			pnfs_free_lseg_list(&head);
		} else
			spin_unlock(&inode->i_lock);
		break;
	case -NFS4ERR_OLD_STATEID:
		spin_lock(&inode->i_lock);
		lo = NFS_I(inode)->layout;
		if (pnfs_layout_is_valid(lo) &&
		    nfs4_stateid_match_other(&data->args.stateid,
					&lo->plh_stateid)) {
			/* Do we need to delay before resending? */
			if (!nfs4_stateid_is_newer(&lo->plh_stateid,
						&data->args.stateid))
				rpc_delay(task, HZ);
			rpc_restart_call_prepare(task);
		}
		spin_unlock(&inode->i_lock);
		break;
	case -ENOTSUPP:
	case -EOPNOTSUPP:
		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
	}

	dprintk("%s server returns %d\n", __func__, task->tk_status);
}

static void
nfs42_layoutstat_release(void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct nfs_server *nfss = NFS_SERVER(data->args.inode);

	if (nfss->pnfs_curr_ld->cleanup_layoutstats)
		nfss->pnfs_curr_ld->cleanup_layoutstats(data);

	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
	smp_mb__before_atomic();
	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
	smp_mb__after_atomic();
	nfs_iput_and_deactive(data->inode);
	kfree(data->args.devinfo);
	kfree(data);
}

static const struct rpc_call_ops nfs42_layoutstat_ops = {
	.rpc_call_prepare = nfs42_layoutstat_prepare,
	.rpc_call_done = nfs42_layoutstat_done,
	.rpc_release = nfs42_layoutstat_release,
};

int nfs42_proc_layoutstats_generic(struct nfs_server *server,
				   struct nfs42_layoutstat_data *data)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
		.rpc_argp = &data->args,
		.rpc_resp = &data->res,
	};
	struct rpc_task_setup task_setup = {
		.rpc_client = server->client,
		.rpc_message = &msg,
		.callback_ops = &nfs42_layoutstat_ops,
		.callback_data = data,
		.flags = RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	data->inode = nfs_igrab_and_active(data->args.inode);
	if (!data->inode) {
		nfs42_layoutstat_release(data);
		return -EAGAIN;
	}
	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
	task = rpc_run_task(&task_setup);
	if (IS_ERR(task))
		return PTR_ERR(task);
	rpc_put_task(task);
	return 0;
}

static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
		struct file *dst_f, struct nfs_lock_context *src_lock,
		struct nfs_lock_context *dst_lock, loff_t src_offset,
		loff_t dst_offset, loff_t count)
{
	struct inode *src_inode = file_inode(src_f);
	struct inode *dst_inode = file_inode(dst_f);
	struct nfs_server *server = NFS_SERVER(dst_inode);
	struct nfs42_clone_args args = {
		.src_fh = NFS_FH(src_inode),
		.dst_fh = NFS_FH(dst_inode),
		.src_offset = src_offset,
		.dst_offset = dst_offset,
		.count = count,
		.dst_bitmask = server->cache_consistency_bitmask,
	};
	struct nfs42_clone_res res = {
		.server	= server,
	};
	int status;

	msg->rpc_argp = &args;
	msg->rpc_resp = &res;

	status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
			src_lock, FMODE_READ);
	if (status)
		return status;

	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
			dst_lock, FMODE_WRITE);
	if (status)
		return status;

	res.dst_fattr = nfs_alloc_fattr();
	if (!res.dst_fattr)
		return -ENOMEM;

	status = nfs4_call_sync(server->client, server, msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == 0)
		status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);

	kfree(res.dst_fattr);
	return status;
}

int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
		     loff_t src_offset, loff_t dst_offset, loff_t count)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
	};
	struct inode *inode = file_inode(src_f);
	struct nfs_server *server = NFS_SERVER(file_inode(src_f));
	struct nfs_lock_context *src_lock;
	struct nfs_lock_context *dst_lock;
	struct nfs4_exception src_exception = { };
	struct nfs4_exception dst_exception = { };
	int err, err2;

	if (!nfs_server_capable(inode, NFS_CAP_CLONE))
		return -EOPNOTSUPP;

	src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
	if (IS_ERR(src_lock))
		return PTR_ERR(src_lock);

	src_exception.inode = file_inode(src_f);
	src_exception.state = src_lock->open_context->state;

	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
	if (IS_ERR(dst_lock)) {
		err = PTR_ERR(dst_lock);
		goto out_put_src_lock;
	}

	dst_exception.inode = file_inode(dst_f);
	dst_exception.state = dst_lock->open_context->state;

	do {
		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
					src_offset, dst_offset, count);
		if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
			NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
			err = -EOPNOTSUPP;
			break;
		}

		err2 = nfs4_handle_exception(server, err, &src_exception);
		err = nfs4_handle_exception(server, err, &dst_exception);
		if (!err)
			err = err2;
	} while (src_exception.retry || dst_exception.retry);

	nfs_put_lock_context(dst_lock);
out_put_src_lock:
	nfs_put_lock_context(src_lock);
	return err;
}
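
Userspace reaches the CLONE operation through the FICLONERANGE ioctl (see ioctl_ficlonerange(2)); offsets and lengths must be block aligned, and src_length == 0 clones through the end of the source file. A minimal demo (file names are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int src = open("src.dat", O_RDONLY);
	int dst = open("dst.dat", O_WRONLY | O_CREAT, 0644);
	struct file_clone_range arg = {
		.src_offset = 0,
		.src_length = 0,	/* 0 == clone to source EOF */
		.dest_offset = 0,
	};

	if (src < 0 || dst < 0)
		return 1;
	arg.src_fd = src;
	if (ioctl(dst, FICLONERANGE, &arg) < 0)
		perror("FICLONERANGE");
	close(src);
	close(dst);
	return 0;
}
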
Example #12
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
				 loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(filep);
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs42_falloc_args args = {
		.falloc_fh	= NFS_FH(inode),
		.falloc_offset	= offset,
		.falloc_length	= len,
		.falloc_bitmask	= server->cache_consistency_bitmask,
	};
	struct nfs42_falloc_res res = {
		.falloc_server	= server,
	};
	int status;

	msg->rpc_argp = &args;
	msg->rpc_resp = &res;

	status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
	if (status)
		return status;

	res.falloc_fattr = nfs_alloc_fattr();
	if (!res.falloc_fattr)
		return -ENOMEM;

	status = nfs4_call_sync(server->client, server, msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == 0)
		status = nfs_post_op_update_inode(inode, res.falloc_fattr);

	kfree(res.falloc_fattr);
	return status;
}

static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
				loff_t offset, loff_t len)
{
	struct nfs_server *server = NFS_SERVER(file_inode(filep));
	struct nfs4_exception exception = { };
	int err;

	do {
		err = _nfs42_proc_fallocate(msg, filep, offset, len);
		if (err == -ENOTSUPP)
			return -EOPNOTSUPP;
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);

	return err;
}

int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
		return -EOPNOTSUPP;

	mutex_lock(&inode->i_mutex);

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;

	mutex_unlock(&inode->i_mutex);
	return err;
}

int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
	};
	struct inode *inode = file_inode(filep);
	int err;

	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
		return -EOPNOTSUPP;

	nfs_wb_all(inode);
	mutex_lock(&inode->i_mutex);

	err = nfs42_proc_fallocate(&msg, filep, offset, len);
	if (err == 0)
		truncate_pagecache_range(inode, offset, (offset + len) - 1);
	if (err == -EOPNOTSUPP)
		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;

	mutex_unlock(&inode->i_mutex);
	return err;
}

loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
	struct inode *inode = file_inode(filep);
	struct nfs42_seek_args args = {
		.sa_fh		= NFS_FH(inode),
		.sa_offset	= offset,
		.sa_what	= (whence == SEEK_HOLE) ?
					NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
	};
	struct nfs42_seek_res res;
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
		.rpc_argp = &args,
		.rpc_resp = &res,
	};
	struct nfs_server *server = NFS_SERVER(inode);
	int status;

	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
		return -ENOTSUPP;

	status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
	if (status)
		return status;

	nfs_wb_all(inode);
	status = nfs4_call_sync(server->client, server, &msg,
				&args.seq_args, &res.seq_res, 0);
	if (status == -ENOTSUPP)
		server->caps &= ~NFS_CAP_SEEK;
	if (status)
		return status;

	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}