예제 #1
0
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolocked held either shared and exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct file		*file,
	loff_t			*pos,
	size_t			*count,
	int			*iolock)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			error = 0;

restart:
	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
	if (error)
		return error;

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 */
	if (*pos > i_size_read(inode)) {
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_rw_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_rw_ilock(ip, *iolock);
			goto restart;
		}
		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
		if (error)
			return error;
	}

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	return file_remove_suid(file);
}
예제 #2
0
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolocked held either shared and exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct file		*file,
	loff_t			*pos,
	size_t			*count,
	int			*iolock)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			error = 0;

	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
	if (error) {
		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
		return error;
	}

	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which involves
	 * dropping all locks and relocking to maintain correct locking order.
	 * If we do this, restart the function to ensure all checks and values
	 * are still valid.
	 */
	if (*pos > i_size_read(inode)) {
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
			goto restart;
		}
		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
	}
	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	return file_remove_suid(file);

}
예제 #3
0
/*
 * Common pre-write limit and setup checks.
 *
 * Returns with iolock held according to @iolock.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct file		*file,
	loff_t			*pos,
	size_t			*count,
	int			*iolock)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fsize_t		new_size;
	int			error = 0;

	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
	if (error) {
		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
		*iolock = 0;
		return error;
	}

	new_size = *pos + *count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 */
	if (*pos > ip->i_size)
		error = -xfs_zero_eof(ip, *pos, ip->i_size);

	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	return file_remove_suid(file);

}
예제 #4
0
/*
 * xfs_inval_cached_pages
 * 
 * This routine is responsible for keeping direct I/O and buffered I/O
 * somewhat coherent.  From here we make sure that we're at least
 * temporarily holding the inode I/O lock exclusively and then call
 * the page cache to flush and invalidate any cached pages.  If there
 * are no cached pages this routine will be very quick.
 */
void
xfs_inval_cached_pages(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	xfs_off_t	offset,
	int		write,
	int		relock)
{
	xfs_mount_t	*mp;

	if (!VN_CACHED(vp)) {
		return;
	}

	mp = io->io_mount;

	/*
	 * We need to get the I/O lock exclusively in order
	 * to safely invalidate pages and mappings.
	 */
	if (relock) {
		XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
		XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
	}

	/* Writing beyond EOF creates a hole that must be zeroed */
	if (write && (offset > XFS_SIZE(mp, io))) {
		xfs_fsize_t	isize;

		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
		isize = XFS_SIZE(mp, io);
		if (offset > isize) {
			xfs_zero_eof(vp, io, offset, isize, offset);
		}
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
	}

	xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
	if (relock) {
		XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
	}
}
STATIC ssize_t
xfs_file_aio_write_checks(
    struct file		*file,
    loff_t			*pos,
    size_t			*count,
    int			*iolock)
{
    struct inode		*inode = file->f_mapping->host;
    struct xfs_inode	*ip = XFS_I(inode);
    int			error = 0;

    xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
    error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
    if (error) {
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
    }

    if (*pos > i_size_read(inode)) {
        if (*iolock == XFS_IOLOCK_SHARED) {
            xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
            *iolock = XFS_IOLOCK_EXCL;
            xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
            goto restart;
        }
        error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
    }
    xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
    if (error)
        return error;

    if (likely(!(file->f_mode & FMODE_NOCMTIME)))
        file_update_time(file);

    return file_remove_suid(file);

}
예제 #6
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t      *bdp,
	struct file	*file,
	const char	*buf,
	size_t		size,
	loff_t		*offset,
	int		ioflags,
	cred_t          *credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret;
	int		error = 0;
	xfs_fsize_t     isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_iocore_t    *io;
	vnode_t		*vp;
	int		iolock;
	int		eventsent = 0;
	vrwlock_t	locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	if (size == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return -EIO;
	}

	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	if (ioflags & IO_ISLOCKED)
		iolock = 0;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}
	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock) xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}


	if ((ssize_t) size < 0) {
		ret = -EINVAL;
		goto error;
	}

	if (!access_ok(VERIFY_READ, buf, size)) {
		ret = -EINVAL;
		goto error;
	}

retry:
	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_inval_cached_pages(vp, io, *offset, 1, 1);
		xfs_rw_enter_trace(XFS_DIOWR_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_direct_write(file, buf, size, offset);
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_file_write(file, buf, size, offset);
	}

	if (unlikely(ioflags & IO_INVIS)) {
		/* generic_file_write updates the mtime/ctime but we need
		 * to undo that because this I/O was supposed to be
		 * invisible.
		 */
		struct inode	*inode = LINVFS_GET_IP(vp);
		inode->i_mtime = xip->i_d.di_mtime.t_sec;
		inode->i_ctime = xip->i_d.di_ctime.t_sec;
	} else {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;
	}

error:
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
			mark_inode_dirty_sync(inode);
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
			&& !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (ioflags & O_SYNC) */

	/*
	 * If we are coming from an nfsd thread then insert into the
	 * reference cache.
	 */

	if (!strcmp(current->comm, "nfsd"))
		xfs_refcache_insert(xip);

	/* Drop lock this way - the old refcache release is in here */
	if (iolock)
		xfs_rwunlock(bdp, locktype);

	return(ret);
}
예제 #7
0
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0, error = 0;
	int			ioflags = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	if (count == 0)
		return 0;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			return XFS_ERROR(-EINVAL);
		}

		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > ip->i_size) {
		error = xfs_zero_eof(ip, pos, ip->i_size);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */
	error = -file_remove_suid(file);
	if (unlikely(error))
		goto out_unlock_internal;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			error = xfs_flushinval_pages(ip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&nr_segs, pos, &iocb->ki_pos, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(ip, iolock);
			goto relock;
		}
	} else {
		int enospc = 0;
		ssize_t ret2 = 0;

write_retry:
		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
				pos, &iocb->ki_pos, count, ret);
		/*
		 * if we just got an ENOSPC, flush the inode now we
		 * aren't holding any page locks and retry *once*
		 */
		if (ret2 == -ENOSPC && !enospc) {
			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
			if (error)
				goto out_unlock_internal;
			enospc = 1;
			goto write_retry;
		}
		ret = ret2;
	}

	current->backing_dev_info = NULL;

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
		iocb->ki_pos = isize;

	if (iocb->ki_pos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (iocb->ki_pos > ip->i_size)
			ip->i_size = iocb->ki_pos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);
		if (error)
			goto out_unlock_internal;
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int error2;

		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error2 = filemap_write_and_wait_range(mapping, pos, end);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		error2 = -xfs_file_fsync(file, file->f_path.dentry,
					 (file->f_flags & __O_SYNC) ? 0 : 1);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occured.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
예제 #8
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_isem = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->pbr_smask) || (count & target->pbr_smask))
			return XFS_ERROR(-EINVAL);

		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_isem = 0;

		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_isem) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		down(&inode->i_sem);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_isem;
	}

	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_isem)
			dmflags |= DM_FLAGS_ISEM;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will overcopy xfs.
	 */
	if (!(ioflags & IO_INVIS)) {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		inode_update_time(inode, 1);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
					isize, pos + count);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_isem;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_isem) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			up(&inode->i_sem);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_isem = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_isem = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(iocb);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			goto out_unlock_isem;
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		goto retry;
	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
				if (error)
					goto out_unlock_internal;
			}
		}
	
		xfs_rwunlock(bdp, locktype);
		if (need_isem)
			up(&inode->i_sem);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_isem:
	if (need_isem)
		up(&inode->i_sem);
	return -error;
}
예제 #9
0
파일: xfs_iops.c 프로젝트: gxt/linux
/*
 * Truncate file.  Must have write permission and not be a directory.
 */
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags = 0;
	bool			did_zeroing = false;

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return -EROFS;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	error = inode_change_ok(inode, iattr);
	if (error)
		return error;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
	ASSERT(S_ISREG(inode->i_mode));
	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
		if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
			return 0;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	/*
	 * Wait for all direct I/O to complete.
	 */
	inode_dio_wait(inode);

	/*
	 * File data changes must be complete before we start the transaction to
	 * modify the inode.  This needs to be done before joining the inode to
	 * the transaction because the inode cannot be unlocked once it is a
	 * part of the transaction.
	 *
	 * Start with zeroing any data beyond EOF that we may expose on file
	 * extension, or zeroing out the rest of the block on a downward
	 * truncate.
	 */
	if (newsize > oldsize) {
		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
	} else {
		error = iomap_truncate_page(inode, newsize, &did_zeroing,
				&xfs_iomap_ops);
	}

	if (error)
		return error;

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem. Note that this includes any block zeroing we did above;
	 * otherwise those blocks may not be zeroed after a crash.
	 */
	if (did_zeroing ||
	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      ip->i_d.di_size, newsize);
		if (error)
			return error;
	}

	/*
	 * We've already locked out new page faults, so now we can safely remove
	 * pages from the page cache knowing they won't get refaulted until we
	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
	 * complete. The truncate_setsize() call also cleans partial EOF page
	 * PTEs on extending truncates and hence ensures sub-page block size
	 * filesystems are correctly handled, too.
	 *
	 * We have to do all the page cache truncate work outside the
	 * transaction context as the "lock" order is page lock->log space
	 * reservation as defined by extent allocation in the writeback path.
	 * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
	 * having already truncated the in-memory version of the file (i.e. made
	 * user visible changes). There's not much we can do about this, except
	 * to hope that the caller sees ENOMEM and retries the truncate
	 * operation.
	 */
	truncate_setsize(inode, newsize);

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error)
		return error;

	lock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize &&
	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_d.di_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_cancel;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);

		/* A truncate down always removes post-EOF blocks. */
		xfs_inode_clear_eofblocks_tag(ip);
	}

	if (iattr->ia_valid & ATTR_MODE)
		xfs_setattr_mode(ip, iattr);
	if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
		xfs_setattr_time(ip, iattr);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(mp, xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
예제 #10
0
/*
 * Common pre-write limit and setup checks.
 *
 * Returns with iolock held according to @iolock.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct file		*file,
	loff_t			*pos,
	size_t			*count,
	xfs_fsize_t		*new_sizep,
	int			*iolock)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fsize_t		new_size;
	int			error = 0;

	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
	*new_sizep = 0;
restart:
	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
	if (error) {
		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
		*iolock = 0;
		return error;
	}

	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write. There is no need to issue zeroing if another in-flght IO ends
	 * at or before this one If zeronig is needed and we are currently
	 * holding the iolock shared, we need to update it to exclusive which
	 * involves dropping all locks and relocking to maintain correct locking
	 * order. If we do this, restart the function to ensure all checks and
	 * values are still valid.
	 */
	if ((ip->i_new_size && *pos > ip->i_new_size) ||
	    (!ip->i_new_size && *pos > ip->i_size)) {
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
			goto restart;
		}
		error = -xfs_zero_eof(ip, *pos, ip->i_size);
	}

	/*
	 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
	 * We have already zeroed space beyond EOF (if necessary).  Only update
	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
	 */
	new_size = *pos + *count;
	if (new_size > ip->i_size) {
		if (new_size > ip->i_new_size)
			ip->i_new_size = new_size;
		*new_sizep = new_size;
	}

	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	return file_remove_suid(file);

}
/*
 * Truncate file.  Must have write permission and not be a directory.
 */
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			mask = iattr->ia_valid;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags;
	uint			commit_flags = 0;

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = -inode_change_ok(inode, iattr);
	if (error)
		return XFS_ERROR(error);

	ASSERT(S_ISREG(ip->i_d.di_mode));
	ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
			ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
			ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	lock_flags = XFS_ILOCK_EXCL;
	if (!(flags & XFS_ATTR_NOLOCK))
		lock_flags |= XFS_IOLOCK_EXCL;
	xfs_ilock(ip, lock_flags);

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (iattr->ia_size == 0 &&
	    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
		if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
			goto out_unlock;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		xfs_iunlock(ip, lock_flags);
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		goto out_unlock;

	/*
	 * Now we can make the changes.  Before we join the inode to the
	 * transaction, take care of the part of the truncation that must be
	 * done without the inode lock.  This needs to be done before joining
	 * the inode to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (iattr->ia_size > ip->i_size) {
		/*
		 * Do the first part of growing a file: zero any data in the
		 * last block that is beyond the old EOF.  We need to do this
		 * before the inode is joined to the transaction to modify
		 * i_size.
		 */
		error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
		if (error)
			goto out_unlock;
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	lock_flags &= ~XFS_ILOCK_EXCL;

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem.
	 *
	 * Only flush from the on disk size to the smaller of the in memory
	 * file size or the new size as that's the range we really care about
	 * here and prevents waiting for other data not within the range we
	 * care about here.
	 */
	if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
					XBF_ASYNC, FI_NONE);
		if (error)
			goto out_unlock;
	}

	/*
	 * Wait for all I/O to complete.
	 */
	xfs_ioend_wait(ip);

	error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
				     xfs_get_blocks);
	if (error)
		goto out_unlock;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
				 XFS_TRANS_PERM_LOG_RES,
				 XFS_ITRUNCATE_LOG_COUNT);
	if (error)
		goto out_trans_cancel;

	truncate_setsize(inode, iattr->ia_size);

	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
	lock_flags |= XFS_ILOCK_EXCL;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (iattr->ia_size != ip->i_size &&
	    (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		mask |= ATTR_CTIME | ATTR_MTIME;
	}

	if (iattr->ia_size > ip->i_size) {
		ip->i_d.di_size = iattr->ia_size;
		ip->i_size = iattr->ia_size;
	} else if (iattr->ia_size <= ip->i_size ||
		   (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
		error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
		if (error)
			goto out_trans_abort;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);
	}

	if (mask & ATTR_CTIME) {
		inode->i_ctime = iattr->ia_ctime;
		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
		ip->i_update_core = 1;
	}
	if (mask & ATTR_MTIME) {
		inode->i_mtime = iattr->ia_mtime;
		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
		ip->i_update_core = 1;
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_abort:
	commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
	xfs_trans_cancel(tp, commit_flags);
	goto out_unlock;
}
예제 #12
0
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			mask = iattr->ia_valid;
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags;
	uint			commit_flags = 0;

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = -inode_change_ok(inode, iattr);
	if (error)
		return XFS_ERROR(error);

	ASSERT(S_ISREG(ip->i_d.di_mode));
	ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
			ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
			ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	lock_flags = XFS_ILOCK_EXCL;
	if (!(flags & XFS_ATTR_NOLOCK))
		lock_flags |= XFS_IOLOCK_EXCL;
	xfs_ilock(ip, lock_flags);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
		if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
			goto out_unlock;

		xfs_iunlock(ip, lock_flags);
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		goto out_unlock;

	if (newsize > oldsize) {
		error = xfs_zero_eof(ip, newsize, oldsize);
		if (error)
			goto out_unlock;
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	lock_flags &= ~XFS_ILOCK_EXCL;

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem.
	 *
	 * Only flush from the on disk size to the smaller of the in memory
	 * file size or the new size as that's the range we really care about
	 * here and prevents waiting for other data not within the range we
	 * care about here.
	 */
	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
		error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
					FI_NONE);
		if (error)
			goto out_unlock;
	}

	inode_dio_wait(inode);

	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
	if (error)
		goto out_unlock;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
				 XFS_TRANS_PERM_LOG_RES,
				 XFS_ITRUNCATE_LOG_COUNT);
	if (error)
		goto out_trans_cancel;

	truncate_setsize(inode, newsize);

	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
	lock_flags |= XFS_ILOCK_EXCL;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip, 0);

	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		mask |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_d.di_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_abort;

		xfs_iflags_set(ip, XFS_ITRUNCATED);
	}

	if (mask & ATTR_CTIME) {
		inode->i_ctime = iattr->ia_ctime;
		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
	}
	if (mask & ATTR_MTIME) {
		inode->i_mtime = iattr->ia_mtime;
		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_abort:
	commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
	xfs_trans_cancel(tp, commit_flags);
	goto out_unlock;
}
예제 #13
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	bhv_vnode_t		*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	bhv_vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_i_mutex = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask))
			return XFS_ERROR(-EINVAL);

		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_i_mutex = 0;

		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_i_mutex) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		mutex_lock(&inode->i_mutex);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	if (likely(!(ioflags & IO_INVIS))) {
		file_update_time(file);
		xfs_ichgtime_fast(xip, inode,
				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_mutex;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_path.dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_i_mutex = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_i_mutex = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
		ret = wait_on_sync_kiocb(iocb);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			goto out_nounlocks;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		ret = 0;
		goto retry;
	}

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
		*offset = isize;

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		error = xfs_write_sync_logforce(mp, xip);
		if (error)
			goto out_unlock_internal;

		xfs_rwunlock(bdp, locktype);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
 out_nounlocks:
	return -error;
}
예제 #14
0
파일: xfs_frw.c 프로젝트: AhmadTux/freebsd
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t      *bdp,
	uio_t		*uio,
	int		ioflag,
	cred_t          *credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret = 0;
	int		error = 0;
	xfs_fsize_t     isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_fsize_t	size;
	xfs_iocore_t    *io;
	xfs_vnode_t	*vp;
	int		iolock;
	//int		eventsent = 0;
	vrwlock_t	locktype;
	xfs_off_t	offset_c;
	xfs_off_t	*offset;
	xfs_off_t	pos;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return EIO;
	}

	size = uio->uio_resid;
	pos = offset_c = uio->uio_offset;
	offset = &offset_c;

	if (unlikely(ioflag & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return EINVAL;
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		if (io->io_flags & XFS_IOCORE_RT)
			return EINVAL;
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	iolock = XFS_IOLOCK_EXCL;
	locktype = VRWLOCK_WRITE;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (ioflag & O_APPEND)
		*offset = isize;

//start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return EFBIG;
	}
	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

#ifdef RMC
	/* probably be a long time before if ever that we do dmapi */
	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock) xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}
#endif

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

#if 0
	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
	}
#endif


//retry:
	if (unlikely(ioflag & IO_ISDIRECT)) {

#ifdef RMC
		xfs_off_t	pos = *offset;
		struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
		struct inode    *inode = mapping->host;

		ret = precheck_file_write(file, inode, &size,  &pos);
		if (ret || size == 0)
			goto error;

		xfs_inval_cached_pages(vp, io, pos, 1, 1);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		/* mark_inode_dirty_sync(inode); - we do this later */

		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags);
		ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos);
		xfs_inval_cached_pages(vp, io, pos, 1, 1);
		if (ret > 0)
			*offset += ret;
#endif
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags);
		ret = xfs_write_file(xip,uio,ioflag);
	}

	xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);


//error:
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__);
#ifdef RMC
			struct inode	*inode = LINVFS_GET_IP(vp);
			i_size_write(inode, *offset);
			mark_inode_dirty_sync(inode);
#endif

			xip->i_d.di_size = *offset;
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
#if 0
//	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
#endif
	if (ioflag & IO_SYNC) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
			if (error)
				goto out_unlock_internal;
		}

		xfs_rwunlock(bdp, locktype);
		return ret;

	} /* (ioflags & O_SYNC) */

out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
#if 0
out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
#endif
 //out_nounlocks:
	return -error;
}
예제 #15
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	struct xfs_inode	*xip,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	mp = xip->i_mount;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != xip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(xip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return XFS_ERROR(-EINVAL);
		}

		if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > xip->i_size)
		xip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > xip->i_size) {
		error = xfs_zero_eof(xip, pos, xip->i_size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -file_remove_suid(file);
		if (unlikely(error)) {
			goto out_unlock_internal;
		}
	}

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			xfs_inval_cached_trace(xip, pos, -1,
					(pos & PAGE_CACHE_MASK), -1);
			error = xfs_flushinval_pages(xip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
		ret = wait_on_sync_kiocb(iocb);

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
		*offset = isize;

	if (*offset > xip->i_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_size)
			xip->i_size = *offset;
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(xip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
				DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(xip, iolock);
		if (error)
			goto out_unlock_internal;
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		int error2;

		xfs_iunlock(xip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error2 = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(xip, iolock);
		error2 = xfs_write_sync_logforce(mp, xip);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (xip->i_new_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		xip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occured.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (xip->i_d.di_size > xip->i_size)
			xip->i_d.di_size = xip->i_size;
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(xip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
예제 #16
0
파일: xfs_iops.c 프로젝트: luyanseu/linux
/*
 * Truncate file.  Must have write permission and not be a directory.
 */
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags = 0;
	uint			commit_flags = 0;

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = -inode_change_ok(inode, iattr);
	if (error)
		return XFS_ERROR(error);

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(S_ISREG(ip->i_d.di_mode));
	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
		if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
			return 0;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	/*
	 * Now we can make the changes.  Before we join the inode to the
	 * transaction, take care of the part of the truncation that must be
	 * done without the inode lock.  This needs to be done before joining
	 * the inode to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (newsize > oldsize) {
		/*
		 * Do the first part of growing a file: zero any data in the
		 * last block that is beyond the old EOF.  We need to do this
		 * before the inode is joined to the transaction to modify
		 * i_size.
		 */
		error = xfs_zero_eof(ip, newsize, oldsize);
		if (error)
			return error;
	}

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem.
	 *
	 * Only flush from the on disk size to the smaller of the in memory
	 * file size or the new size as that's the range we really care about
	 * here and prevents waiting for other data not within the range we
	 * care about here.
	 */
	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      ip->i_d.di_size, newsize);
		if (error)
			return error;
	}

	/*
	 * Wait for all direct I/O to complete.
	 */
	inode_dio_wait(inode);

	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
	if (error)
		return error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
	if (error)
		goto out_trans_cancel;

	truncate_setsize(inode, newsize);

	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
	lock_flags |= XFS_ILOCK_EXCL;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize &&
	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_d.di_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_abort;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);

		/* A truncate down always removes post-EOF blocks. */
		xfs_inode_clear_eofblocks_tag(ip);
	}

	if (iattr->ia_valid & ATTR_MODE)
		xfs_setattr_mode(ip, iattr);
	if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
		xfs_setattr_time(ip, iattr);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
out_unlock:
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_abort:
	commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
	xfs_trans_cancel(tp, commit_flags);
	goto out_unlock;
}
예제 #17
0
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolocked held either shared and exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	if (iocb->ki_pos > i_size_read(inode)) {
		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}
예제 #18
0
파일: xfs_lrw.c 프로젝트: xricson/knoppix
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		segs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	size_t			size = 0;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret;
	int			error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_fsize_t		n, limit;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address);
	xip = XFS_BHVTOI(bdp);

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return XFS_ERROR(-EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (size == 0)
		return 0;

	io = &(xip->i_iocore);
	mp = io->io_mount;

	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	if (ioflags & IO_ISDIRECT) {
		pb_target_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((*offset & target->pbr_smask) ||
		    (size & target->pbr_smask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}

	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      FILP_DELAY_FLAG(file), &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will overcopy xfs.
	 */
	if (size && !(ioflags & IO_INVIS))
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}

retry:
	if (ioflags & IO_ISDIRECT) {
		xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
	}

	ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;

	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	if (ret <= 0) {
		xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
			&& !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (ioflags & O_SYNC) */

	xfs_rwunlock(bdp, locktype);
	return(ret);
}