Пример #1
0
/*
 * Handle logging requirements of various synchronous types of write.
 */
int
xfs_write_sync_logforce(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip)
{
	int		error = 0;

	/*
	 * If we're treating this as O_DSYNC and we have not updated the
	 * size, force the log.
	 */
	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
	    !(ip->i_update_size)) {
		xfs_inode_log_item_t	*iip = ip->i_itemp;

		/*
		 * If an allocation transaction occurred
		 * without extending the size, then we have to force
		 * the log up the proper point to ensure that the
		 * allocation is permanent.  We can't count on
		 * the fact that buffered writes lock out direct I/O
		 * writes - the direct I/O write could have extended
		 * the size nontransactionally, then finished before
		 * we started.  xfs_write_file will think that the file
		 * didn't grow but the update isn't safe unless the
		 * size change is logged.
		 *
		 * Force the log if we've committed a transaction
		 * against the inode or if someone else has and
		 * the commit record hasn't gone to disk (e.g.
		 * the inode is pinned).  This guarantees that
		 * all changes affecting the inode are permanent
		 * when we return.
		 */
		if (iip && iip->ili_last_lsn) {
			xfs_log_force(mp, iip->ili_last_lsn,
					XFS_LOG_FORCE | XFS_LOG_SYNC);
		} else if (xfs_ipincount(ip) > 0) {
			xfs_log_force(mp, (xfs_lsn_t)0,
					XFS_LOG_FORCE | XFS_LOG_SYNC);
		}

	} else {
		xfs_trans_t	*tp;

		/*
		 * O_SYNC or O_DSYNC _with_ a size update are handled
		 * the same way.
		 *
		 * If the write was synchronous then we need to make
		 * sure that the inode modification time is permanent.
		 * We'll have updated the timestamp above, so here
		 * we use a synchronous transaction to log the inode.
		 * It's not fast, but it's necessary.
		 *
		 * If this a dsync write and the size got changed
		 * non-transactionally, then we need to ensure that
		 * the size change gets logged in a synchronous
		 * transaction.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
		if ((error = xfs_trans_reserve(tp, 0,
						XFS_SWRITE_LOG_RES(mp),
						0, 0, 0))) {
			/* Transaction reserve failed */
			xfs_trans_cancel(tp, 0);
		} else {
			/* Transaction reserve successful */
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			xfs_trans_ihold(tp, ip);
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
			xfs_trans_set_sync(tp);
			error = xfs_trans_commit(tp, 0, NULL);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}
	}

	return error;
}
Пример #2
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t      *bdp,
	struct file	*file,
	const char	*buf,
	size_t		size,
	loff_t		*offset,
	int		ioflags,
	cred_t          *credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret;
	int		error = 0;
	xfs_fsize_t     isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_iocore_t    *io;
	vnode_t		*vp;
	int		iolock;
	int		eventsent = 0;
	vrwlock_t	locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	if (size == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return -EIO;
	}

	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	if (ioflags & IO_ISLOCKED)
		iolock = 0;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}
	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock) xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}


	if ((ssize_t) size < 0) {
		ret = -EINVAL;
		goto error;
	}

	if (!access_ok(VERIFY_READ, buf, size)) {
		ret = -EINVAL;
		goto error;
	}

retry:
	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_inval_cached_pages(vp, io, *offset, 1, 1);
		xfs_rw_enter_trace(XFS_DIOWR_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_direct_write(file, buf, size, offset);
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_file_write(file, buf, size, offset);
	}

	if (unlikely(ioflags & IO_INVIS)) {
		/* generic_file_write updates the mtime/ctime but we need
		 * to undo that because this I/O was supposed to be
		 * invisible.
		 */
		struct inode	*inode = LINVFS_GET_IP(vp);
		inode->i_mtime = xip->i_d.di_mtime.t_sec;
		inode->i_ctime = xip->i_d.di_ctime.t_sec;
	} else {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;
	}

error:
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
			mark_inode_dirty_sync(inode);
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
			&& !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (ioflags & O_SYNC) */

	/*
	 * If we are coming from an nfsd thread then insert into the
	 * reference cache.
	 */

	if (!strcmp(current->comm, "nfsd"))
		xfs_refcache_insert(xip);

	/* Drop lock this way - the old refcache release is in here */
	if (iolock)
		xfs_rwunlock(bdp, locktype);

	return(ret);
}
Пример #3
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t      *bdp,
	uio_t		*uio,
	int		ioflag,
	cred_t          *credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret = 0;
	int		error = 0;
	xfs_fsize_t     isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_fsize_t	size;
	xfs_iocore_t    *io;
	xfs_vnode_t	*vp;
	int		iolock;
	//int		eventsent = 0;
	vrwlock_t	locktype;
	xfs_off_t	offset_c;
	xfs_off_t	*offset;
	xfs_off_t	pos;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return EIO;
	}

	size = uio->uio_resid;
	pos = offset_c = uio->uio_offset;
	offset = &offset_c;

	if (unlikely(ioflag & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return EINVAL;
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		if (io->io_flags & XFS_IOCORE_RT)
			return EINVAL;
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	iolock = XFS_IOLOCK_EXCL;
	locktype = VRWLOCK_WRITE;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (ioflag & O_APPEND)
		*offset = isize;

//start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return EFBIG;
	}
	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

#ifdef RMC
	/* probably be a long time before if ever that we do dmapi */
	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock) xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}
#endif

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

#if 0
	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
	}
#endif


//retry:
	if (unlikely(ioflag & IO_ISDIRECT)) {

#ifdef RMC
		xfs_off_t	pos = *offset;
		struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
		struct inode    *inode = mapping->host;

		ret = precheck_file_write(file, inode, &size,  &pos);
		if (ret || size == 0)
			goto error;

		xfs_inval_cached_pages(vp, io, pos, 1, 1);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		/* mark_inode_dirty_sync(inode); - we do this later */

		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags);
		ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos);
		xfs_inval_cached_pages(vp, io, pos, 1, 1);
		if (ret > 0)
			*offset += ret;
#endif
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags);
		ret = xfs_write_file(xip,uio,ioflag);
	}

	xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);


//error:
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__);
#ifdef RMC
			struct inode	*inode = LINVFS_GET_IP(vp);
			i_size_write(inode, *offset);
			mark_inode_dirty_sync(inode);
#endif

			xip->i_d.di_size = *offset;
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
#if 0
//	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
#endif
	if (ioflag & IO_SYNC) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
			if (error)
				goto out_unlock_internal;
		}

		xfs_rwunlock(bdp, locktype);
		return ret;

	} /* (ioflags & O_SYNC) */

out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
#if 0
out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
#endif
 //out_nounlocks:
	return -error;
}
Пример #4
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_isem = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->pbr_smask) || (count & target->pbr_smask))
			return XFS_ERROR(-EINVAL);

		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_isem = 0;

		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_isem) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		down(&inode->i_sem);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_isem;
	}

	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_isem)
			dmflags |= DM_FLAGS_ISEM;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will overcopy xfs.
	 */
	if (!(ioflags & IO_INVIS)) {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		inode_update_time(inode, 1);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
					isize, pos + count);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_isem;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_isem) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			up(&inode->i_sem);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_isem = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_isem = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(iocb);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			goto out_unlock_isem;
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		goto retry;
	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
				if (error)
					goto out_unlock_internal;
			}
		}
	
		xfs_rwunlock(bdp, locktype);
		if (need_isem)
			up(&inode->i_sem);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_isem:
	if (need_isem)
		up(&inode->i_sem);
	return -error;
}
Пример #5
0
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		segs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	size_t			size = 0;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret;
	int			error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_fsize_t		n, limit;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address);
	xip = XFS_BHVTOI(bdp);

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return XFS_ERROR(-EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (size == 0)
		return 0;

	io = &(xip->i_iocore);
	mp = io->io_mount;

	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	if (ioflags & IO_ISDIRECT) {
		pb_target_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((*offset & target->pbr_smask) ||
		    (size & target->pbr_smask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}

	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      FILP_DELAY_FLAG(file), &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reaquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will overcopy xfs.
	 */
	if (size && !(ioflags & IO_INVIS))
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}

retry:
	if (ioflags & IO_ISDIRECT) {
		xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
	}

	ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;

	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	if (ret <= 0) {
		xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
			&& !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (ioflags & O_SYNC) */

	xfs_rwunlock(bdp, locktype);
	return(ret);
}