Example no. 1
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t      *bdp,
	struct file	*file,
	const char	*buf,
	size_t		size,
	loff_t		*offset,
	int		ioflags,
	cred_t          *credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret;
	int		error = 0;
	xfs_fsize_t     isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_iocore_t    *io;
	vnode_t		*vp;
	int		iolock;
	int		eventsent = 0;
	vrwlock_t	locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	if (size == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return -EIO;
	}

	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	if (ioflags & IO_ISLOCKED)
		iolock = 0;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}
	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock)
				xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return -error;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}


	if ((ssize_t) size < 0) {
		ret = -EINVAL;
		goto error;
	}

	if (!access_ok(VERIFY_READ, buf, size)) {
		ret = -EINVAL;
		goto error;
	}

retry:
	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_inval_cached_pages(vp, io, *offset, 1, 1);
		xfs_rw_enter_trace(XFS_DIOWR_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_direct_write(file, buf, size, offset);
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_file_write(file, buf, size, offset);
	}

	if (unlikely(ioflags & IO_INVIS)) {
		/* generic_file_write updates the mtime/ctime but we need
		 * to undo that because this I/O was supposed to be
		 * invisible.
		 */
		struct inode	*inode = LINVFS_GET_IP(vp);
		inode->i_mtime = xip->i_d.di_mtime.t_sec;
		inode->i_ctime = xip->i_d.di_ctime.t_sec;
	} else {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;
	}

error:
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
			mark_inode_dirty_sync(inode);
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
			&& !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (file->f_flags & O_SYNC) || IS_SYNC() */

	/*
	 * If we are coming from an nfsd thread then insert into the
	 * reference cache.
	 */

	if (!strcmp(current->comm, "nfsd"))
		xfs_refcache_insert(xip);

	/* Drop lock this way - the old refcache release is in here */
	if (iolock)
		xfs_rwunlock(bdp, locktype);

	return ret;
}
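A minimal sketch, not from the XFS source (the helper name is hypothetical), of the double-checked update idiom used above to extend the on-disk file size: compare without the inode lock first, then re-check under XFS_ILOCK_EXCL so the lock is only taken, and the size only updated, when the write really extended the file.

static void
update_size_if_grown(
	xfs_inode_t	*xip,
	loff_t		new_off)
{
	/* unlocked fast path: most writes do not extend the file */
	if (new_off > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		/* re-check under the lock: a racing writer may have won */
		if (new_off > xip->i_d.di_size)
			xip->i_d.di_size = new_off;
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}
}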
Example no. 2
STATIC int
xfs_ioctl_setattr(
	xfs_inode_t		*ip,
	struct fsxattr		*fa,
	int			mask)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	unsigned int		lock_flags = 0;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*olddquot = NULL;
	int			code;

	xfs_itrace_entry(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);
	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * If disk quotas are on, we make sure that the dquots exist on disk,
	 * before we start any other transactions. Trying to do this later
	 * is messy. We don't care to take a readlock to look at the ids
	 * in inode here, because we can't hold it across the trans_reserve.
	 * If the IDs do change before we take the ilock, we're covered
	 * because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
					 ip->i_d.di_gid, fa->fsx_projid,
					 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
		if (code)
			return code;
	}

	/*
	 * For the other attributes, we acquire the inode lock and
	 * first do an error checking pass.
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
	code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (code)
		goto error_return;

	lock_flags = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_flags);

	/*
	 * CAP_FOWNER overrides the following restrictions:
	 *
	 * The user ID of the calling process must be equal
	 * to the file owner ID, except in cases where the
	 * CAP_FSETID capability is applicable.
	 */
	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
		code = XFS_ERROR(EPERM);
		goto error_return;
	}

	/*
	 * Do a quota reservation only if projid is actually going to change.
	 */
	if (mask & FSX_PROJID) {
		if (XFS_IS_QUOTA_RUNNING(mp) &&
		    XFS_IS_PQUOTA_ON(mp) &&
		    ip->i_d.di_projid != fa->fsx_projid) {
			ASSERT(tp);
			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
						capable(CAP_FOWNER) ?
						XFS_QMOPT_FORCE_RES : 0);
			if (code)	/* out of quota */
				goto error_return;
		}
	}

	if (mask & FSX_EXTSIZE) {
		/*
		 * Can't change extent size if any extents are allocated.
		 */
		if (ip->i_d.di_nextents &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
		     fa->fsx_extsize)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Extent size must be a multiple of the appropriate block
		 * size, if set at all.
		 */
		if (fa->fsx_extsize != 0) {
			xfs_extlen_t	size;

			if (XFS_IS_REALTIME_INODE(ip) ||
			    ((mask & FSX_XFLAGS) &&
			    (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
				size = mp->m_sb.sb_rextsize <<
				       mp->m_sb.sb_blocklog;
			} else {
				size = mp->m_sb.sb_blocksize;
			}

			if (fa->fsx_extsize % size) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
	}


	if (mask & FSX_XFLAGS) {
		/*
		 * Can't change realtime flag if any extents are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (XFS_IS_REALTIME_INODE(ip)) !=
		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * If realtime flag is set then must have realtime data.
		 */
		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
			if ((mp->m_sb.sb_rblocks == 0) ||
			    (mp->m_sb.sb_rextsize == 0) ||
			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}

		/*
		 * Can't modify an immutable/append-only file unless
		 * we have appropriate permission.
		 */
		if ((ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
		     (fa->fsx_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	xfs_trans_ijoin(tp, ip, lock_flags);
	xfs_trans_ihold(tp, ip);

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 */
	if (mask & FSX_PROJID) {
		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The set-user-ID and set-group-ID bits of a file will be
		 * cleared upon successful return from chown()
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID))
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);

		/*
		 * Change the ownerships and register quota modifications
		 * in the transaction.
		 */
		if (ip->i_d.di_projid != fa->fsx_projid) {
			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
				olddquot = xfs_qm_vop_chown(tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_projid = fa->fsx_projid;

			/*
			 * We may have to rev the inode as well as
			 * the superblock version number since projids didn't
			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
			 */
			if (ip->i_d.di_version == 1)
				xfs_bump_ino_vers2(tp, ip);
		}

	}

	if (mask & FSX_EXTSIZE)
		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
	if (mask & FSX_XFLAGS) {
		xfs_set_diflags(ip, fa->fsx_xflags);
		xfs_diflags_to_linux(ip);
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 * This is slightly sub-optimal in that truncates require
	 * two sync transactions instead of one for wsync filesystems.
	 * One for the truncate and one for the timestamps since we
	 * don't want to change the timestamps unless we're sure the
	 * truncate worked.  Truncates are less than 1% of the laddis
	 * mix so this probably isn't worth the trouble to optimize.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);
	code = xfs_trans_commit(tp, 0);
	xfs_iunlock(ip, lock_flags);

	/*
	 * Release any dquot(s) the inode had kept before chown.
	 */
	xfs_qm_dqrele(olddquot);
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);

	if (code)
		return code;

	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
		XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
				NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
				(mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
	}

	return 0;

 error_return:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_trans_cancel(tp, 0);
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return code;
}
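The FSX_EXTSIZE checks above reduce to a single alignment rule: a nonzero extent size hint must be a whole multiple of the relevant block size (the realtime extent size for realtime inodes, the filesystem block size otherwise). A stand-alone sketch of that rule; the helper name is made up for illustration.

static int
extsize_is_valid(
	xfs_extlen_t	extsize,	/* requested hint, in bytes */
	xfs_extlen_t	blocksize)	/* rt extent size or fs block size */
{
	if (extsize == 0)
		return 1;		/* "no hint" is always acceptable */
	return (extsize % blocksize) == 0;
}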
Example no. 3
/*
 * xfs_rename
 */
int
xfs_rename(
	bhv_desc_t	*src_dir_bdp,
	bhv_vname_t	*src_vname,
	bhv_vnode_t	*target_dir_vp,
	bhv_vname_t	*target_vname,
	cred_t		*credp)
{
	xfs_trans_t	*tp;
	xfs_inode_t	*src_dp, *target_dp, *src_ip, *target_ip;
	xfs_mount_t	*mp;
	int		new_parent;		/* moving to a new dir */
	int		src_is_directory;	/* src_name is a directory */
	int		error;
	xfs_bmap_free_t free_list;
	xfs_fsblock_t   first_block;
	int		cancel_flags;
	int		committed;
	xfs_inode_t	*inodes[4];
	int		target_ip_dropped = 0;	/* dropped target_ip link? */
	bhv_vnode_t	*src_dir_vp;
	int		spaceres;
	int		target_link_zero = 0;
	int		num_inodes;
	char		*src_name = VNAME(src_vname);
	char		*target_name = VNAME(target_vname);
	int		src_namelen = VNAMELEN(src_vname);
	int		target_namelen = VNAMELEN(target_vname);

	src_dir_vp = BHV_TO_VNODE(src_dir_bdp);
	vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address);
	vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address);

	/*
	 * Find the XFS behavior descriptor for the target directory
	 * vnode since it was not handed to us.
	 */
	target_dp = xfs_vtoi(target_dir_vp);
	if (target_dp == NULL) {
		return XFS_ERROR(EXDEV);
	}

	src_dp = XFS_BHVTOI(src_dir_bdp);
	mp = src_dp->i_mount;

	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
				target_dp, DM_EVENT_RENAME)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
					src_dir_vp, DM_RIGHT_NULL,
					target_dir_vp, DM_RIGHT_NULL,
					src_name, target_name,
					0, 0, 0);
		if (error) {
			return error;
		}
	}
	/* Return through std_return after this point. */

	/*
	 * Lock all the participating inodes. Depending upon whether
	 * the target_name exists in the target directory, and
	 * whether the target directory is the same as the source
	 * directory, we can lock from 2 to 4 inodes.
	 * xfs_lock_for_rename() will return ENOENT if src_name
	 * does not exist in the source directory.
	 */
	tp = NULL;
	error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
			target_vname, &src_ip, &target_ip, inodes,
			&num_inodes);

	if (error) {
		/*
		 * We have nothing locked, no inode references, and
		 * no transaction, so just get out.
		 */
		goto std_return;
	}

	ASSERT(src_ip != NULL);

	if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
		/*
		 * Check for link count overflow on target_dp
		 */
		if (target_ip == NULL && (src_dp != target_dp) &&
		    target_dp->i_d.di_nlink >= XFS_MAXLINK) {
			error = XFS_ERROR(EMLINK);
			xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
			goto rele_return;
		}
	}

	/*
	 * If we are using project inheritance, we only allow renames
	 * into our tree when the project IDs are the same; else the
	 * tree quota mechanism would be circumvented.
	 */
	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
		error = XFS_ERROR(EXDEV);
		xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
		goto rele_return;
	}

	new_parent = (src_dp != target_dp);
	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);

	/*
	 * Drop the locks on our inodes so that we can start the transaction.
	 */
	xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);

	XFS_BMAP_INIT(&free_list, &first_block);
	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	if (error == ENOSPC) {
		spaceres = 0;
		error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	}
	if (error) {
		xfs_trans_cancel(tp, 0);
		goto rele_return;
	}

	/*
	 * Attach the dquots to the inodes
	 */
	if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
		xfs_trans_cancel(tp, cancel_flags);
		goto rele_return;
	}

	/*
	 * Reacquire the inode locks we dropped above.
	 */
	xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL);

	/*
	 * Join all the inodes to the transaction. From this point on,
	 * we can rely on either trans_commit or trans_cancel to unlock
	 * them.  Note that we need to add a vnode reference to the
	 * directories since trans_commit & trans_cancel will decrement
	 * them when they unlock the inodes.  Also, we need to be careful
	 * not to add an inode to the transaction more than once.
	 */
	VN_HOLD(src_dir_vp);
	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
	if (new_parent) {
		VN_HOLD(target_dir_vp);
		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
	}
	if ((src_ip != src_dp) && (src_ip != target_dp)) {
		xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
	}
	if ((target_ip != NULL) &&
	    (target_ip != src_ip) &&
	    (target_ip != src_dp) &&
	    (target_ip != target_dp)) {
		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
	}

	/*
	 * Set up the target.
	 */
	if (target_ip == NULL) {
		/*
		 * If there's no space reservation, check the entry will
		 * fit before actually inserting it.
		 */
		if (spaceres == 0 &&
		    (error = xfs_dir_canenter(tp, target_dp, target_name,
						target_namelen)))
			goto error_return;
		/*
		 * If target does not exist and the rename crosses
		 * directories, adjust the target directory link count
		 * to account for the ".." reference from the new entry.
		 */
		error = xfs_dir_createname(tp, target_dp, target_name,
					   target_namelen, src_ip->i_ino,
					   &first_block, &free_list, spaceres);
		if (error == ENOSPC)
			goto error_return;
		if (error)
			goto abort_return;
		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		if (new_parent && src_is_directory) {
			error = xfs_bumplink(tp, target_dp);
			if (error)
				goto abort_return;
		}
	} else { /* target_ip != NULL */
		/*
		 * If target exists and it's a directory, check that both
		 * target and source are directories and that target can be
		 * destroyed, or that neither is a directory.
		 */
		if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
			/*
			 * Make sure target dir is empty.
			 */
			if (!(xfs_dir_isempty(target_ip)) ||
			    (target_ip->i_d.di_nlink > 2)) {
				error = XFS_ERROR(EEXIST);
				goto error_return;
			}
		}

		/*
		 * Link the source inode under the target name.
		 * If the source inode is a directory and we are moving
		 * it across directories, its ".." entry will be
		 * inconsistent until we replace that down below.
		 *
		 * In case there is already an entry with the same
		 * name at the destination directory, remove it first.
		 */
		error = xfs_dir_replace(tp, target_dp, target_name,
					target_namelen, src_ip->i_ino,
					&first_block, &free_list, spaceres);
		if (error)
			goto abort_return;
		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		/*
		 * Decrement the link count on the target since the target
		 * dir no longer points to it.
		 */
		error = xfs_droplink(tp, target_ip);
		if (error)
			goto abort_return;
		target_ip_dropped = 1;

		if (src_is_directory) {
			/*
			 * Drop the link from the old "." entry.
			 */
			error = xfs_droplink(tp, target_ip);
			if (error)
				goto abort_return;
		}

		/* Do this test while we still hold the locks */
		target_link_zero = (target_ip->i_d.di_nlink == 0);

	} /* target_ip != NULL */

	/*
	 * Remove the source.
	 */
	if (new_parent && src_is_directory) {
		/*
		 * Rewrite the ".." entry to point to the new
		 * directory.
		 */
		error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
					&first_block, &free_list, spaceres);
		ASSERT(error != EEXIST);
		if (error)
			goto abort_return;
		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	} else {
		/*
		 * We always want to hit the ctime on the source inode.
		 * We do it in the if clause above for the 'new_parent &&
		 * src_is_directory' case, and here we get all the other
		 * cases.  This isn't strictly required by the standards
		 * since the source inode isn't really being changed,
		 * but old unix file systems did it and some incremental
		 * backup programs won't work without it.
		 */
		xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
	}

	/*
	 * Adjust the link count on src_dp.  This is necessary when
	 * renaming a directory, either within one parent when
	 * the target existed, or across two parent directories.
	 */
	if (src_is_directory && (new_parent || target_ip != NULL)) {

		/*
		 * Decrement link count on src_directory since the
		 * entry that's moved no longer points to it.
		 */
		error = xfs_droplink(tp, src_dp);
		if (error)
			goto abort_return;
	}

	error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
			src_ip->i_ino, &first_block, &free_list, spaceres);
	if (error)
		goto abort_return;
	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * Update the generation counts on all the directory inodes
	 * that we're modifying.
	 */
	src_dp->i_gen++;
	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);

	if (new_parent) {
		target_dp->i_gen++;
		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
	}

	/*
	 * If there was a target inode, take an extra reference on
	 * it here so that it doesn't go to xfs_inactive() from
	 * within the commit.
	 */
	if (target_ip != NULL) {
		IHOLD(target_ip);
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * rename transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	/*
	 * Take refs. for vop_link_removed calls below.  No need to worry
	 * about directory refs. because the caller holds them.
	 *
	 * Do holds before the xfs_bmap_finish since it might rele them down
	 * to zero.
	 */

	if (target_ip_dropped)
		IHOLD(target_ip);
	IHOLD(src_ip);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
				 XFS_TRANS_ABORT));
		if (target_ip != NULL) {
			IRELE(target_ip);
		}
		if (target_ip_dropped) {
			IRELE(target_ip);
		}
		IRELE(src_ip);
		goto std_return;
	}

	/*
	 * trans_commit will unlock src_ip, target_ip & decrement
	 * the vnode references.
	 */
	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (target_ip != NULL) {
		xfs_refcache_purge_ip(target_ip);
		IRELE(target_ip);
	}
	/*
	 * Let interposed file systems know about removed links.
	 */
	if (target_ip_dropped) {
		bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp,
					target_link_zero);
		IRELE(target_ip);
	}

	IRELE(src_ip);

	/* Fall through to std_return with error = 0 or errno from
	 * xfs_trans_commit	 */
std_return:
	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) ||
	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
				target_dp, DM_EVENT_POSTRENAME)) {
		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
					src_dir_vp, DM_RIGHT_NULL,
					target_dir_vp, DM_RIGHT_NULL,
					src_name, target_name,
					0, error, 0);
	}
	return error;

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */
 error_return:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;

 rele_return:
	IRELE(src_ip);
	if (target_ip != NULL) {
		IRELE(target_ip);
	}
	goto std_return;
}
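xfs_lock_for_rename() and xfs_lock_inodes() above may take up to four inode locks, which deadlocks unless every caller agrees on one global ordering. A minimal illustrative sketch of that rule, assuming ascending inode number as the canonical order; this is not the real xfs_lock_inodes().

static void
lock_two_inodes_in_order(
	xfs_inode_t	*a,
	xfs_inode_t	*b)
{
	if (a->i_ino > b->i_ino) {
		xfs_inode_t	*tmp = a;	/* swap into canonical order */

		a = b;
		b = tmp;
	}
	xfs_ilock(a, XFS_ILOCK_EXCL);
	if (b != a)
		xfs_ilock(b, XFS_ILOCK_EXCL);
}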
Example no. 4
/*
 * xfs_rename
 */
int
xfs_rename(
	xfs_inode_t	*src_dp,
	struct xfs_name	*src_name,
	xfs_inode_t	*src_ip,
	xfs_inode_t	*target_dp,
	struct xfs_name	*target_name,
	xfs_inode_t	*target_ip)
{
	xfs_trans_t	*tp = NULL;
	xfs_mount_t	*mp = src_dp->i_mount;
	int		new_parent;		/* moving to a new dir */
	int		src_is_directory;	/* src_name is a directory */
	int		error;
	xfs_bmap_free_t free_list;
	xfs_fsblock_t   first_block;
	int		cancel_flags;
	int		committed;
	xfs_inode_t	*inodes[4];
	int		spaceres;
	int		num_inodes;

	xfs_itrace_entry(src_dp);
	xfs_itrace_entry(target_dp);

	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
					src_dp, DM_RIGHT_NULL,
					target_dp, DM_RIGHT_NULL,
					src_name->name, target_name->name,
					0, 0, 0);
		if (error)
			return error;
	}
	/* Return through std_return after this point. */

	new_parent = (src_dp != target_dp);
	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);

	if (src_is_directory) {
		/*
		 * Check for link count overflow on target_dp
		 */
		if (target_ip == NULL && new_parent &&
		    target_dp->i_d.di_nlink >= XFS_MAXLINK) {
			error = XFS_ERROR(EMLINK);
			goto std_return;
		}
	}

	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
				inodes, &num_inodes);

	xfs_bmap_init(&free_list, &first_block);
	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	if (error == ENOSPC) {
		spaceres = 0;
		error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	}
	if (error) {
		xfs_trans_cancel(tp, 0);
		goto std_return;
	}

	/*
	 * Attach the dquots to the inodes
	 */
	if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
		xfs_trans_cancel(tp, cancel_flags);
		goto std_return;
	}

	/*
	 * Lock all the participating inodes. Depending upon whether
	 * the target_name exists in the target directory, and
	 * whether the target directory is the same as the source
	 * directory, we can lock from 2 to 4 inodes.
	 */
	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);

	/*
	 * Join all the inodes to the transaction. From this point on,
	 * we can rely on either trans_commit or trans_cancel to unlock
	 * them.  Note that we need to add a vnode reference to the
	 * directories since trans_commit & trans_cancel will decrement
	 * them when they unlock the inodes.  Also, we need to be careful
	 * not to add an inode to the transaction more than once.
	 */
	IHOLD(src_dp);
	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);

	if (new_parent) {
		IHOLD(target_dp);
		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
	}

	IHOLD(src_ip);
	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);

	if (target_ip) {
		IHOLD(target_ip);
		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
	}

	/*
	 * If we are using project inheritance, we only allow renames
	 * into our tree when the project IDs are the same; else the
	 * tree quota mechanism would be circumvented.
	 */
	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
		error = XFS_ERROR(EXDEV);
		goto error_return;
	}

	/*
	 * Set up the target.
	 */
	if (target_ip == NULL) {
		/*
		 * If there's no space reservation, check the entry will
		 * fit before actually inserting it.
		 */
		error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
		if (error)
			goto error_return;
		/*
		 * If target does not exist and the rename crosses
		 * directories, adjust the target directory link count
		 * to account for the ".." reference from the new entry.
		 */
		error = xfs_dir_createname(tp, target_dp, target_name,
						src_ip->i_ino, &first_block,
						&free_list, spaceres);
		if (error == ENOSPC)
			goto error_return;
		if (error)
			goto abort_return;
		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		if (new_parent && src_is_directory) {
			error = xfs_bumplink(tp, target_dp);
			if (error)
				goto abort_return;
		}
	} else { /* target_ip != NULL */
		/*
		 * If target exists and it's a directory, check that both
		 * target and source are directories and that target can be
		 * destroyed, or that neither is a directory.
		 */
		if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
			/*
			 * Make sure target dir is empty.
			 */
			if (!(xfs_dir_isempty(target_ip)) ||
			    (target_ip->i_d.di_nlink > 2)) {
				error = XFS_ERROR(EEXIST);
				goto error_return;
			}
		}

		/*
		 * Link the source inode under the target name.
		 * If the source inode is a directory and we are moving
		 * it across directories, its ".." entry will be
		 * inconsistent until we replace that down below.
		 *
		 * In case there is already an entry with the same
		 * name at the destination directory, remove it first.
		 */
		error = xfs_dir_replace(tp, target_dp, target_name,
					src_ip->i_ino,
					&first_block, &free_list, spaceres);
		if (error)
			goto abort_return;
		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		/*
		 * Decrement the link count on the target since the target
		 * dir no longer points to it.
		 */
		error = xfs_droplink(tp, target_ip);
		if (error)
			goto abort_return;

		if (src_is_directory) {
			/*
			 * Drop the link from the old "." entry.
			 */
			error = xfs_droplink(tp, target_ip);
			if (error)
				goto abort_return;
		}
	} /* target_ip != NULL */

	/*
	 * Remove the source.
	 */
	if (new_parent && src_is_directory) {
		/*
		 * Rewrite the ".." entry to point to the new
		 * directory.
		 */
		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
					target_dp->i_ino,
					&first_block, &free_list, spaceres);
		ASSERT(error != EEXIST);
		if (error)
			goto abort_return;
	}

	/*
	 * We always want to hit the ctime on the source inode.
	 *
	 * This isn't strictly required by the standards since the source
	 * inode isn't really being changed, but old unix file systems did
	 * it and some incremental backup programs won't work without it.
	 */
	xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);

	/*
	 * Adjust the link count on src_dp.  This is necessary when
	 * renaming a directory, either within one parent when
	 * the target existed, or across two parent directories.
	 */
	if (src_is_directory && (new_parent || target_ip != NULL)) {

		/*
		 * Decrement link count on src_directory since the
		 * entry that's moved no longer points to it.
		 */
		error = xfs_droplink(tp, src_dp);
		if (error)
			goto abort_return;
	}

	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
					&first_block, &free_list, spaceres);
	if (error)
		goto abort_return;

	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
	if (new_parent)
		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);

	/*
	 * If this is a synchronous mount, make sure that the
	 * rename transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
				 XFS_TRANS_ABORT));
		goto std_return;
	}

	/*
	 * trans_commit will unlock src_ip, target_ip & decrement
	 * the vnode references.
	 */
	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);

	/* Fall through to std_return with error = 0 or errno from
	 * xfs_trans_commit	 */
std_return:
	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
	    DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
					src_dp, DM_RIGHT_NULL,
					target_dp, DM_RIGHT_NULL,
					src_name->name, target_name->name,
					0, error, 0);
	}
	return error;

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */
 error_return:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;
}
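Both rename versions share the same two-step reservation: first ask for enough blocks to grow the target directory, and if the filesystem is out of space, retry with no block reservation at all, since the new entry may still fit in already-allocated directory blocks. A sketch of the pattern, factored into a hypothetical helper:

static int
rename_trans_reserve(
	xfs_trans_t	*tp,
	xfs_mount_t	*mp,
	int		*spaceres)
{
	int		error;

	error = xfs_trans_reserve(tp, *spaceres, XFS_RENAME_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	if (error == ENOSPC) {
		/* full filesystem: retry without reserving new blocks */
		*spaceres = 0;
		error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
	}
	return error;
}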
Example no. 5
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0, error = 0;
	int			ioflags = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	if (count == 0)
		return 0;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			return XFS_ERROR(-EINVAL);
		}

		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > ip->i_size) {
		error = xfs_zero_eof(ip, pos, ip->i_size);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */
	error = -file_remove_suid(file);
	if (unlikely(error))
		goto out_unlock_internal;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			error = xfs_flushinval_pages(ip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&nr_segs, pos, &iocb->ki_pos, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(ip, iolock);
			goto relock;
		}
	} else {
		int enospc = 0;
		ssize_t ret2 = 0;

write_retry:
		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
				pos, &iocb->ki_pos, count, ret);
		/*
		 * If we just got an ENOSPC, flush the inode now that we
		 * are no longer holding any page locks, and retry *once*.
		 */
		if (ret2 == -ENOSPC && !enospc) {
			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
			if (error)
				goto out_unlock_internal;
			enospc = 1;
			goto write_retry;
		}
		ret = ret2;
	}

	current->backing_dev_info = NULL;

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
		iocb->ki_pos = isize;

	if (iocb->ki_pos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (iocb->ki_pos > ip->i_size)
			ip->i_size = iocb->ki_pos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);
		if (error)
			goto out_unlock_internal;
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int error2;

		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error2 = filemap_write_and_wait_range(mapping, pos, end);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		error2 = -xfs_file_fsync(file, file->f_path.dentry,
					 (file->f_flags & __O_SYNC) ? 0 : 1);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occurred.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
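The direct-I/O path above rejects any request whose offset or length is not aligned to the target device's sector mask (bt_smask, i.e. the sector size minus one). A self-contained sketch of that test; the function name is illustrative only.

static int
dio_is_aligned(
	loff_t		pos,		/* file offset of the write */
	size_t		count,		/* length of the write */
	unsigned int	smask)		/* sector size - 1, e.g. 511 */
{
	/* offset and length must both be multiples of the sector size */
	return !((pos & smask) || (count & smask));
}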
Example no. 6
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_isem = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->pbr_smask) || (count & target->pbr_smask))
			return XFS_ERROR(-EINVAL);

		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_isem = 0;

		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_isem) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		down(&inode->i_sem);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_isem;
	}

	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_isem)
			dmflags |= DM_FLAGS_ISEM;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in, so long as the write had a size.
	 *
	 * We must update the xfs times as well, since revalidate
	 * copies the xfs values over the Linux inode.
	 */
	if (!(ioflags & IO_INVIS)) {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		inode_update_time(inode, 1);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
					isize, pos + count);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_isem;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_isem) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			up(&inode->i_sem);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_isem = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_isem = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(iocb);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (error)
			goto out_unlock_isem;
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		goto retry;
	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
				if (error)
					goto out_unlock_internal;
			}
		}
	
		xfs_rwunlock(bdp, locktype);
		if (need_isem)
			up(&inode->i_sem);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_isem:
	if (need_isem)
		up(&inode->i_sem);
	return -error;
}
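The hand-rolled segment loop at the top of this version (replaced by generic_segment_checks() in later code) relies on one trick: after adding each iov_len to the running total, testing (ssize_t)(sum | len) < 0 catches both a single over-large length and a cumulative wrap past SSIZE_MAX in one comparison. A compilable user-space sketch of the same check:

#include <sys/types.h>
#include <sys/uio.h>

ssize_t
total_iov_len(const struct iovec *iov, unsigned long nsegs)
{
	size_t		sum = 0;
	unsigned long	seg;

	for (seg = 0; seg < nsegs; seg++) {
		sum += iov[seg].iov_len;
		/* negative iff either value has its sign bit set */
		if ((ssize_t)(sum | iov[seg].iov_len) < 0)
			return -1;
	}
	return (ssize_t)sum;
}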
Example no. 7
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	bhv_vnode_t		*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	bhv_vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_i_mutex = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask))
			return XFS_ERROR(-EINVAL);

		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_i_mutex = 0;

		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_i_mutex) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		mutex_lock(&inode->i_mutex);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	if (likely(!(ioflags & IO_INVIS))) {
		file_update_time(file);
		xfs_ichgtime_fast(xip, inode,
				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_mutex;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_path.dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_mutex;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_i_mutex = 0;
		}

 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_i_mutex = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
		ret = wait_on_sync_kiocb(iocb);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (error)
			goto out_nounlocks;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		ret = 0;
		goto retry;
	}

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
		*offset = isize;

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		error = xfs_write_sync_logforce(mp, xip);
		if (error)
			goto out_unlock_internal;

		xfs_rwunlock(bdp, locktype);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
 out_nounlocks:
	return -error;
}
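The size update near the end of this example tests *offset against xip->i_d.di_size once without the inode lock and again under XFS_ILOCK_EXCL: the unlocked test keeps the common case (no extension) cheap, and the locked retest closes the race with a concurrent writer that extended the file in the meantime. A minimal userspace sketch of the same check-lock-recheck pattern, assuming POSIX threads; the names shared_size, size_lock and grow_file_size are illustrative and not part of XFS:

#include <pthread.h>
#include <stdio.h>

static long long shared_size;
static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;

static void grow_file_size(long long new_offset)
{
	/* Cheap unlocked check first: most writes do not extend the file. */
	if (new_offset > shared_size) {
		pthread_mutex_lock(&size_lock);
		/* Recheck under the lock: another writer may have grown
		 * the size between the unlocked check and the lock. */
		if (new_offset > shared_size)
			shared_size = new_offset;
		pthread_mutex_unlock(&size_lock);
	}
}

int main(void)
{
	grow_file_size(4096);
	grow_file_size(1024);		/* no-op: file is already larger */
	printf("size = %lld\n", shared_size);
	return 0;
}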
Example no. 8
ssize_t				/* bytes written, or (-) error */
xfs_write(
	struct xfs_inode	*xip,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	pos = *offset;

	if (count == 0)
		return 0;

	mp = xip->i_mount;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != xip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(xip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return XFS_ERROR(-EINVAL);
		}

		if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}
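	/*
	 * Editor's note: the shared iolock cannot be upgraded in place,
	 * so when a direct write finds cached pages or is extending the
	 * file, the block above drops all locks, takes i_mutex plus the
	 * exclusive iolock, and jumps back to "start" to revalidate
	 * anything that changed while no locks were held.
	 */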

	new_size = pos + count;
	if (new_size > xip->i_size)
		xip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > xip->i_size) {
		error = xfs_zero_eof(xip, pos, xip->i_size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -file_remove_suid(file);
		if (unlikely(error)) {
			goto out_unlock_internal;
		}
	}

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if (ioflags & IO_ISDIRECT) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			xfs_inval_cached_trace(xip, pos, -1,
					(pos & PAGE_CACHE_MASK), -1);
			error = xfs_flushinval_pages(xip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
		ret = wait_on_sync_kiocb(iocb);

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
		*offset = isize;

	if (*offset > xip->i_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_size)
			xip->i_size = *offset;
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(xip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
				DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(xip, iolock);
		if (error)
			goto out_unlock_internal;
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		int error2;

		xfs_iunlock(xip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error2 = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(xip, iolock);
		error2 = xfs_write_sync_logforce(mp, xip);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (xip->i_new_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		xip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occurred.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (xip->i_d.di_size > xip->i_size)
			xip->i_d.di_size = xip->i_size;
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(xip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
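The direct-I/O path in this version rejects any write whose position or length is not sector-aligned: bt_smask is the sector size minus one, so a nonzero result from the mask test means the value is not a whole number of sectors. A standalone sketch of the same test, assuming a 512-byte sector; check_dio_alignment is a made-up name used only for illustration:

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SIZE 512ULL
#define SECTOR_MASK (SECTOR_SIZE - 1)	/* 0x1ff, analogous to bt_smask */

static bool check_dio_alignment(unsigned long long pos,
				unsigned long long count)
{
	/* Equivalent to pos % SECTOR_SIZE == 0 && count % SECTOR_SIZE == 0,
	 * but done with a single mask test as in the code above. */
	return ((pos | count) & SECTOR_MASK) == 0;
}

int main(void)
{
	printf("%d\n", check_dio_alignment(4096, 8192));	/* 1: aligned */
	printf("%d\n", check_dio_alignment(4100, 8192));	/* 0: bad offset */
	printf("%d\n", check_dio_alignment(4096, 100));		/* 0: bad length */
	return 0;
}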
Example no. 9
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		segs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	size_t			size = 0;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret;
	int			error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_fsize_t		n, limit;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address);
	xip = XFS_BHVTOI(bdp);

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative, return -EINVAL.  The check
		 * ORs the new length with the running total, so the sign
		 * bit is set as soon as either value goes negative, and
		 * one signed comparison catches both cases.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return XFS_ERROR(-EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (size == 0)
		return 0;

	io = &(xip->i_iocore);
	mp = io->io_mount;

	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	if (ioflags & IO_ISDIRECT) {
		pb_target_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((*offset & target->pbr_smask) ||
		    (size & target->pbr_smask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}

	if (n < size)
		size = n;

	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;

		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      FILP_DELAY_FLAG(file), &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will otherwise
	 * copy the Linux inode's times back over them.
	 */
	if (size && !(ioflags & IO_INVIS))
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return -error;
		}
	}
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}

retry:
	if (ioflags & IO_ISDIRECT) {
		xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
	}

	ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);

	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;

	}

	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	if (ret <= 0) {
		xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t *iip;
			xfs_lsn_t lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC, and O_DSYNC _with_ a size update, are
			 * handled the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this is a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* (file->f_flags & O_SYNC) || IS_SYNC */

	xfs_rwunlock(bdp, locktype);
	return ret;
}
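The O_SYNC block above makes a two-way decision: a write treated as O_DSYNC that did not change the file size only needs the log forced up to the last transaction touching the inode, while O_SYNC proper, or an O_DSYNC write that did move the size, must log the inode core in a synchronous transaction. A compact model of that decision; the names choose_sync_action, osyncisosync and updated_size are illustrative only:

#include <stdbool.h>
#include <stdio.h>

enum sync_action { FORCE_LOG, SYNC_TRANSACTION };

/*
 * osyncisosync models the XFS_MOUNT_OSYNCISOSYNC mount flag (treat
 * O_SYNC as full O_SYNC rather than O_DSYNC); updated_size models
 * xip->i_update_size from the code above.
 */
static enum sync_action choose_sync_action(bool osyncisosync, bool updated_size)
{
	/* O_DSYNC semantics and no size change: the data is already on
	 * disk, so forcing the log past any allocation transaction is
	 * enough to make the write permanent. */
	if (!osyncisosync && !updated_size)
		return FORCE_LOG;
	/* Otherwise the inode core (timestamps and/or size) must be
	 * made permanent, which takes a synchronous transaction. */
	return SYNC_TRANSACTION;
}

int main(void)
{
	printf("%d\n", choose_sync_action(false, false));	/* 0: force log */
	printf("%d\n", choose_sync_action(false, true));	/* 1: sync transaction */
	printf("%d\n", choose_sync_action(true, false));	/* 1: sync transaction */
	return 0;
}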