/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) return error; /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); if (error) return error; } /* * Updating the timestamps will grab the ilock again from * xfs_fs_dirty_inode, so we have to call it after dropping the * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ if (likely(!(file->f_mode & FMODE_NOCMTIME))) { error = file_update_time(file); if (error) return error; } /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which involves * dropping all locks and relocking to maintain correct locking order. * If we do this, restart the function to ensure all checks and values * are still valid. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
/* * Common pre-write limit and setup checks. * * Returns with iolock held according to @iolock. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = 0; return error; } new_size = *pos + *count; if (new_size > ip->i_size) ip->i_new_size = new_size; if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. */ if (*pos > ip->i_size) error = -xfs_zero_eof(ip, *pos, ip->i_size); xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
/* * xfs_inval_cached_pages * * This routine is responsible for keeping direct I/O and buffered I/O * somewhat coherent. From here we make sure that we're at least * temporarily holding the inode I/O lock exclusively and then call * the page cache to flush and invalidate any cached pages. If there * are no cached pages this routine will be very quick. */ void xfs_inval_cached_pages( vnode_t *vp, xfs_iocore_t *io, xfs_off_t offset, int write, int relock) { xfs_mount_t *mp; if (!VN_CACHED(vp)) { return; } mp = io->io_mount; /* * We need to get the I/O lock exclusively in order * to safely invalidate pages and mappings. */ if (relock) { XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED); XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL); } /* Writing beyond EOF creates a hole that must be zeroed */ if (write && (offset > XFS_SIZE(mp, io))) { xfs_fsize_t isize; XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); isize = XFS_SIZE(mp, io); if (offset > isize) { xfs_zero_eof(vp, io, offset, isize, offset); } XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); } xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1); VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED); if (relock) { XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); } }
STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); return file_remove_suid(file); }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, struct file *file, const char *buf, size_t size, loff_t *offset, int ioflags, cred_t *credp) { xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret; int error = 0; xfs_fsize_t isize, new_size; xfs_fsize_t n, limit; xfs_iocore_t *io; vnode_t *vp; int iolock; int eventsent = 0; vrwlock_t locktype; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); xip = XFS_BHVTOI(bdp); if (size == 0) return 0; io = &xip->i_iocore; mp = io->io_mount; fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { return -EIO; } if (unlikely(ioflags & IO_ISDIRECT)) { if (((__psint_t)buf & BBMASK) || (*offset & mp->m_blockmask) || (size & mp->m_blockmask)) { return XFS_ERROR(-EINVAL); } iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } else { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; } if (ioflags & IO_ISLOCKED) iolock = 0; xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); if (file->f_flags & O_APPEND) *offset = isize; start: n = limit - *offset; if (n <= 0) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return -EFBIG; } if (n < size) size = n; new_size = *offset + size; if (new_size > isize) { io->io_new_size = new_size; } if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, *offset, size, dmflags, &locktype); if (error) { if (iolock) xfs_iunlock(xip, iolock); return -error; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != xip->i_d.di_size) { *offset = isize = xip->i_d.di_size; goto start; } } /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return(-error); } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (error) { xfs_iunlock(xip, iolock); return -error; } } if ((ssize_t) size < 0) { ret = -EINVAL; goto error; } if (!access_ok(VERIFY_READ, buf, size)) { ret = -EINVAL; goto error; } retry: if (unlikely(ioflags & IO_ISDIRECT)) { xfs_inval_cached_pages(vp, io, *offset, 1, 1); xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, *offset, ioflags); ret = do_generic_direct_write(file, buf, size, offset); } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags); ret = do_generic_file_write(file, buf, size, offset); } if (unlikely(ioflags & IO_INVIS)) { /* generic_file_write updates the mtime/ctime but we need * to undo that because this I/O was supposed to be * invisible. */ struct inode *inode = LINVFS_GET_IP(vp); inode->i_mtime = xip->i_d.di_mtime.t_sec; inode->i_ctime = xip->i_d.di_ctime.t_sec; } else { xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } if ((ret == -ENOSPC) && DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) return -error; xfs_rwlock(bdp, locktype); *offset = xip->i_d.di_size; goto retry; } error: if (ret <= 0) { if (iolock) xfs_rwunlock(bdp, locktype); return ret; } XFS_STATS_ADD(xs_write_bytes, ret); if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { struct inode *inode = LINVFS_GET_IP(vp); xip->i_d.di_size = *offset; i_size_write(inode, *offset); xip->i_update_core = 1; xip->i_update_size = 1; mark_inode_dirty_sync(inode); } xfs_iunlock(xip, XFS_ILOCK_EXCL); } /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. */ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ xfs_inode_log_item_t *iip; xfs_lsn_t lsn; iip = xip->i_itemp; if (iip && iip->ili_last_lsn) { lsn = iip->ili_last_lsn; xfs_log_force(mp, lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, NULL); xfs_iunlock(xip, XFS_ILOCK_EXCL); } } } /* (ioflags & O_SYNC) */ /* * If we are coming from an nfsd thread then insert into the * reference cache. */ if (!strcmp(current->comm, "nfsd")) xfs_refcache_insert(xip); /* Drop lock this way - the old refcache release is in here */ if (iolock) xfs_rwunlock(bdp, locktype); return(ret); }
STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0, error = 0; int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; size_t ocount = 0, count; int need_i_mutex; XFS_STATS_INC(xs_write_calls); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); if (error) return error; count = ocount; if (count == 0) return 0; xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); } xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(ip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } xfs_ilock(ip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && pos != ip->i_size) goto start; } if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); goto start; } } new_size = pos + count; if (new_size > ip->i_size) ip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > ip->i_size) { error = xfs_zero_eof(ip, pos, ip->i_size); if (error) { xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ error = -file_remove_suid(file); if (unlikely(error)) goto out_unlock_internal; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); error = xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (error) goto out_unlock_internal; } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; ioflags &= ~IO_ISDIRECT; xfs_iunlock(ip, iolock); goto relock; } } else { int enospc = 0; ssize_t ret2 = 0; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, pos, &iocb->ki_pos, count, ret); /* * if we just got an ENOSPC, flush the inode now we * aren't holding any page locks and retry *once* */ if (ret2 == -ENOSPC && !enospc) { error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); if (error) goto out_unlock_internal; enospc = 1; goto write_retry; } ret = ret2; } current->backing_dev_info = NULL; isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) iocb->ki_pos = isize; if (iocb->ki_pos > ip->i_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); if (iocb->ki_pos > ip->i_size) ip->i_size = iocb->ki_pos; xfs_iunlock(ip, XFS_ILOCK_EXCL); } if (ret == -ENOSPC && DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); if (error) goto out_unlock_internal; goto start; } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; int error2; xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error2 = filemap_write_and_wait_range(mapping, pos, end); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); error2 = -xfs_file_fsync(file, file->f_path.dentry, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; } out_unlock_internal: if (ip->i_new_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); ip->i_new_size = 0; /* * If this was a direct or synchronous I/O that failed (such * as ENOSPC) then part of the I/O may have been written to * disk before the error occured. In this case the on-disk * file size may have been adjusted beyond the in-memory file * size and now needs to be truncated back. */ if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); } xfs_iunlock(ip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); return -error; }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, struct kiocb *iocb, const struct iovec *iovp, unsigned int nsegs, loff_t *offset, int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long segs = nsegs; xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; xfs_iocore_t *io; vnode_t *vp; unsigned long seg; int iolock; int eventsent = 0; vrwlock_t locktype; size_t ocount = 0, count; loff_t pos; int need_isem = 1, need_flush = 0; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); xip = XFS_BHVTOI(bdp); for (seg = 0; seg < segs; seg++) { const struct iovec *iv = &iovp[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ ocount += iv->iov_len; if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) return -EINVAL; if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; segs = seg; ocount -= iv->iov_len; /* This segment is no good */ break; } count = ocount; pos = *offset; if (count == 0) return 0; io = &xip->i_iocore; mp = io->io_mount; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->pbr_smask) || (count & target->pbr_smask)) return XFS_ERROR(-EINVAL); if (!VN_CACHED(vp) && pos < i_size_read(inode)) need_isem = 0; if (VN_CACHED(vp)) need_flush = 1; } relock: if (need_isem) { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; down(&inode->i_sem); } else { iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = i_size_read(inode); if (file->f_flags & O_APPEND) *offset = isize; start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_isem; } new_size = pos + count; if (new_size > isize) io->io_new_size = new_size; if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = pos; int dmflags = FILP_DELAY_FLAG(file); if (need_isem) dmflags |= DM_FLAGS_ISEM; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, pos, count, dmflags, &locktype); if (error) { xfs_iunlock(xip, iolock); goto out_unlock_isem; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != isize) { pos = isize = xip->i_d.di_size; goto start; } } /* * On Linux, generic_file_write updates the times even if * no data is copied in so long as the write had a size. * * We must update xfs' times since revalidate will overcopy xfs. */ if (!(ioflags & IO_INVIS)) { xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); inode_update_time(inode, 1); } /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > isize) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize, pos + count); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_isem; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -remove_suid(file->f_dentry); if (unlikely(error)) { xfs_iunlock(xip, iolock); goto out_unlock_isem; } } retry: /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (need_flush) { xfs_inval_cached_trace(io, pos, -1, ctooff(offtoct(pos)), -1); VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)), -1, FI_REMAPF_LOCKED); } if (need_isem) { /* demote the lock now the cached pages are gone */ XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); up(&inode->i_sem); iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; need_isem = 0; } xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; need_isem = 1; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; } } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, *offset, ioflags); ret = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); } current->backing_dev_info = NULL; if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(iocb); if ((ret == -ENOSPC) && DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) goto out_unlock_isem; xfs_rwlock(bdp, locktype); pos = xip->i_d.di_size; goto retry; } if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { xip->i_d.di_size = *offset; i_size_write(inode, *offset); xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. */ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { xfs_inode_log_item_t *iip = xip->i_itemp; /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ if (iip && iip->ili_last_lsn) { xfs_log_force(mp, iip->ili_last_lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, NULL); xfs_iunlock(xip, XFS_ILOCK_EXCL); if (error) goto out_unlock_internal; } } xfs_rwunlock(bdp, locktype); if (need_isem) up(&inode->i_sem); error = sync_page_range(inode, mapping, pos, ret); if (!error) error = ret; return error; } out_unlock_internal: xfs_rwunlock(bdp, locktype); out_unlock_isem: if (need_isem) up(&inode->i_sem); return -error; }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return -EROFS; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; error = inode_change_ok(inode, iattr); if (error) return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; /* * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; /* * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach(ip, 0); if (error) return error; /* * Wait for all direct I/O to complete. */ inode_dio_wait(inode); /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * * Start with zeroing any data beyond EOF that we may expose on file * extension, or zeroing out the rest of the block on a downward * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); } else { error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_iomap_ops); } if (error) return error; /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ if (did_zeroing || (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we * drop the XFS_MMAP_EXCL lock after the extent manipulations are * complete. The truncate_setsize() call also cleans partial EOF page * PTEs on extending truncates and hence ensures sub-page block size * filesystems are correctly handled, too. * * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. */ truncate_setsize(inode, newsize); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) return error; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); /* A truncate down always removes post-EOF blocks. */ xfs_inode_clear_eofblocks_tag(ip); } if (iattr->ia_valid & ATTR_MODE) xfs_setattr_mode(ip, iattr); if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; }
/* * Common pre-write limit and setup checks. * * Returns with iolock held according to @iolock. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = 0; return error; } if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. There is no need to issue zeroing if another in-flght IO ends * at or before this one If zeronig is needed and we are currently * holding the iolock shared, we need to update it to exclusive which * involves dropping all locks and relocking to maintain correct locking * order. If we do this, restart the function to ensure all checks and * values are still valid. */ if ((ip->i_new_size && *pos > ip->i_new_size) || (!ip->i_new_size && *pos > ip->i_size)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, ip->i_size); } /* * If this IO extends beyond EOF, we may need to update ip->i_new_size. * We have already zeroed space beyond EOF (if necessary). Only update * ip->i_new_size if this IO ends beyond any other in-flight writes. */ new_size = *pos + *count; if (new_size > ip->i_size) { if (new_size > ip->i_new_size) ip->i_new_size = new_size; *new_sizep = new_size; } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr, int flags) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; struct xfs_trans *tp; int error; uint lock_flags; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); lock_flags = XFS_ILOCK_EXCL; if (!(flags & XFS_ATTR_NOLOCK)) lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); /* * Short circuit the truncate case for zero length files. */ if (iattr->ia_size == 0 && ip->i_size == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; /* * Use the regular setattr path to update the timestamps. */ xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach_locked(ip, 0); if (error) goto out_unlock; /* * Now we can make the changes. Before we join the inode to the * transaction, take care of the part of the truncation that must be * done without the inode lock. This needs to be done before joining * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ if (iattr->ia_size > ip->i_size) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); if (error) goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, XBF_ASYNC, FI_NONE); if (error) goto out_unlock; } /* * Wait for all I/O to complete. */ xfs_ioend_wait(ip); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); if (error) goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); if (error) goto out_trans_cancel; truncate_setsize(inode, iattr->ia_size); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (iattr->ia_size != ip->i_size && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } if (iattr->ia_size > ip->i_size) { ip->i_d.di_size = iattr->ia_size; ip->i_size = iattr->ia_size; } else if (iattr->ia_size <= ip->i_size || (iattr->ia_size == 0 && ip->i_d.di_nextents)) { error = xfs_itruncate_data(&tp, ip, iattr->ia_size); if (error) goto out_trans_abort; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr, int flags) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); lock_flags = XFS_ILOCK_EXCL; if (!(flags & XFS_ATTR_NOLOCK)) lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); oldsize = inode->i_size; newsize = iattr->ia_size; if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } error = xfs_qm_dqattach_locked(ip, 0); if (error) goto out_unlock; if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; } inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); if (error) goto out_trans_cancel; truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; xfs_iflags_set(ip, XFS_ITRUNCATED); } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, struct kiocb *iocb, const struct iovec *iovp, unsigned int nsegs, loff_t *offset, int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long segs = nsegs; xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; xfs_iocore_t *io; bhv_vnode_t *vp; unsigned long seg; int iolock; int eventsent = 0; bhv_vrwlock_t locktype; size_t ocount = 0, count; loff_t pos; int need_i_mutex = 1, need_flush = 0; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); xip = XFS_BHVTOI(bdp); for (seg = 0; seg < segs; seg++) { const struct iovec *iv = &iovp[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ ocount += iv->iov_len; if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) return -EINVAL; if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; segs = seg; ocount -= iv->iov_len; /* This segment is no good */ break; } count = ocount; pos = *offset; if (count == 0) return 0; io = &xip->i_iocore; mp = io->io_mount; vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) return XFS_ERROR(-EINVAL); if (!VN_CACHED(vp) && pos < i_size_read(inode)) need_i_mutex = 0; if (VN_CACHED(vp)) need_flush = 1; } relock: if (need_i_mutex) { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; mutex_lock(&inode->i_mutex); } else { iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = i_size_read(inode); if (file->f_flags & O_APPEND) *offset = isize; start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } new_size = pos + count; if (new_size > isize) io->io_new_size = new_size; if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = pos; int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, pos, count, dmflags, &locktype); if (error) { xfs_iunlock(xip, iolock); goto out_unlock_mutex; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != isize) { pos = isize = xip->i_d.di_size; goto start; } } if (likely(!(ioflags & IO_INVIS))) { file_update_time(file); xfs_ichgtime_fast(xip, inode, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > isize) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -remove_suid(file->f_path.dentry); if (unlikely(error)) { xfs_iunlock(xip, iolock); goto out_unlock_mutex; } } retry: /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (need_flush) { xfs_inval_cached_trace(io, pos, -1, ctooff(offtoct(pos)), -1); bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), -1, FI_REMAPF_LOCKED); } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; need_i_mutex = 0; } xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; need_i_mutex = 1; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; } } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, *offset, ioflags); ret = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); } current->backing_dev_info = NULL; if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); if ((ret == -ENOSPC) && DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) goto out_nounlocks; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_rwlock(bdp, locktype); pos = xip->i_d.di_size; ret = 0; goto retry; } isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { xip->i_d.di_size = *offset; i_size_write(inode, *offset); xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { error = xfs_write_sync_logforce(mp, xip); if (error) goto out_unlock_internal; xfs_rwunlock(bdp, locktype); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = sync_page_range(inode, mapping, pos, ret); if (!error) error = ret; return error; } out_unlock_internal: xfs_rwunlock(bdp, locktype); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); out_nounlocks: return -error; }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, uio_t *uio, int ioflag, cred_t *credp) { xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret = 0; int error = 0; xfs_fsize_t isize, new_size; xfs_fsize_t n, limit; xfs_fsize_t size; xfs_iocore_t *io; xfs_vnode_t *vp; int iolock; //int eventsent = 0; vrwlock_t locktype; xfs_off_t offset_c; xfs_off_t *offset; xfs_off_t pos; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); xip = XFS_BHVTOI(bdp); io = &xip->i_iocore; mp = io->io_mount; if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { return EIO; } size = uio->uio_resid; pos = offset_c = uio->uio_offset; offset = &offset_c; if (unlikely(ioflag & IO_ISDIRECT)) { if (((__psint_t)buf & BBMASK) || (*offset & mp->m_blockmask) || (size & mp->m_blockmask)) { return EINVAL; } iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } else { if (io->io_flags & XFS_IOCORE_RT) return EINVAL; iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; } iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); if (ioflag & O_APPEND) *offset = isize; //start: n = limit - *offset; if (n <= 0) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return EFBIG; } if (n < size) size = n; new_size = *offset + size; if (new_size > isize) { io->io_new_size = new_size; } #ifdef RMC /* probably be a long time before if ever that we do dmapi */ if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, *offset, size, dmflags, &locktype); if (error) { if (iolock) xfs_iunlock(xip, iolock); return -error; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != xip->i_d.di_size) { *offset = isize = xip->i_d.di_size; goto start; } } #endif /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return(-error); } } xfs_iunlock(xip, XFS_ILOCK_EXCL); #if 0 /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -remove_suid(file->f_dentry); if (unlikely(error)) { xfs_iunlock(xip, iolock); goto out_unlock_mutex; } } #endif //retry: if (unlikely(ioflag & IO_ISDIRECT)) { #ifdef RMC xfs_off_t pos = *offset; struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; ret = precheck_file_write(file, inode, &size, &pos); if (ret || size == 0) goto error; xfs_inval_cached_pages(vp, io, pos, 1, 1); inode->i_ctime = inode->i_mtime = CURRENT_TIME; /* mark_inode_dirty_sync(inode); - we do this later */ xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags); ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos); xfs_inval_cached_pages(vp, io, pos, 1, 1); if (ret > 0) *offset += ret; #endif } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags); ret = xfs_write_file(xip,uio,ioflag); } xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); //error: if (ret <= 0) { if (iolock) xfs_rwunlock(bdp, locktype); return ret; } XFS_STATS_ADD(xs_write_bytes, ret); if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__); #ifdef RMC struct inode *inode = LINVFS_GET_IP(vp); i_size_write(inode, *offset); mark_inode_dirty_sync(inode); #endif xip->i_d.di_size = *offset; xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } /* Handle various SYNC-type writes */ #if 0 // if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { #endif if (ioflag & IO_SYNC) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. */ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { xfs_inode_log_item_t *iip = xip->i_itemp; /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ if (iip && iip->ili_last_lsn) { xfs_log_force(mp, iip->ili_last_lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, NULL); xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (error) goto out_unlock_internal; } xfs_rwunlock(bdp, locktype); return ret; } /* (ioflags & O_SYNC) */ out_unlock_internal: xfs_rwunlock(bdp, locktype); #if 0 out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); #endif //out_nounlocks: return -error; }
ssize_t /* bytes written, or (-) error */ xfs_write( struct xfs_inode *xip, struct kiocb *iocb, const struct iovec *iovp, unsigned int nsegs, loff_t *offset, int ioflags) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long segs = nsegs; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; size_t ocount = 0, count; loff_t pos; int need_i_mutex; XFS_STATS_INC(xs_write_calls); error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); if (error) return error; count = ocount; pos = *offset; if (count == 0) return 0; mp = xip->i_mount; xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && pos != xip->i_size) goto start; } if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(xip) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); goto start; } } new_size = pos + count; if (new_size > xip->i_size) xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > xip->i_size) { error = xfs_zero_eof(xip, pos, xip->i_size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -file_remove_suid(file); if (unlikely(error)) { goto out_unlock_internal; } } /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); xfs_inval_cached_trace(xip, pos, -1, (pos & PAGE_CACHE_MASK), -1); error = xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (error) goto out_unlock_internal; } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; } } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); } current->backing_dev_info = NULL; if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; if (*offset > xip->i_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_size) xip->i_size = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (ret == -ENOSPC && DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); if (error) goto out_unlock_internal; goto start; } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { int error2; xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error2 = sync_page_range(inode, mapping, pos, ret); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); error2 = xfs_write_sync_logforce(mp, xip); if (!error) error = error2; } out_unlock_internal: if (xip->i_new_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); xip->i_new_size = 0; /* * If this was a direct or synchronous I/O that failed (such * as ENOSPC) then part of the I/O may have been written to * disk before the error occured. In this case the on-disk * file size may have been adjusted beyond the in-memory file * size and now needs to be truncated back. */ if (xip->i_d.di_size > xip->i_size) xip->i_d.di_size = xip->i_size; xfs_iunlock(xip, XFS_ILOCK_EXCL); } xfs_iunlock(xip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); return -error; }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; /* * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; /* * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach(ip, 0); if (error) return error; /* * Now we can make the changes. Before we join the inode to the * transaction, take care of the part of the truncation that must be * done without the inode lock. This needs to be done before joining * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ error = xfs_zero_eof(ip, newsize, oldsize); if (error) return error; } /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } /* * Wait for all direct I/O to complete. */ inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); /* A truncate down always removes post-EOF blocks. */ xfs_inode_clear_eofblocks_tag(ip); } if (iattr->ia_valid & ATTR_MODE) xfs_setattr_mode(ip, iattr); if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct kiocb *iocb, struct iov_iter *from, int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t error = 0; size_t count = iov_iter_count(from); bool drained_dio = false; restart: error = generic_write_checks(iocb, from); if (error <= 0) return error; error = xfs_break_layouts(inode, iolock); if (error) return error; /* * For changing security info in file_remove_privs() we need i_rwsem * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); goto restart; } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. * * We need to serialise against EOF updates that occur in IO * completions here. We want to make sure that nobody is changing the * size while we do this check until we have placed an IO barrier (i.e. * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. * The spinlock effectively forms a memory barrier once we have the * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value * and hence be able to correctly determine if we need to run zeroing. */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* * We now have an IO submission barrier in place, but * AIO can do EOF updates during IO completion and hence * we now need to wait for all of them to drain. Non-AIO * DIO will have drained before we are given the * XFS_IOLOCK_EXCL, and so for most cases this wait is a * no-op. */ inode_dio_wait(inode); drained_dio = true; goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from * xfs_fs_dirty_inode, so we have to call it after dropping the * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ if (likely(!(file->f_mode & FMODE_NOCMTIME))) { error = file_update_time(file); if (error) return error; } /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ if (!IS_NOSEC(inode)) return file_remove_privs(file); return 0; }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; size_t size = 0; xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret; int error = 0; xfs_fsize_t isize, new_size; xfs_fsize_t n, limit; xfs_iocore_t *io; vnode_t *vp; unsigned long seg; int iolock; int eventsent = 0; vrwlock_t locktype; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address); xip = XFS_BHVTOI(bdp); /* START copy & waste from filemap.c */ for (seg = 0; seg < segs; seg++) { const struct iovec *iv = &iovp[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ size += iv->iov_len; if (unlikely((ssize_t)(size|iv->iov_len) < 0)) return XFS_ERROR(-EINVAL); } /* END copy & waste from filemap.c */ if (size == 0) return 0; io = &(xip->i_iocore); mp = io->io_mount; xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) { return -EIO; } if (ioflags & IO_ISDIRECT) { pb_target_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((*offset & target->pbr_smask) || (size & target->pbr_smask)) { return XFS_ERROR(-EINVAL); } iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } else { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); if (file->f_flags & O_APPEND) *offset = isize; start: n = limit - *offset; if (n <= 0) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return -EFBIG; } if (n < size) size = n; new_size = *offset + size; if (new_size > isize) { io->io_new_size = new_size; } if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, *offset, size, FILP_DELAY_FLAG(file), &locktype); if (error) { xfs_iunlock(xip, iolock); return -error; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != xip->i_d.di_size) { *offset = isize = xip->i_d.di_size; goto start; } } /* * On Linux, generic_file_write updates the times even if * no data is copied in so long as the write had a size. * * We must update xfs' times since revalidate will overcopy xfs. */ if (size && !(ioflags & IO_INVIS)) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return(-error); } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (error) { xfs_iunlock(xip, iolock); return -error; } } retry: if (ioflags & IO_ISDIRECT) { xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1); } ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset); if ((ret == -ENOSPC) && DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) return -error; xfs_rwlock(bdp, locktype); *offset = xip->i_d.di_size; goto retry; } if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { struct inode *inode = LINVFS_GET_IP(vp); xip->i_d.di_size = *offset; i_size_write(inode, *offset); xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (ret <= 0) { xfs_rwunlock(bdp, locktype); return ret; } XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. */ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ xfs_inode_log_item_t *iip; xfs_lsn_t lsn; iip = xip->i_itemp; if (iip && iip->ili_last_lsn) { lsn = iip->ili_last_lsn; xfs_log_force(mp, lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0); xfs_iunlock(xip, XFS_ILOCK_EXCL); } } } /* (ioflags & O_SYNC) */ xfs_rwunlock(bdp, locktype); return(ret); }