/**
 * rawfs_block_file_aio_read - read routine for block files
 * @iocb: kernel I/O control block
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @pos: current file position
 *
 * Validates the io vector, clamps the request to the inode size and hands
 * the transfer to the device's read_page_user() hook while holding
 * rawfs_lock.  iocb->ki_pos is advanced by the clamped length.
 *
 * Return: the clamped request length in bytes, 0 when @pos is at or beyond
 * the end of the file, or the error from generic_segment_checks().
 */
ssize_t rawfs_block_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct super_block *sb = filp->f_path.dentry->d_sb;
	struct rawfs_sb_info *rawfs_sb = RAWFS_SB(sb);
	struct inode *inode = filp->f_mapping->host;
	struct rawfs_inode_info *inode_info = RAWFS_I(inode);
	ssize_t retval;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;	/* Always use direct I/O */
	loff_t size;
	int block_no;

	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	retval = iov_length(iov, nr_segs);

	mutex_lock(&rawfs_sb->rawfs_lock);

	/* retval is an ssize_t, so it must be printed with %zd, not %d. */
	RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_block_file_aio_read %s, pos %lld, "
		    "len %zd\n", inode_info->i_name, pos, retval);

	size = i_size_read(inode);

	/* The block number is derived from the inode number. */
	block_no = filp->f_path.dentry->d_inode->i_ino - RAWFS_BLOCK0_INO;

	/* Never read past the end of the file. */
	if ((retval + pos) >= size)
		retval = size - pos;

	if (pos < size) {
		/*
		 * NOTE(review): the result of read_page_user() is ignored, as
		 * in the original; its return convention is not visible here.
		 */
		rawfs_sb->dev.read_page_user(filp->f_path.dentry->d_inode->i_sb,
					     block_no, pos, iov, nr_segs,
					     retval);

		if (retval > 0)
			*ppos = pos + retval;

		if (retval < 0 || *ppos >= size)
			file_accessed(filp);
	} else {
		/*
		 * At or beyond EOF.  Without this the clamp above leaves
		 * "size - pos", a negative number, in retval and the caller
		 * would take it for an error code.
		 */
		retval = 0;
	}

	mutex_unlock(&rawfs_sb->rawfs_lock);
	return retval;
}
/*
 * Write entry point for XFS files.
 *
 * Checks the io vector, brackets the write with sb_start_write() /
 * sb_end_write(), dispatches to the direct or buffered write helper
 * depending on O_DIRECT, and runs generic_write_sync() over whatever was
 * written.
 *
 * Returns the number of bytes written, 0 for an empty request, or a
 * negative errno (-EIO when the filesystem has been shut down).
 */
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*filp = iocb->ki_filp;
	struct inode		*vinode = filp->f_mapping->host;
	struct xfs_inode	*xino = XFS_I(vinode);
	size_t			total = 0;
	ssize_t			written;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	written = generic_segment_checks(iovp, &nr_segs, &total, VERIFY_READ);
	if (written != 0)
		return written;

	/* Nothing to write. */
	if (!total)
		return 0;

	sb_start_write(vinode->i_sb);

	if (XFS_FORCED_SHUTDOWN(xino->i_mount)) {
		written = -EIO;
	} else {
		if (unlikely(filp->f_flags & O_DIRECT))
			written = xfs_file_dio_aio_write(iocb, iovp, nr_segs,
							 pos, total);
		else
			written = xfs_file_buffered_aio_write(iocb, iovp,
							      nr_segs, pos,
							      total);

		if (written > 0) {
			ssize_t	sync_rc;

			XFS_STATS_ADD(xs_write_bytes, written);

			/* A failed sync replaces the byte count. */
			sync_rc = generic_write_sync(filp, pos, written);
			if (sync_rc < 0)
				written = sync_rc;
		}
	}

	sb_end_write(vinode->i_sb);
	return written;
}
/*
 * aio read entry point: check the user io vector, then pass the request
 * and its total byte count to zpl_iter_read_common().
 *
 * Returns whatever zpl_iter_read_common() returns, or the non-zero result
 * of generic_segment_checks().
 */
static ssize_t
zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
    unsigned long nr_segs, loff_t pos)
{
	size_t nbytes;
	ssize_t rc;

	rc = generic_segment_checks(iovp, &nr_segs, &nbytes, VERIFY_WRITE);
	if (rc != 0)
		return (rc);

	return (zpl_iter_read_common(kiocb, iovp, nr_segs, nbytes,
	    UIO_USERSPACE, 0));
}
/*
 * Write entry point for XFS files.
 *
 * Checks the io vector, waits with xfs_wait_for_freeze() at the
 * SB_FREEZE_WRITE level, dispatches to the direct or buffered write helper
 * depending on O_DIRECT, and runs generic_write_sync() over whatever was
 * written.
 *
 * Returns the number of bytes written, 0 for an empty request, or a
 * negative errno (-EIO when the filesystem has been shut down).
 */
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*filp = iocb->ki_filp;
	struct inode		*vinode = filp->f_mapping->host;
	struct xfs_inode	*xino = XFS_I(vinode);
	size_t			total = 0;
	ssize_t			written;
	ssize_t			sync_rc;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	written = generic_segment_checks(iovp, &nr_segs, &total, VERIFY_READ);
	if (written != 0)
		return written;

	/* Nothing to write. */
	if (!total)
		return 0;

	xfs_wait_for_freeze(xino->i_mount, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(xino->i_mount))
		return -EIO;

	if (unlikely(filp->f_flags & O_DIRECT))
		written = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
						 total);
	else
		written = xfs_file_buffered_aio_write(iocb, iovp, nr_segs,
						      pos, total);

	if (written <= 0)
		return written;

	XFS_STATS_ADD(xs_write_bytes, written);

	/* A failed sync replaces the byte count. */
	sync_rc = generic_write_sync(filp, pos, written);
	if (sync_rc < 0)
		written = sync_rc;

	return written;
}
/*
 * Issue an aio write through whichever method the file provides.
 *
 * When the file has a ->write_iter method, the io vector is checked and
 * wrapped in an iov_iter before the call; otherwise the request is passed
 * unchanged to ->aio_write.
 *
 * Returns the method's result, or the non-zero result of
 * generic_segment_checks().
 */
ssize_t do_aio_write(struct kiocb *kiocb, const struct iovec *iov,
		     unsigned long nr_segs, loff_t pos)
{
	struct file *filp = kiocb->ki_filp;
	struct iov_iter it;
	size_t total = 0;
	int err;

	if (!filp->f_op->write_iter)
		return filp->f_op->aio_write(kiocb, iov, nr_segs, pos);

	err = generic_segment_checks(iov, &nr_segs, &total, VERIFY_READ);
	if (err)
		return err;

	iov_iter_init(&it, iov, nr_segs, total, 0);
	return filp->f_op->write_iter(kiocb, &it, pos);
}
/*
 * aio write entry point: check the user io vector, apply the generic
 * write checks (which may adjust the local position and byte count), then
 * pass the request to zpl_iter_write_common().
 *
 * Returns whatever zpl_iter_write_common() returns, or the non-zero result
 * of either check.
 */
static ssize_t
zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
    unsigned long nr_segs, loff_t pos)
{
	struct file *filp = kiocb->ki_filp;
	struct inode *host = filp->f_mapping->host;
	size_t nbytes;
	ssize_t rc;

	rc = generic_segment_checks(iovp, &nr_segs, &nbytes, VERIFY_READ);
	if (rc == 0)
		rc = generic_write_checks(filp, &pos, &nbytes,
		    S_ISBLK(host->i_mode));
	if (rc != 0)
		return (rc);

	return (zpl_iter_write_common(kiocb, iovp, nr_segs, nbytes,
	    UIO_USERSPACE, 0));
}
/*
 * Read entry point for XFS files.
 *
 * Checks the io vector, enforces the sector-mask alignment required for
 * O_DIRECT, limits the request to s_maxbytes, and then performs the read
 * through generic_file_aio_read() under the shared I/O lock.
 *
 * Returns the number of bytes read, 0 when there is nothing to read, or a
 * negative errno.
 */
STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*filp = iocb->ki_filp;
	struct inode		*vinode = filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(vinode);
	struct xfs_mount	*mount = ip->i_mount;
	size_t			len = 0;
	ssize_t			rc = 0;
	int			flags = 0;
	xfs_fsize_t		room;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(filp->f_flags & O_DIRECT))
		flags |= IO_ISDIRECT;
	if (filp->f_mode & FMODE_NOCMTIME)
		flags |= IO_INVIS;

	rc = generic_segment_checks(iovp, &nr_segs, &len, VERIFY_WRITE);
	if (rc < 0)
		return rc;

	if (unlikely(flags & IO_ISDIRECT)) {
		xfs_buftarg_t	*target;

		if (XFS_IS_REALTIME_INODE(ip))
			target = mount->m_rtdev_targp;
		else
			target = mount->m_ddev_targp;

		/*
		 * A misaligned direct read is rejected, except exactly at
		 * EOF where it simply reads nothing.
		 */
		if ((iocb->ki_pos & target->bt_smask) ||
		    (len & target->bt_smask))
			return (iocb->ki_pos == i_size_read(vinode)) ?
					0 : -XFS_ERROR(EINVAL);
	}

	/* Bytes left between the file position and s_maxbytes. */
	room = mount->m_super->s_maxbytes - iocb->ki_pos;
	if (room <= 0 || len == 0)
		return 0;
	if (room < len)
		len = room;

	if (XFS_FORCED_SHUTDOWN(mount))
		return -EIO;

	/*
	 * Taking the I/O lock exclusively for every direct read would
	 * serialise all new concurrent reads behind I/O already in flight,
	 * because in-flight I/O holds the lock shared.  The exclusive lock
	 * is needed only to throw away cached pages, so it is taken only
	 * when there are pages to invalidate; the common direct I/O case
	 * with an empty page cache proceeds concurrently.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((flags & IO_ISDIRECT) && vinode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		/* Re-check: the lock was dropped in between. */
		if (vinode->i_mapping->nrpages) {
			rc = -xfs_flushinval_pages(ip,
					(iocb->ki_pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (rc) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return rc;
			}
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(ip, len, iocb->ki_pos, flags);

	rc = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
	if (rc > 0)
		XFS_STATS_ADD(xs_read_bytes, rc);

	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return rc;
}
/*
 * Write entry point for XFS files, covering both direct and buffered I/O.
 *
 * Error convention inside this function: "error" holds a POSITIVE errno
 * and the function returns -error.  On success error is set to -ret so
 * that the final "return -error" yields the byte count.
 *
 * Locking: buffered writes take i_mutex and the I/O lock exclusively;
 * direct writes start with the I/O lock shared and no i_mutex, upgrading
 * when cached pages must be invalidated or the write starts beyond EOF.
 * need_i_mutex records whether i_mutex is currently held.
 *
 * Returns the number of bytes written, 0 for an empty request, or a
 * negative errno.
 */
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0, error = 0;
	int			ioflags = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	if (count == 0)
		return 0;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			/*
			 * Leave through the common exit rather than a bare
			 * return: on the retry pass below i_mutex is held
			 * (need_i_mutex == 1) and must be released.
			 */
			error = XFS_ERROR(EINVAL);
			goto out_unlock_mutex;
		}

		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */
	if (pos > ip->i_size) {
		error = xfs_zero_eof(ip, pos, ip->i_size);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */
	error = -file_remove_suid(file);
	if (unlikely(error))
		goto out_unlock_internal;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			error = xfs_flushinval_pages(ip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error) {
				/* do not leave the task's bdi hint set */
				current->backing_dev_info = NULL;
				goto out_unlock_internal;
			}
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&nr_segs, pos, &iocb->ki_pos, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(ip, iolock);
			goto relock;
		}
	} else {
		int enospc = 0;
		ssize_t ret2 = 0;

write_retry:
		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
				pos, &iocb->ki_pos, count, ret);
		/*
		 * if we just got an ENOSPC, flush the inode now we
		 * aren't holding any page locks and retry *once*
		 */
		if (ret2 == -ENOSPC && !enospc) {
			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
			if (error) {
				/* do not leave the task's bdi hint set */
				current->backing_dev_info = NULL;
				goto out_unlock_internal;
			}
			enospc = 1;
			goto write_retry;
		}
		ret = ret2;
	}

	current->backing_dev_info = NULL;

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
		iocb->ki_pos = isize;

	if (iocb->ki_pos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (iocb->ki_pos > ip->i_size)
			ip->i_size = iocb->ki_pos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);
		if (error)
			goto out_unlock_internal;
		/*
		 * NOTE(review): only iolock is retaken here, whereas the
		 * failure path right after "start:" also drops
		 * XFS_ILOCK_EXCL - confirm the intended lock state.
		 */
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int sync_error, error2;

		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		/*
		 * Collect sync failures in their own variable.  "error" is
		 * -ret (non-zero) at this point, so testing "!error" before
		 * recording a failure could never succeed and every sync
		 * error was silently dropped.
		 *
		 * filemap_write_and_wait_range() reports 0 or a negative
		 * errno; negate it to match the positive-errno convention.
		 */
		sync_error = -filemap_write_and_wait_range(mapping, pos, end);

		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		error2 = -xfs_file_fsync(file, file->f_path.dentry,
					 (file->f_flags & __O_SYNC) ? 0 : 1);
		if (!sync_error)
			sync_error = error2;

		/* A failed sync replaces the byte count. */
		if (sync_error)
			error = sync_error;
	}

 out_unlock_internal:
	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occurred.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
/*
 * Write entry point for XFS files.
 *
 * Checks the io vector, dispatches to the direct or buffered write helper
 * depending on O_DIRECT (the helper reports through iolock which I/O lock
 * mode it left held), updates the inode size bookkeeping and, for
 * O_DSYNC / sync inodes, flushes the written range and the inode.
 *
 * Returns the number of bytes written, 0 for an empty request, or a
 * negative errno.
 */
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			iolock;
	size_t			ocount = 0;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (ret)
		return ret;

	if (ocount == 0)
		return 0;

	xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (unlikely(file->f_flags & O_DIRECT))
		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
						ocount, &iolock);
	else
		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
						ocount, &iolock);

	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);

	if (ret <= 0)
		goto out_unlock;

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int error, error2;

		xfs_rw_iunlock(ip, iolock);
		error = filemap_write_and_wait_range(mapping, pos, end);
		xfs_rw_ilock(ip, iolock);

		error2 = xfs_file_fsync(file,
					(file->f_flags & __O_SYNC) ? 0 : 1);
		/*
		 * ret uses the negative-errno convention.  The previous code
		 * stored -xfs_file_fsync() in it, so a negative-errno fsync
		 * failure came back positive and was taken for a byte count.
		 * Normalise the sign so a failure is always negative.
		 */
		if (error2 > 0)
			error2 = -error2;

		if (error)
			ret = error;
		else if (error2)
			ret = error2;
	}

out_unlock:
	xfs_aio_write_newsize_update(ip);
	xfs_rw_iunlock(ip, iolock);
	return ret;
}
STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; size_t size = 0; ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; XFS_STATS_INC(xs_read_calls); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); if (ret < 0) return ret; if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; if (n < size) size = n; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; /* * Locking is a bit tricky here. If we take an exclusive lock * for direct IO, we effectively serialise all new concurrent * read IO to this file and block it behind IO that is currently in * progress because IO in progress holds the IO lock shared. We only * need to hold the lock exclusive to blow away the page cache, so * only take lock exclusively if the page cache needs invalidation. * This allows the normal direct IO case of no page cache pages to * proceeed concurrently without serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, pos + size - 1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* * Invalidate whole pages. 
This can return an error if * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, (pos + size - 1) >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; }
/*
 * Write entry point for ocfs2 files, covering direct and buffered I/O.
 *
 * Takes i_mutex, then (for direct I/O) i_alloc_sem for reading, then the
 * rw lock, and prepares the inode for the write.  When the requested
 * direct I/O cannot be done, the locks are dropped and the write is
 * retried as a buffered, synchronous one.
 *
 * rw_level == -1 and have_alloc_sem == 0 mean "nothing left to unlock
 * here", which is also how the function records that the direct I/O end_io
 * path has taken over the unlocking.
 *
 * Returns the number of bytes written if any, otherwise ret (0 or a
 * negative errno).
 */
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs,
				    loff_t pos)
{
	int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
	int can_do_direct, sync = 0;
	ssize_t written = 0;
	size_t ocount;		/* original count */
	size_t count;		/* after file limit checks */
	loff_t *ppos = &iocb->ki_pos;
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", file,
		   (unsigned int)nr_segs,
		   file->f_path.dentry->d_name.len,
		   file->f_path.dentry->d_name.name);

	if (iocb->ki_left == 0)
		return 0;

	ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (ret)
		return ret;

	count = ocount;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	appending = file->f_flags & O_APPEND ? 1 : 0;
	direct_io = file->f_flags & O_DIRECT ? 1 : 0;

	mutex_lock(&inode->i_mutex);

relock:
	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
	if (direct_io) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;
	}

	/* concurrent O_DIRECT writes are allowed */
	rw_level = !direct_io;
	ret = ocfs2_rw_lock(inode, rw_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_sems;
	}

	can_do_direct = direct_io;
	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
					    iocb->ki_left, appending,
					    &can_do_direct);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We can't complete the direct I/O as requested, fall back to
	 * buffered I/O.
	 */
	if (direct_io && !can_do_direct) {
		ocfs2_rw_unlock(inode, rw_level);
		up_read(&inode->i_alloc_sem);

		have_alloc_sem = 0;
		rw_level = -1;

		direct_io = 0;
		sync = 1;
		goto relock;
	}

	if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
		sync = 1;

	/*
	 * XXX: Is it ok to execute these checks a second time?
	 */
	ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
	if (ret)
		goto out;

	/*
	 * Set pos so that sync_page_range_nolock() below understands
	 * where to start from. We might've moved it around via the
	 * calls above. The range we want to actually sync starts from
	 * *ppos here.
	 */
	pos = *ppos;

	/* communicate with ocfs2_dio_end_io */
	ocfs2_iocb_set_rw_locked(iocb, rw_level);

	if (direct_io) {
		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
						    ppos, count, ocount);
		if (written < 0) {
			ret = written;
			goto out_dio;
		}
	} else {
		written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
						    count, written);
		if (written < 0) {
			ret = written;
			/*
			 * -EFAULT and -ENOSPC are expected and not logged.
			 * This needs "&&": with "||" the test is true for
			 * every value and both errors were logged anyway.
			 */
			if (ret != -EFAULT && ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

out_dio:
	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

	/*
	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	 * function pointer which is called when o_direct io completes so that
	 * it can unlock our rw lock.  (it's the clustered equivalent of
	 * i_alloc_sem; protects truncate from racing with pending ios).
	 * Unfortunately there are error cases which call end_io and others
	 * that don't.  so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);

out_sems:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);

	if (written > 0 && sync) {
		ssize_t err;

		/* A failed sync replaces the byte count. */
		err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
		if (err < 0)
			written = err;
	}

	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return written ? written : ret;
}
ssize_t /* bytes written, or (-) error */ xfs_write( struct xfs_inode *xip, struct kiocb *iocb, const struct iovec *iovp, unsigned int nsegs, loff_t *offset, int ioflags) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long segs = nsegs; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; size_t ocount = 0, count; loff_t pos; int need_i_mutex; XFS_STATS_INC(xs_write_calls); error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); if (error) return error; count = ocount; pos = *offset; if (count == 0) return 0; mp = xip->i_mount; xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && pos != xip->i_size) goto start; } if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(xip) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); goto start; } } new_size = pos + count; if (new_size > xip->i_size) xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > xip->i_size) { error = xfs_zero_eof(xip, pos, xip->i_size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. 
*/ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -file_remove_suid(file); if (unlikely(error)) { goto out_unlock_internal; } } /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); xfs_inval_cached_trace(xip, pos, -1, (pos & PAGE_CACHE_MASK), -1); error = xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (error) goto out_unlock_internal; } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. 
*/ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; } } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); } current->backing_dev_info = NULL; if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; if (*offset > xip->i_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_size) xip->i_size = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (ret == -ENOSPC && DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); if (error) goto out_unlock_internal; goto start; } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { int error2; xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error2 = sync_page_range(inode, mapping, pos, ret); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); error2 = xfs_write_sync_logforce(mp, xip); if (!error) error = error2; } out_unlock_internal: if (xip->i_new_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); xip->i_new_size = 0; /* * If this was a direct or synchronous I/O that failed (such * as ENOSPC) then part of the I/O may have been written to * disk before the error occured. 
In this case the on-disk * file size may have been adjusted beyond the in-memory file * size and now needs to be truncated back. */ if (xip->i_d.di_size > xip->i_size) xip->i_d.di_size = xip->i_size; xfs_iunlock(xip, XFS_ILOCK_EXCL); } xfs_iunlock(xip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); return -error; }
//----------------------------------------------------------------------------- // Regular File Operation //----------------------------------------------------------------------------- ssize_t rawfs_reg_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; struct super_block *sb = filp->f_path.dentry->d_sb; struct rawfs_sb_info *rawfs_sb = RAWFS_SB(sb); // struct address_space *mapping=filp->f_mapping; struct inode *inode = filp->f_mapping->host; struct rawfs_inode_info *inode_info = RAWFS_I(inode); ssize_t retval; size_t count; loff_t *ppos = &iocb->ki_pos; loff_t size; unsigned int curr_file_pos = pos; unsigned int curr_buf_pos = 0; int remain_buf_size; const struct iovec *iv = &iov[0]; // TODO: Process all io vectors retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read %s, segment check " "result %d\n", inode_info->i_name, retval); if (retval) return retval; mutex_lock(&rawfs_sb->rawfs_lock); retval=iov_length(iov, nr_segs); size = i_size_read(inode); RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read %s, pos %lld, len %d, " "filesize: %lld\n", inode_info->i_name, pos, retval, size); if ((retval + pos) >= size) retval = size - pos; if (pos < size) { /* Read File */ { int preceding_pages, rear_pages; struct rawfs_page *page_buf = NULL; int i; // Prepare page buffer page_buf = kzalloc(rawfs_sb->page_size, GFP_NOFS); if (page_buf == NULL) { retval = 0; goto out; } preceding_pages = FLOOR((unsigned)pos, rawfs_sb->page_data_size); rear_pages = CEILING((unsigned)pos + retval, rawfs_sb->page_data_size); remain_buf_size = retval; RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read %s, " "preceding_pages %d, rear_pages %d, remain_buf_size %d\n", inode_info->i_name, preceding_pages, rear_pages, remain_buf_size); // Step 1: Copy preceding pages, if starting pos is not 0. 
for (i=preceding_pages;i<rear_pages;i++) { __u32 crc; // Read page rawfs_sb->dev.read_page(sb, inode_info->i_location_block, inode_info->i_location_page+i, page_buf); // TODO: skip this page, if unrecoverable error occurs /* CRC error should not happen, since we have already check them at bootup */ crc = rawfs_page_crc_data(sb, page_buf); if (crc != page_buf->i_crc) RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read: " "%s @ %X, crc fail %X, expected %X\n", page_buf->i_info.i_file_info.i_name, page_buf->i_info.i_file_info.i_parent_folder_id, crc, page_buf->i_crc); else RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read: " "%s @ %X, crc %X\n", page_buf->i_info.i_file_info.i_name, page_buf->i_info.i_file_info.i_parent_folder_id, page_buf->i_crc); /* Copy requried parts */ { int start_in_buf; int copy_len; start_in_buf = (curr_file_pos % rawfs_sb->page_data_size); copy_len = ((start_in_buf + remain_buf_size) > rawfs_sb->page_data_size) ? (rawfs_sb->page_data_size - start_in_buf) : remain_buf_size; if (copy_to_user((char*)iv->iov_base + curr_buf_pos, &page_buf->i_data[0] + start_in_buf, copy_len)) { retval = -EFAULT; goto out2; } RAWFS_PRINT(RAWFS_DBG_FILE, "rawfs_reg_file_aio_read %s, " "%d, curr_buf_pos %d, remain_buf_size %d " "start_in_buf %d copy_len %d starting pattern %X\n", inode_info->i_name, i, curr_buf_pos, remain_buf_size, start_in_buf, copy_len, *(unsigned int*)(&page_buf->i_data[0] + start_in_buf)); curr_buf_pos += copy_len; remain_buf_size -= copy_len; } } out2: if (page_buf) kfree(page_buf); } if (retval > 0) *ppos = pos + retval; if (retval < 0 || *ppos >= size) { file_accessed(filp); goto out; } } else retval = 0; out: // Release Lock mutex_unlock(&rawfs_sb->rawfs_lock); return retval; }