/* * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * * If there are cached pages or we're extending the file, we need IOLOCK_EXCL * until we're sure the bytes at the new EOF have been zeroed and/or the cached * pages are flushed out. * * In most cases the direct IO writes will be done holding IOLOCK_SHARED * allowing them to be done in parallel with reads and other direct IO writes. * However, if the IO is not aligned to filesystem blocks, the direct IO layer * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); loff_t end; struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if (!IS_DAX(inode) && ((iocb->ki_pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) unaligned_io = 1; /* * We don't need to take an exclusive lock unless there page cache needs * to be invalidated or unaligned IO is being executed. We don't need to * consider the EOF extension case here because * xfs_file_aio_write_checks() will relock the inode as necessary for * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) iolock = XFS_IOLOCK_EXCL; else iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, iolock); iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); end = iocb->ki_pos + count - 1; /* * See xfs_file_read_iter() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) goto out; /* * Invalidate whole pages. This can return an error if we fail * to invalidate a page, but this should never happen on XFS. * Warn if it does fail. */ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } /* * If we are doing unaligned IO, wait for all other IO to drain, * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); data = *from; ret = mapping->a_ops->direct_IO(iocb, &data); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); }
static int xfs_iget_cache_miss( struct xfs_mount *mp, struct xfs_perag *pag, xfs_trans_t *tp, xfs_ino_t ino, struct xfs_inode **ipp, int flags, int lock_flags) { struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) return -ENOMEM; error = xfs_iread(mp, tp, ip, flags); if (error) goto out_destroy; trace_xfs_iget_miss(ip); if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = -ENOENT; goto out_destroy; } /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload * region. Since we can be called from transaction context, don't * recurse into the file system. */ if (radix_tree_preload(GFP_NOFS)) { error = -EAGAIN; goto out_destroy; } /* * Because the inode hasn't been added to the radix-tree yet it can't * be found by another thread, so we can do the non-sleeping lock here. */ if (lock_flags) { if (!xfs_ilock_nowait(ip, lock_flags)) BUG(); } /* * These values must be set before inserting the inode into the radix * tree as the moment it is inserted a concurrent lookup (allowed by the * RCU locking mechanism) can find it and that lookup must see that this * is an inode currently under construction (i.e. that XFS_INEW is set). * The ip->i_flags_lock that protects the XFS_INEW flag forms the * memory barrier that ensures this detection works correctly at lookup * time. */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) iflags |= XFS_IDONTCACHE; ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; xfs_iflags_set(ip, iflags); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); XFS_STATS_INC(mp, xs_ig_dup); error = -EAGAIN; goto out_preload_end; } spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: __destroy_inode(VFS_I(ip)); xfs_inode_free(ip); return error; }
STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; size_t size = 0; ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; XFS_STATS_INC(xs_read_calls); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); if (ret < 0) return ret; if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; if (n < size) size = n; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; /* * Locking is a bit tricky here. If we take an exclusive lock * for direct IO, we effectively serialise all new concurrent * read IO to this file and block it behind IO that is currently in * progress because IO in progress holds the IO lock shared. We only * need to hold the lock exclusive to blow away the page cache, so * only take lock exclusively if the page cache needs invalidation. * This allows the normal direct IO case of no page cache pages to * proceeed concurrently without serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, pos + size - 1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* * Invalidate whole pages. This can return an error if * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, (pos + size - 1) >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; }
int xfs_setattr_nonsize( struct xfs_inode *ip, struct iattr *iattr, int flags) { xfs_mount_t *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_trans_t *tp; int error; uid_t uid = 0, iuid = 0; gid_t gid = 0, igid = 0; struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT((mask & ATTR_SIZE) == 0); /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids * in inode here, because we can't hold it across the trans_reserve. * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = ip->i_d.di_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = ip->i_d.di_gid; } /* * We take a reference when we initialize udqp and gdqp, * so it is important that we never blindly double trip on * the same variable. See xfs_create() for an example. */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), qflags, &udqp, &gdqp); if (error) return error; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); if (error) goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); /* * Change file ownership. Must be the owner or privileged. */ if (mask & (ATTR_UID|ATTR_GID)) { /* * These IDs could have changed since we last looked at them. * But, we're assured that if the ownership did change * while we didn't have the inode locked, inode's dquot(s) * would have changed also. */ iuid = ip->i_d.di_uid; igid = ip->i_d.di_gid; gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; /* * Do a quota reservation only if uid/gid is actually * going to change. */ if (XFS_IS_QUOTA_RUNNING(mp) && ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { ASSERT(tp); error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ goto out_trans_cancel; } } xfs_trans_ijoin(tp, ip); /* * Change file ownership. Must be the owner or privileged. */ if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* * Change the ownerships and register quota modifications * in the transaction. */ if (iuid != uid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } ip->i_d.di_uid = uid; inode->i_uid = uid; } if (igid != gid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } ip->i_d.di_gid = gid; inode->i_gid = gid; } } /* * Change file access modes. */ if (mask & ATTR_MODE) { umode_t mode = iattr->ia_mode; if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; ip->i_d.di_mode &= S_IFMT; ip->i_d.di_mode |= mode & ~S_IFMT; inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; } /* * Change file access or modified times. */ if (mask & ATTR_ATIME) { inode->i_atime = iattr->ia_atime; ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; ip->i_update_core = 1; } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot1); xfs_qm_dqrele(olddquot2); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); if (error) return XFS_ERROR(error); /* * XXX(hch): Updating the ACL entries is not atomic vs the i_mode * update. We could avoid this with linked transactions * and passing down the transaction pointer all the way * to attr_set. No previous user of the generic * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { error = -xfs_acl_chmod(inode); if (error) return XFS_ERROR(error); } return 0; out_trans_cancel: xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; }
/* * Allocate an inode on disk and return a copy of its in-core version. * Set mode, nlink, and rdev appropriately within the inode. * The uid and gid for the inode are set according to the contents of * the given cred structure. * * This was once shared with the kernel, but has diverged to the point * where it's no longer worth the hassle of maintaining common code. */ int libxfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, mode_t mode, nlink_t nlink, xfs_dev_t rdev, struct cred *cr, struct fsxattr *fsx, int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { xfs_ino_t ino; xfs_inode_t *ip; uint flags; int error; /* * Call the space management code to pick * the on-disk inode to be allocated. */ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, ialloc_context, &ino); if (error != 0) return error; if (*ialloc_context || ino == NULLFSINO) { *ipp = NULL; return 0; } ASSERT(*ialloc_context == NULL); error = xfs_trans_iget(tp->t_mountp, tp, ino, 0, 0, &ip); if (error != 0) return error; ASSERT(ip != NULL); VFS_I(ip)->i_mode = mode; set_nlink(VFS_I(ip), nlink); ip->i_d.di_uid = cr->cr_uid; ip->i_d.di_gid = cr->cr_gid; xfs_set_projid(&ip->i_d, pip ? 0 : fsx->fsx_projid); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD); /* * We only support filesystems that understand v2 format inodes. So if * this is currently an old format inode, then change the inode version * number now. This way we only do the conversion here rather than here * and in the flush/logging code. */ if (ip->i_d.di_version == 1) { ip->i_d.di_version = 2; /* * old link count, projid_lo/hi field, pad field * already zeroed */ } if (pip && (VFS_I(pip)->i_mode & S_ISGID)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) VFS_I(ip)->i_mode |= S_ISGID; } ip->i_d.di_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); ip->i_d.di_extsize = pip ? 0 : fsx->fsx_extsize; ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = pip ? 0 : fsx->fsx_xflags; if (ip->i_d.di_version == 3) { ASSERT(ip->i_d.di_ino == ino); ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); VFS_I(ip)->i_version = 1; ip->i_d.di_flags2 = 0; ip->i_d.di_crtime.t_sec = (__int32_t)VFS_I(ip)->i_mtime.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)VFS_I(ip)->i_mtime.tv_nsec; } flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: /* doesn't make sense to set an rdev for these */ rdev = 0; /* FALLTHROUGH */ case S_IFCHR: case S_IFBLK: ip->i_d.di_format = XFS_DINODE_FMT_DEV; ip->i_df.if_u2.if_rdev = rdev; flags |= XFS_ILOG_DEV; break; case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; if ((mode & S_IFMT) == S_IFDIR) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } } else { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { di_flags |= XFS_DIFLAG_REALTIME; } if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; ip->i_d.di_extsize = pip->i_d.di_extsize; } } if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; ip->i_d.di_flags |= di_flags; } /* FALLTHROUGH */ case S_IFLNK: ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; ip->i_df.if_u1.if_extents = NULL; break; default: ASSERT(0); } /* Attribute fork settings for new inode. */ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_anextents = 0; /* * set up the inode ops structure that the libxfs code relies on */ if (XFS_ISDIR(ip)) ip->d_ops = ip->i_mount->m_dir_inode_ops; else ip->d_ops = ip->i_mount->m_nondir_inode_ops; /* * Log the new values stuffed into the inode. */ xfs_trans_log_inode(tp, ip, flags); *ipp = ip; return 0; }
/* Clear the inode reflink flag if there are no shared extents. */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, struct xfs_trans **tpp) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t fbno; xfs_filblks_t end; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t rbno; xfs_extlen_t rlen; struct xfs_bmbt_irec map; int nmaps; int error = 0; ASSERT(xfs_is_reflink_inode(ip)); fbno = 0; end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); while (end - fbno > 0) { nmaps = 1; /* * Look for extents in the file. Skip holes, delalloc, or * unwritten extents; they can't be reflinked. */ error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); if (error) return error; if (nmaps == 0) break; if (!xfs_bmap_is_real_extent(&map)) goto next; agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); aglen = map.br_blockcount; error = xfs_reflink_find_shared(mp, agno, agbno, aglen, &rbno, &rlen, false); if (error) return error; /* Is there still a shared block here? */ if (rbno != NULLAGBLOCK) return 0; next: fbno = map.br_startoff + map.br_blockcount; } /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; /* Clear the inode flag. */ trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; }
/* * Sync all the inodes in the given AG according to the * direction given by the flags. */ STATIC int xfs_sync_inodes_ag( xfs_mount_t *mp, int ag, int flags) { xfs_perag_t *pag = &mp->m_perag[ag]; int nr_found; uint32_t first_index = 0; int error = 0; int last_error = 0; int fflag = XFS_B_ASYNC; if (flags & SYNC_DELWRI) fflag = XFS_B_DELWRI; if (flags & SYNC_WAIT) fflag = 0; /* synchronous overrides all */ do { struct inode *inode; xfs_inode_t *ip = NULL; int lock_flags = XFS_ILOCK_SHARED; /* * use a gang lookup to find the next inode in the tree * as the tree is sparse and a gang lookup walks to find * the number of objects requested. */ read_lock(&pag->pag_ici_lock); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&ip, first_index, 1); if (!nr_found) { read_unlock(&pag->pag_ici_lock); break; } /* * Update the index for the next lookup. Catch overflows * into the next AG range which can occur if we have inodes * in the last block of the AG and we are currently * pointing to the last inode. */ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) { read_unlock(&pag->pag_ici_lock); break; } /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(mp)) { read_unlock(&pag->pag_ici_lock); return 0; } /* * If we can't get a reference on the inode, it must be * in reclaim. Leave it for the reclaim code to flush. */ inode = VFS_I(ip); if (!igrab(inode)) { read_unlock(&pag->pag_ici_lock); continue; } read_unlock(&pag->pag_ici_lock); /* avoid new or bad inodes */ if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { IRELE(ip); continue; } /* * If we have to flush data or wait for I/O completion * we need to hold the iolock. */ if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { xfs_ilock(ip, XFS_IOLOCK_SHARED); lock_flags |= XFS_IOLOCK_SHARED; error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); if (flags & SYNC_IOWAIT) xfs_ioend_wait(ip); } xfs_ilock(ip, XFS_ILOCK_SHARED); if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) { if (flags & SYNC_WAIT) { xfs_iflock(ip); if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, XFS_IFLUSH_SYNC); else xfs_ifunlock(ip); } else if (xfs_iflock_nowait(ip)) { if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, XFS_IFLUSH_DELWRI); else xfs_ifunlock(ip); } } xfs_iput(ip, lock_flags); if (error) last_error = error; /* * bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) return XFS_ERROR(error); } while (nr_found); return last_error; }
STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { struct inode *inode; struct xfs_inode *ip = NULL; xfs_acl_t *default_acl = NULL; struct xfs_name name; int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS; int error; /* * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; if (test_default_acl && test_default_acl(dir)) { if (!_ACL_ALLOC(default_acl)) { return -ENOMEM; } if (!_ACL_GET_DEFAULT(dir, default_acl)) { _ACL_FREE(default_acl); default_acl = NULL; } } xfs_dentry_to_name(&name, dentry); if (IS_POSIXACL(dir) && !default_acl) mode &= ~current->fs->umask; switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: rdev = sysv_encode_dev(rdev); case S_IFREG: error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); break; case S_IFDIR: error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); break; default: error = EINVAL; break; } if (unlikely(error)) goto out_free_acl; inode = VFS_I(ip); error = xfs_init_security(inode, dir); if (unlikely(error)) goto out_cleanup_inode; if (default_acl) { error = _ACL_INHERIT(inode, mode, default_acl); if (unlikely(error)) goto out_cleanup_inode; _ACL_FREE(default_acl); } d_instantiate(dentry, inode); return -error; out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); out_free_acl: if (default_acl) _ACL_FREE(default_acl); return -error; }
STATIC int xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; struct posix_acl *default_acl, *acl; struct xfs_name name; int error; /* * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ if (S_ISCHR(mode) || S_ISBLK(mode)) { if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; rdev = sysv_encode_dev(rdev); } else { rdev = 0; } error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) return error; if (!tmpfile) { xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); } if (unlikely(error)) goto out_free_acl; inode = VFS_I(ip); error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } #endif if (tmpfile) d_tmpfile(dentry, inode); else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); out_free_acl: if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); return error; out_cleanup_inode: xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); goto out_free_acl; }
static int xfs_iget_cache_miss( struct xfs_mount *mp, struct xfs_perag *pag, xfs_trans_t *tp, xfs_ino_t ino, struct xfs_inode **ipp, int flags, int lock_flags) { struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); ip = xfs_inode_alloc(mp, ino); if (!ip) return ENOMEM; error = xfs_iread(mp, tp, ip, flags); if (error) goto out_destroy; trace_xfs_iget_miss(ip); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = ENOENT; goto out_destroy; } /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload * region. */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; goto out_destroy; } /* * Because the inode hasn't been added to the radix-tree yet it can't * be found by another thread, so we can do the non-sleeping lock here. */ if (lock_flags) { if (!xfs_ilock_nowait(ip, lock_flags)) BUG(); } spin_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); XFS_STATS_INC(xs_ig_dup); error = EAGAIN; goto out_preload_end; } /* These values _must_ be set before releasing the radix tree lock! */ ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: __destroy_inode(VFS_I(ip)); xfs_inode_free(ip); return error; }
/* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ int xfs_bui_recover( struct xfs_mount *mp, struct xfs_bui_log_item *buip) { int error = 0; unsigned int bui_type; struct xfs_map_extent *bmap; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; bool op_ok; struct xfs_bud_log_item *budp; enum xfs_bmap_intent_type type; int whichfork; xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); /* Only one mapping operation per BUI... */ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EIO; } /* * First check the validity of the extent described by the * BUI. If anything is bad, then toss the BUI. */ bmap = &buip->bui_format.bui_extents[0]; startblock_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: op_ok = true; break; default: op_ok = false; break; } if (!op_ok || startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || bmap->me_len >= mp->m_sb.sb_agblocks || inode_fsb >= mp->m_sb.sb_dblocks || (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { /* * This will pull the BUI from the AIL and * free the memory associated with it. */ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EIO; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); /* Grab the inode. */ error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); if (error) goto err_inode; if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); xfs_defer_init(&dfops, &firstfsb); /* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: type = bui_type; break; default: error = -EFSCORRUPTED; goto err_dfops; } xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) goto err_dfops; if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); if (error) goto err_dfops; } /* Finish transaction, free inodes. */ error = xfs_defer_finish(&tp, &dfops); if (error) goto err_dfops; set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); return error; err_dfops: xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); } return error; }
/* * Convert userspace handle data into inode. * * We use the fact that all the fsop_handlereq ioctl calls have a data * structure argument whose first component is always a xfs_fsop_handlereq_t, * so we can pass that sub structure into this handy, shared routine. * * If no error, caller must always iput the returned inode. */ STATIC int xfs_vget_fsop_handlereq( xfs_mount_t *mp, struct inode *parinode, /* parent inode pointer */ xfs_fsop_handlereq_t *hreq, struct inode **inode) { void __user *hanp; size_t hlen; xfs_fid_t *xfid; xfs_handle_t *handlep; xfs_handle_t handle; xfs_inode_t *ip; xfs_ino_t ino; __u32 igen; int error; /* * Only allow handle opens under a directory. */ if (!S_ISDIR(parinode->i_mode)) return XFS_ERROR(ENOTDIR); hanp = hreq->ihandle; hlen = hreq->ihandlen; handlep = &handle; if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) return XFS_ERROR(EINVAL); if (copy_from_user(handlep, hanp, hlen)) return XFS_ERROR(EFAULT); if (hlen < sizeof(*handlep)) memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); if (hlen > sizeof(handlep->ha_fsid)) { if (handlep->ha_fid.fid_len != (hlen - sizeof(handlep->ha_fsid) - sizeof(handlep->ha_fid.fid_len)) || handlep->ha_fid.fid_pad) return XFS_ERROR(EINVAL); } /* * Crack the handle, obtain the inode # & generation # */ xfid = (struct xfs_fid *)&handlep->ha_fid; if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { ino = xfid->fid_ino; igen = xfid->fid_gen; } else { return XFS_ERROR(EINVAL); } /* * Get the XFS inode, building a Linux inode to go with it. */ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); if (error) return error; if (ip == NULL) return XFS_ERROR(EIO); if (ip->i_d.di_gen != igen) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return XFS_ERROR(ENOENT); } xfs_iunlock(ip, XFS_ILOCK_SHARED); *inode = VFS_I(ip); return 0; }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; /* * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; /* * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach(ip, 0); if (error) return error; /* * Now we can make the changes. Before we join the inode to the * transaction, take care of the part of the truncation that must be * done without the inode lock. This needs to be done before joining * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ error = xfs_zero_eof(ip, newsize, oldsize); if (error) return error; } /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } /* * Wait for all direct I/O to complete. */ inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); /* A truncate down always removes post-EOF blocks. */ xfs_inode_clear_eofblocks_tag(ip); } if (iattr->ia_valid & ATTR_MODE) xfs_setattr_mode(ip, iattr); if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
ASSERT(completion_done(&ip->i_flush)); kmem_zone_free(xfs_inode_zone, ip); } /* * Check the validity of the inode we just found it the cache */ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, int flags, int lock_flags) __releases(pag->pag_ici_lock) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; spin_lock(&ip->i_flags_lock); /* * If we are racing with another cache hit that is currently * instantiating this inode or currently recycling it out of * reclaimabe state, wait for the initialisation to complete * before continuing. * * XXX(hch): eventually we should do something equivalent to * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */
/* * Unmap a range of blocks from a file, then map other blocks into the hole. * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). * The extent irec is mapped into dest at irec->br_startoff. */ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, xfs_fileoff_t destoff, xfs_off_t new_isize) { struct xfs_mount *mp = ip->i_mount; bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; xfs_fsblock_t firstfsb; unsigned int resblks; struct xfs_defer_ops dfops; struct xfs_bmbt_irec uirec; xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; int error; unmap_len = irec->br_startoff + irec->br_blockcount - destoff; trace_xfs_reflink_punch_range(ip, destoff, unmap_len); /* No reflinking if we're low on space */ if (real_extent) { error = xfs_reflink_ag_has_free_space(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); if (error) goto out; } /* Start a rolling transaction to switch the mappings */ resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* If we're not just clearing space, then do we have enough quota? */ if (real_extent) { error = xfs_trans_reserve_quota_nblks(tp, ip, irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_cancel; } trace_xfs_reflink_remap(ip, irec->br_startoff, irec->br_blockcount, irec->br_startblock); /* Unmap the old blocks in the data fork. */ rlen = unmap_len; while (rlen) { xfs_defer_init(&dfops, &firstfsb); error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, &firstfsb, &dfops); if (error) goto out_defer; /* * Trim the extent to whatever got unmapped. * Remember, bunmapi works backwards. */ uirec.br_startblock = irec->br_startblock + rlen; uirec.br_startoff = irec->br_startoff + rlen; uirec.br_blockcount = unmap_len - rlen; unmap_len = rlen; /* If this isn't a real mapping, we're done. */ if (!real_extent || uirec.br_blockcount == 0) goto next_extent; trace_xfs_reflink_remap(ip, uirec.br_startoff, uirec.br_blockcount, uirec.br_startblock); /* Update the refcount tree */ error = xfs_refcount_increase_extent(mp, &dfops, &uirec); if (error) goto out_defer; /* Map the new blocks into the data fork. */ error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); if (error) goto out_defer; /* Update quota accounting. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, uirec.br_blockcount); /* Update dest isize if needed. */ newlen = XFS_FSB_TO_B(mp, uirec.br_startoff + uirec.br_blockcount); newlen = min_t(xfs_off_t, newlen, new_isize); if (newlen > i_size_read(VFS_I(ip))) { trace_xfs_reflink_update_inode_size(ip, newlen); i_size_write(VFS_I(ip), newlen); ip->i_d.di_size = newlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } next_extent: /* Process all the deferred stuff. */ error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) goto out; return 0; out_defer: xfs_defer_cancel(&dfops); out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return -EROFS; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; error = inode_change_ok(inode, iattr); if (error) return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; /* * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; /* * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach(ip, 0); if (error) return error; /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * * Start with zeroing any data block beyond EOF that we may expose on * file extension. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); if (error) return error; } /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ if (newsize > ip->i_d.di_size && (oldsize != ip->i_d.di_size || did_zeroing)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } /* Now wait for all direct I/O to complete. */ inode_dio_wait(inode); /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we * drop the XFS_MMAP_EXCL lock after the extent manipulations are * complete. The truncate_setsize() call also cleans partial EOF page * PTEs on extending truncates and hence ensures sub-page block size * filesystems are correctly handled, too. * * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. */ if (IS_DAX(inode)) error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); else error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); /* A truncate down always removes post-EOF blocks. */ xfs_inode_clear_eofblocks_tag(ip); } if (iattr->ia_valid & ATTR_MODE) xfs_setattr_mode(ip, iattr); if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; }
/* * The user wants to preemptively CoW all shared blocks in this file, * which enables us to turn off the reflink flag. Iterate all * extents which are not prealloc/delalloc to see which ranges are * mentioned in the refcount tree, then read those blocks into the * pagecache, dirty them, fsync them back out, and then we can update * the inode flag. What happens if we run out of memory? :) */ STATIC int xfs_reflink_dirty_extents( struct xfs_inode *ip, xfs_fileoff_t fbno, xfs_filblks_t end, xfs_off_t isize) { struct xfs_mount *mp = ip->i_mount; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t rbno; xfs_extlen_t rlen; xfs_off_t fpos; xfs_off_t flen; struct xfs_bmbt_irec map[2]; int nmaps; int error = 0; while (end - fbno > 0) { nmaps = 1; /* * Look for extents in the file. Skip holes, delalloc, or * unwritten extents; they can't be reflinked. */ error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); if (error) goto out; if (nmaps == 0) break; if (!xfs_bmap_is_real_extent(&map[0])) goto next; map[1] = map[0]; while (map[1].br_blockcount) { agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); aglen = map[1].br_blockcount; error = xfs_reflink_find_shared(mp, agno, agbno, aglen, &rbno, &rlen, true); if (error) goto out; if (rbno == NULLAGBLOCK) break; /* Dirty the pages */ xfs_iunlock(ip, XFS_ILOCK_EXCL); fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + (rbno - agbno)); flen = XFS_FSB_TO_B(mp, rlen); if (fpos + flen > isize) flen = isize - fpos; error = iomap_file_dirty(VFS_I(ip), fpos, flen, &xfs_iomap_ops); xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) goto out; map[1].br_blockcount -= (rbno - agbno + rlen); map[1].br_startoff += (rbno - agbno + rlen); map[1].br_startblock += (rbno - agbno + rlen); } next: fbno = map[0].br_startoff + map[0].br_blockcount; } out: return error; }
int xfs_setattr_nonsize( struct xfs_inode *ip, struct iattr *iattr, int flags) { xfs_mount_t *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_trans_t *tp; int error; uid_t uid = 0, iuid = 0; gid_t gid = 0, igid = 0; struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT((mask & ATTR_SIZE) == 0); if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = ip->i_d.di_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = ip->i_d.di_gid; } ASSERT(udqp == NULL); ASSERT(gdqp == NULL); error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), qflags, &udqp, &gdqp); if (error) return error; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); if (error) goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); if (mask & (ATTR_UID|ATTR_GID)) { iuid = ip->i_d.di_uid; igid = ip->i_d.di_gid; gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; if (XFS_IS_QUOTA_RUNNING(mp) && ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { ASSERT(tp); error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) goto out_trans_cancel; } } xfs_trans_ijoin(tp, ip, 0); if (mask & (ATTR_UID|ATTR_GID)) { if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); if (iuid != uid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } ip->i_d.di_uid = uid; inode->i_uid = uid; } if (igid != gid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } ip->i_d.di_gid = gid; inode->i_gid = gid; } } if (mask & ATTR_MODE) { umode_t mode = iattr->ia_mode; if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; ip->i_d.di_mode &= S_IFMT; ip->i_d.di_mode |= mode & ~S_IFMT; inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; } if (mask & ATTR_ATIME) { inode->i_atime = iattr->ia_atime; ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_qm_dqrele(olddquot1); xfs_qm_dqrele(olddquot2); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); if (error) return XFS_ERROR(error); if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { error = -xfs_acl_chmod(inode); if (error) return XFS_ERROR(error); } return 0; out_trans_cancel: xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; }
STATIC int xfs_ioctl_setattr( xfs_inode_t *ip, struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; int join_flags = 0; trace_xfs_ioctl_setattr(ip); code = xfs_ioctl_setattr_check_projid(ip, fa); if (code) return code; /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids * in inode here, because we can't hold it across the trans_reserve. * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, ip->i_d.di_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } /* * Changing DAX config may require inode locking for mapping * invalidation. These need to be held all the way to transaction commit * or cancel time, so need to be passed through to * xfs_ioctl_setattr_get_trans() so it can apply them to the join call * appropriately. */ code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags); if (code) goto error_free_dquots; tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; } if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_trans_cancel; } code = xfs_ioctl_setattr_check_extsize(ip, fa); if (code) goto error_trans_cancel; code = xfs_ioctl_setattr_check_cowextsize(ip, fa); if (code) goto error_trans_cancel; code = xfs_ioctl_setattr_xflags(tp, ip, fa); if (code) goto error_trans_cancel; /* * Change file ownership. Must be the owner or privileged. CAP_FSETID * overrides the following restrictions: * * The set-user-ID and set-group-ID bits of a file will be cleared upon * successful return from chown() */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ if (xfs_get_projid(ip) != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } ASSERT(ip->i_d.di_version > 1); xfs_set_projid(ip, fa->fsx_projid); } /* * Only set the extent size hint if we've already determined that the * extent size hint should be set on the inode. If no extent size flags * are set on the inode then unconditionally clear the extent size hint. */ if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else
int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr, int flags) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); lock_flags = XFS_ILOCK_EXCL; if (!(flags & XFS_ATTR_NOLOCK)) lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); oldsize = inode->i_size; newsize = iattr->ia_size; if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } error = xfs_qm_dqattach_locked(ip, 0); if (error) goto out_unlock; if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; } inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); if (error) goto out_trans_cancel; truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } /* * The first thing we do is set the size to new_size permanently on * disk. This way we don't have to worry about anyone ever being able * to look at the data being freed even in the face of a crash. * What we're getting around here is the case where we free a block, it * is allocated to another file, it is written to, and then we crash. * If the new data gets written to the file but the log buffers * containing the free and reallocation don't, then we'd end up with * garbage in the blocks being freed. As long as we make the new size * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ ip->i_d.di_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; xfs_iflags_set(ip, XFS_ITRUNCATED); } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { struct inode *inode; struct xfs_inode *ip = NULL; struct posix_acl *default_acl = NULL; struct xfs_name name; int error; /* * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ if (S_ISCHR(mode) || S_ISBLK(mode)) { if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; rdev = sysv_encode_dev(rdev); } else { rdev = 0; } if (IS_POSIXACL(dir)) { default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(default_acl)) return PTR_ERR(default_acl); if (!default_acl) mode &= ~current_umask(); } xfs_dentry_to_name(&name, dentry); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; inode = VFS_I(ip); error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; if (default_acl) { error = -xfs_inherit_acl(inode, default_acl); default_acl = NULL; if (unlikely(error)) goto out_cleanup_inode; } d_instantiate(dentry, inode); return -error; out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); out_free_acl: posix_acl_release(default_acl); return -error; }
/* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table * lists the inode states and the reclaim actions necessary for non-blocking * reclaim: * * * inode state iflush ret required action * --------------- ---------- --------------- * bad - reclaim * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue * dirty, delwri ok 0 requeue * dirty, delwri blocked EAGAIN requeue * dirty, sync flush 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * * As can be seen from the table, the return value of xfs_iflush() is not * sufficient to correctly decide the reclaim action here. The checks in * xfs_iflush() might look like duplicates, but they are not. * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that * the inode is clean. The clean inode check needs to be done before flushing * the inode delwri otherwise we would loop forever requeuing clean inodes as * we cannot tell apart a successful delwri flush and a clean inode from the * return value of xfs_iflush(). * * Note that because the inode is flushed delayed write by background * writeback, the flush lock may already be held here and waiting on it can * result in very long latencies. Hence for sync reclaims, where we wait on the * flush lock, the caller should push out delayed write inodes first before * trying to reclaim them to minimise the amount of time spent waiting. For * background relaim, we just requeue the inode for the next pass. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim * pinned, delwri => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim * dirty, delwri => flush and requeue * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; xfs_iflock(ip); } if (is_bad_inode(VFS_I(ip))) goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) { xfs_ifunlock(ip); goto out; } xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) goto reclaim; /* * Now we have an inode that needs flushing. * * We do a nonblocking flush here even if we are doing a SYNC_WAIT * reclaim as we can deadlock with inode cluster removal. * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, * doing a blocking xfs_itobp() to get the cluster buffer will result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, * just unlock the inode, back off and try again. Hopefully the next * pass through will see the stale flag set on the inode. */ error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); if (sync_mode & SYNC_WAIT) { if (error == EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); goto restart; } xfs_iflock(ip); goto reclaim; } /* * When we have to flush an inode but don't have SYNC_WAIT set, we * flush the inode out using a delwri buffer and wait for the next * call into reclaim to find it in a clean state instead of waiting for * it now. We also don't return errors here - if the error is transient * then the next reclaim pass will flush the inode, and if the error * is permanent then the next sync reclaim will reclaim the inode and * pass on the error. */ if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_warn(ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); } out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and xfssyncd never goes back to the idle * state. Instead, return 0 to let the next scheduled background reclaim * attempt to reclaim the inode again. */ return 0; reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * * Because radix_tree_delete won't complain even if the item was never * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate * with inode cache radix tree lookups. This is because the lookup * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is * unlocked after the lookup before we go ahead and free it. We get * both the ilock and the iolock because the code may need to drop the * ilock one but will still hold the iolock. */ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_inode_free(ip); return error; }
/* * Truncate file. Must have write permission and not be a directory. */ int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr, int flags) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; struct xfs_trans *tp; int error; uint lock_flags; uint commit_flags = 0; trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); lock_flags = XFS_ILOCK_EXCL; if (!(flags & XFS_ATTR_NOLOCK)) lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); /* * Short circuit the truncate case for zero length files. */ if (iattr->ia_size == 0 && ip->i_size == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; /* * Use the regular setattr path to update the timestamps. */ xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } /* * Make sure that the dquots are attached to the inode. */ error = xfs_qm_dqattach_locked(ip, 0); if (error) goto out_unlock; /* * Now we can make the changes. Before we join the inode to the * transaction, take care of the part of the truncation that must be * done without the inode lock. This needs to be done before joining * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ if (iattr->ia_size > ip->i_size) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); if (error) goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files * problem. * * Only flush from the on disk size to the smaller of the in memory * file size or the new size as that's the range we really care about * here and prevents waiting for other data not within the range we * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, XBF_ASYNC, FI_NONE); if (error) goto out_unlock; } /* * Wait for all I/O to complete. */ xfs_ioend_wait(ip); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); if (error) goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); if (error) goto out_trans_cancel; truncate_setsize(inode, iattr->ia_size); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip); /* * Only change the c/mtime if we are changing the size or we are * explicitly asked to change it. This handles the semantic difference * between truncate() and ftruncate() as implemented in the VFS. * * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (iattr->ia_size != ip->i_size && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } if (iattr->ia_size > ip->i_size) { ip->i_d.di_size = iattr->ia_size; ip->i_size = iattr->ia_size; } else if (iattr->ia_size <= ip->i_size || (iattr->ia_size == 0 && ip->i_d.di_nextents)) { error = xfs_itruncate_data(&tp, ip, iattr->ia_size); if (error) goto out_trans_abort; /* * Truncated "down", so we're removing references to old data * here - if we delay flushing for a long time, we expose * ourselves unduly to the notorious NULL files problem. So, * we mark this inode and flush it when the file is closed, * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; out_trans_abort: commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, commit_flags); goto out_unlock; }
/* * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * * If there are cached pages or we're extending the file, we need IOLOCK_EXCL * until we're sure the bytes at the new EOF have been zeroed and/or the cached * pages are flushed out. * * In most cases the direct IO writes will be done holding IOLOCK_SHARED * allowing them to be done in parallel with reads and other direct IO writes. * However, if the IO is not aligned to filesystem blocks, the direct IO layer * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos, size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; /* * We don't need to take an exclusive lock unless there page cache needs * to be invalidated or unaligned IO is being executed. We don't need to * consider the EOF extension case here because * xfs_file_aio_write_checks() will relock the inode as necessary for * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) iolock = XFS_IOLOCK_EXCL; else iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, iolock); iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) goto out; if (mapping->nrpages) { ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; truncate_pagecache_range(VFS_I(ip), pos, -1); } /* * If we are doing unaligned IO, wait for all other IO to drain, * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; }
void libxfs_iprint( xfs_inode_t *ip) { struct xfs_icdinode *dip; xfs_bmbt_rec_host_t *ep; xfs_extnum_t i; xfs_extnum_t nextents; printf("Inode %lx\n", (unsigned long)ip); printf(" i_ino %llx\n", (unsigned long long)ip->i_ino); if (ip->i_df.if_flags & XFS_IFEXTENTS) printf("EXTENTS "); printf("\n"); printf(" i_df.if_bytes %d\n", ip->i_df.if_bytes); printf(" i_df.if_u1.if_extents/if_data %lx\n", (unsigned long)ip->i_df.if_u1.if_extents); if (ip->i_df.if_flags & XFS_IFEXTENTS) { nextents = ip->i_df.if_bytes / (uint)sizeof(*ep); for (ep = ip->i_df.if_u1.if_extents, i = 0; i < nextents; i++, ep++) { xfs_bmbt_irec_t rec; xfs_bmbt_get_all(ep, &rec); printf("\t%d: startoff %llu, startblock 0x%llx," " blockcount %llu, state %d\n", i, (unsigned long long)rec.br_startoff, (unsigned long long)rec.br_startblock, (unsigned long long)rec.br_blockcount, (int)rec.br_state); } } printf(" i_df.if_broot %lx\n", (unsigned long)ip->i_df.if_broot); printf(" i_df.if_broot_bytes %x\n", ip->i_df.if_broot_bytes); dip = &ip->i_d; printf("\nOn disk portion\n"); printf(" di_mode %o\n", VFS_I(ip)->i_mode); printf(" di_version %x\n", (uint)dip->di_version); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_LOCAL: printf(" Inline inode\n"); break; case XFS_DINODE_FMT_EXTENTS: printf(" Extents inode\n"); break; case XFS_DINODE_FMT_BTREE: printf(" B-tree inode\n"); break; default: printf(" Other inode\n"); break; } printf(" di_nlink %x\n", VFS_I(ip)->i_nlink); printf(" di_uid %d\n", dip->di_uid); printf(" di_gid %d\n", dip->di_gid); printf(" di_nextents %d\n", dip->di_nextents); printf(" di_size %llu\n", (unsigned long long)dip->di_size); printf(" di_gen %x\n", VFS_I(ip)->i_generation); printf(" di_extsize %d\n", dip->di_extsize); printf(" di_flags %x\n", dip->di_flags); printf(" di_nblocks %llu\n", (unsigned long long)dip->di_nblocks); }
/* * Inodes in different states need to be treated differently. The following * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- * bad - reclaim * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue * dirty, async - requeue * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that * the inode is clean. * * Note that because the inode is flushed delayed write by AIL pushing, the * flush lock may already be held here and waiting on it can result in very * long latencies. Hence for sync reclaims, where we wait on the flush lock, * the caller should push the AIL first before trying to reclaim inodes to * minimise the amount of time spent waiting. For background relaim, we only * bother to reclaim clean inodes anyway. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { struct xfs_buf *bp = NULL; int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; xfs_iflock(ip); } if (is_bad_inode(VFS_I(ip))) goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) goto reclaim; /* * Never flush out dirty data during non-blocking reclaim, as it would * just contend with AIL pushing trying to do the same job. */ if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; /* * Now we have an inode that needs flushing. * * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, * doing a blocking xfs_imap_to_bp() to get the cluster buffer would * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply * reclaim it. Hence if we get an EAGAIN error here, just unlock the * inode, back off and try again. Hopefully the next pass through will * see the stale flag set on the inode. */ error = xfs_iflush(ip, &bp); if (error == EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); goto restart; } if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } xfs_iflock(ip); reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * * Because radix_tree_delete won't complain even if the item was never * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate * with inode cache radix tree lookups. This is because the lookup * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is * unlocked after the lookup before we go ahead and free it. */ xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); return error; out_ifunlock: xfs_ifunlock(ip); out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and the reclaim work never goes back to * the idle state. Instead, return 0 to let the next scheduled * background reclaim attempt to reclaim the inode again. */ return 0; }
/* * Look up an inode by number in the given file system. * The inode is looked up in the cache held in each AG. * If the inode is found in the cache, initialise the vfs inode * if necessary. * * If it is not in core, read it in from the file system's device, * add it to the cache and initialise the vfs inode. * * The inode is locked according to the value of the lock_flags parameter. * This flag parameter indicates how and if the inode's IO lock and inode lock * should be taken. * * mp -- the mount point structure for the current file system. It points * to the inode hash table. * tp -- a pointer to the current transaction if there is one. This is * simply passed through to the xfs_iread() call. * ino -- the number of the inode desired. This is the unique identifier * within the file system for the inode being requested. * lock_flags -- flags indicating how to lock the inode. See the comment * for xfs_ilock() for a list of valid values. */ int xfs_iget( xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp) { xfs_inode_t *ip; int error; xfs_perag_t *pag; xfs_agino_t agino; /* * xfs_reclaim_inode() uses the ILOCK to ensure an inode * doesn't get freed while it's being referenced during a * radix tree traversal here. It assumes this function * aqcuires only the ILOCK (and therefore it has no need to * involve the IOLOCK in this synchronization). */ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; XFS_STATS_INC(mp, xs_ig_attempts); /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); again: error = 0; rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); if (error) goto out_error_or_again; } else { rcu_read_unlock(); if (flags & XFS_IGET_INCORE) { error = -ENOENT; goto out_error_or_again; } XFS_STATS_INC(mp, xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, flags, lock_flags); if (error) goto out_error_or_again; } xfs_perag_put(pag); *ipp = ip; /* * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); return 0; out_error_or_again: if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { delay(1); goto again; } xfs_perag_put(pag); return error; }
STATIC int xfs_ioctl_setattr( xfs_inode_t *ip, struct fsxattr *fa, int mask) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; unsigned int lock_flags = 0; struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; trace_xfs_ioctl_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); /* * Disallow 32bit project ids when projid32bit feature is not enabled. */ if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return XFS_ERROR(EINVAL); /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids * in inode here, because we can't hold it across the trans_reserve. * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, ip->i_d.di_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } /* * For the other attributes, we acquire the inode lock and * first do an error checking pass. */ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (code) goto error_return; lock_flags = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_flags); /* * CAP_FOWNER overrides the following restrictions: * * The user ID of the calling process must be equal * to the file owner ID, except in cases where the * CAP_FSETID capability is applicable. */ if (!inode_owner_or_capable(VFS_I(ip))) { code = XFS_ERROR(EPERM); goto error_return; } /* * Do a quota reservation only if projid is actually going to change. * Only allow changing of projid from init_user_ns since it is a * non user namespace aware identifier. */ if (mask & FSX_PROJID) { if (current_user_ns() != &init_user_ns) { code = XFS_ERROR(EINVAL); goto error_return; } if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { ASSERT(tp); code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_return; } } if (mask & FSX_EXTSIZE) { /* * Can't change extent size if any extents are allocated. */ if (ip->i_d.di_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) { code = XFS_ERROR(EINVAL); /* EFBIG? */ goto error_return; } /* * Extent size must be a multiple of the appropriate block * size, if set at all. It must also be smaller than the * maximum extent size supported by the filesystem. * * Also, for non-realtime files, limit the extent size hint to * half the size of the AGs in the filesystem so alignment * doesn't result in extents larger than an AG. */ if (fa->fsx_extsize != 0) { xfs_extlen_t size; xfs_fsblock_t extsize_fsb; extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); if (extsize_fsb > MAXEXTLEN) { code = XFS_ERROR(EINVAL); goto error_return; } if (XFS_IS_REALTIME_INODE(ip) || ((mask & FSX_XFLAGS) && (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { code = XFS_ERROR(EINVAL); goto error_return; } } if (fa->fsx_extsize % size) { code = XFS_ERROR(EINVAL); goto error_return; } } } if (mask & FSX_XFLAGS) { /* * Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && (XFS_IS_REALTIME_INODE(ip)) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { code = XFS_ERROR(EINVAL); /* EFBIG? */ goto error_return; } /* * If realtime flag is set then must have realtime data. */ if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { if ((mp->m_sb.sb_rblocks == 0) || (mp->m_sb.sb_rextsize == 0) || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { code = XFS_ERROR(EINVAL); goto error_return; } } /* * Can't modify an immutable/append-only file unless * we have appropriate permission. */ if ((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) { code = XFS_ERROR(EPERM); goto error_return; } } xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. */ if (mask & FSX_PROJID) { /* * CAP_FSETID overrides the following restrictions: * * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && !inode_capable(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* * Change the ownerships and register quota modifications * in the transaction. */ if (xfs_get_projid(ip) != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } xfs_set_projid(ip, fa->fsx_projid); /* * We may have to rev the inode as well as * the superblock version number since projids didn't * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. */ if (ip->i_d.di_version == 1) xfs_bump_ino_vers2(tp, ip); } } if (mask & FSX_EXTSIZE) ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; if (mask & FSX_XFLAGS) { xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); } xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. * This is slightly sub-optimal in that truncates require * two sync transactions instead of one for wsync filesystems. * One for the truncate and one for the timestamps since we * don't want to change the timestamps unless we're sure the * truncate worked. Truncates are less than 1% of the laddis * mix so this probably isn't worth the trouble to optimize. */ if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); code = xfs_trans_commit(tp, 0); xfs_iunlock(ip, lock_flags); /* * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot); xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; error_return: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); xfs_trans_cancel(tp, 0); if (lock_flags) xfs_iunlock(ip, lock_flags); return code; }
static int xfs_swap_extents( xfs_inode_t *ip, /* target inode */ xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; int src_log_flags, target_log_flags; int error = 0; int aforkblks = 0; int taforkblks = 0; __uint64_t tmp; tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); goto out; } /* * we have to do two separate lock calls here to keep lockdep * happy. If we try to get all the locks in one call, lock will * report false positives when we drop the ILOCK and regain them * below. */ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { error = XFS_ERROR(EINVAL); goto out_unlock; } /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { error = XFS_ERROR(EINVAL); goto out_unlock; } if (VN_CACHED(VFS_I(tip)) != 0) { error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); if (error) goto out_unlock; } /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { error = XFS_ERROR(EINVAL); goto out_unlock; } /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || sxp->sx_length != ip->i_d.di_size || sxp->sx_length != tip->i_d.di_size) { error = XFS_ERROR(EFAULT); goto out_unlock; } trace_xfs_swap_extent_before(ip, 0); trace_xfs_swap_extent_before(tip, 1); /* check inode formats now that data is flushed */ error = xfs_swap_extents_check_format(ip, tip); if (error) { xfs_notice(mp, "%s: inode 0x%llx format is incompatible for exchanging.", __func__, ip->i_ino); goto out_unlock; } /* * Compare the current change & modify times with that * passed in. If they differ, we abort this swap. * This is the mechanism used to ensure the calling * process that the file was not changed out from * under it. */ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } /* We need to fail if the file is memory mapped. Once we have tossed * all existing pages, the page fault will have no option * but to go to the filesystem for pages. By making the page fault call * vop_read (or write in the case of autogrow) they block on the iolock * until we have switched the extents. */ if (VN_MAPPED(VFS_I(ip))) { error = XFS_ERROR(EBUSY); goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(tip, XFS_ILOCK_EXCL); /* * There is a race condition here since we gave up the * ilock. However, the data fork will not change since * we have the iolock (locked for truncation too) so we * are safe. We don't really care if non-io related * fields change. */ xfs_tosspages(ip, 0, -1, FI_REMAPF); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0))) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); xfs_iunlock(tip, XFS_IOLOCK_EXCL); xfs_trans_cancel(tp, 0); goto out; } xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* * Count the number of extended attribute blocks */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); if (error) goto out_trans_cancel; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &taforkblks); if (error) goto out_trans_cancel; } /* * Swap the data forks of the inodes */ ifp = &ip->i_df; tifp = &tip->i_df; *tempifp = *ifp; /* struct copy */ *ifp = *tifp; /* struct copy */ *tifp = *tempifp; /* struct copy */ /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; tmp = (__uint64_t) ip->i_d.di_nextents; ip->i_d.di_nextents = tip->i_d.di_nextents; tip->i_d.di_nextents = tmp; tmp = (__uint64_t) ip->i_d.di_format; ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; /* * The extents in the source inode could still contain speculative * preallocation beyond EOF (e.g. the file is open but not modified * while defrag is in progress). In that case, we need to copy over the * number of delalloc blocks the data fork in the source inode is * tracking beyond EOF so that when the fork is truncated away when the * temporary inode is unlinked we don't underrun the i_delayed_blks * counter on that inode. */ ASSERT(tip->i_delayed_blks == 0); tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; src_log_flags = XFS_ILOG_CORE; switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or * pointing to the extent. */ if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } src_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: src_log_flags |= XFS_ILOG_DBROOT; break; } target_log_flags = XFS_ILOG_CORE; switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or * pointing to the extent. */ if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } target_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: target_log_flags |= XFS_ILOG_DBROOT; break; } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); out: kmem_free(tempifp); return error; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); goto out; out_trans_cancel: xfs_trans_cancel(tp, 0); goto out_unlock; }
STATIC ssize_t xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); if (unlikely(iocb->ki_flags & IOCB_DIRECT)) ioflags |= XFS_IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -EINVAL; } } n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; if (n < size) size = n; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file * and block it behind IO that is currently in progress because IO in * progress holds the IO lock shared. We only need to hold the lock * exclusive to blow away the page cache, so only take lock exclusively * if the page cache needs invalidation. This allows the normal direct * IO case of no page cache pages to proceeed concurrently without * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); /* * The generic dio code only flushes the range of the particular * I/O. Because we take an exclusive lock here, this whole * sequence is considerably more expensive for us. This has a * noticeable performance impact for any file with cached pages, * even when outside of the range of the particular I/O. * * Hence, amortize the cost of the lock against a full file * flush and reduce the chances of repeated iolock cycles going * forward. */ if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* * Invalidate whole pages. This can return an error if * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } trace_xfs_file_read(ip, size, pos, ioflags); ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; }