/*
 * Exchange the data forks of two inodes inside a single transaction.
 *
 * @ip:  target inode
 * @tip: temporary inode whose data fork is exchanged with @ip's
 * @sxp: swap request; sx_offset/sx_length must describe the whole of both
 *       files and sx_stat carries the ctime/mtime the caller observed on @ip
 *
 * Both inodes are locked IOLOCK_EXCL and ILOCK_EXCL, a series of sanity
 * checks is run, the ILOCKs are dropped while the transaction is reserved,
 * and then the in-core data forks and the dependent on-disk inode fields are
 * exchanged and logged.
 *
 * Returns 0 on success or an XFS error code:
 *   ENOMEM - the scratch fork could not be allocated
 *   EINVAL - file type or realtime-ness differ, or @tip still has cached
 *            pages after the flush
 *   EFAULT - the request does not cover all data of both files
 *   EBUSY  - @ip's timestamps no longer match @sxp, or @ip is memory mapped
 * plus any error returned by the helpers called below.
 */
static int
xfs_swap_extents(
	xfs_inode_t	*ip,	/* target inode */
	xfs_inode_t	*tip,	/* tmp inode */
	xfs_swapext_t	*sxp)
{
	xfs_mount_t	*mp;
	xfs_trans_t	*tp;
	xfs_bstat_t	*sbp = &sxp->sx_stat;
	xfs_ifork_t	*tempifp, *ifp, *tifp;
	int		ilf_fields, tilf_fields;
	int		error = 0;
	int		aforkblks = 0;
	int		taforkblks = 0;
	__uint64_t	tmp;

	mp = ip->i_mount;

	/* Scratch space for the three-way struct copy of the data forks. */
	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
	if (!tempifp) {
		error = XFS_ERROR(ENOMEM);
		/*
		 * NOTE(review): "out" calls kmem_free(tempifp) with a NULL
		 * pointer on this path - confirm kmem_free() tolerates NULL.
		 */
		goto out;
	}

	/* Redundant: sbp was already set to this value at its declaration. */
	sbp = &sxp->sx_stat;

	/*
	 * we have to do two separate lock calls here to keep lockdep
	 * happy. If we try to get all the locks in one call, lock will
	 * report false positives when we drop the ILOCK and regain them
	 * below.
	 */
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/* Verify that both files have the same format */
	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Flush and invalidate any cached pages of the temporary inode. */
	if (VN_CACHED(VFS_I(tip)) != 0) {
		error = xfs_flushinval_pages(tip, 0, -1,
				FI_REMAPF_LOCKED);
		if (error)
			goto out_unlock;
	}

	/* Verify O_DIRECT for ftmp: nothing may be cached after the flush */
	if (VN_CACHED(VFS_I(tip)) != 0) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_d.di_size ||
	    sxp->sx_length != tip->i_d.di_size) {
		error = XFS_ERROR(EFAULT);
		goto out_unlock;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_fs_cmn_err(CE_NOTE, mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__FILE__, ip->i_ino);
		goto out_unlock;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	/* We need to fail if the file is memory mapped.  Once we have tossed
	 * all existing pages, the page fault will have no option
	 * but to go to the filesystem for pages. By making the page fault call
	 * vop_read (or write in the case of autogrow) they block on the iolock
	 * until we have switched the extents.
	 */
	if (VN_MAPPED(VFS_I(ip))) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	/* Drop only the ILOCKs; the IOLOCKs stay held across the reservation. */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);

	/*
	 * There is a race condition here since we gave up the
	 * ilock.  However, the data fork will not change since
	 * we have the iolock (locked for truncation too) so we
	 * are safe.  We don't really care if non-io related
	 * fields change.
	 */

	xfs_tosspages(ip, 0, -1, FI_REMAPF);

	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
	if ((error = xfs_trans_reserve(tp, 0,
				     XFS_ICHANGE_LOG_RES(mp), 0,
				     0, 0))) {
		/* Only the IOLOCKs are held here, so release just those. */
		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		goto out;
	}
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/*
	 * Count the number of extended attribute blocks
	 */
	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
		if (error)
			goto out_trans_cancel;
	}
	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
			&taforkblks);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	ifp = &ip->i_df;
	tifp = &tip->i_df;
	*tempifp = *ifp;	/* struct copy */
	*ifp = *tifp;		/* struct copy */
	*tifp = *tempifp;	/* struct copy */

	/*
	 * Fix the in-memory data fork values that are dependent on the fork
	 * offset in the inode. We can't assume they remain the same as attr2
	 * has dynamic fork offsets.
	 */
	ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
					(uint)sizeof(xfs_bmbt_rec_t);
	tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
					(uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * Fix the on-disk inode values.  di_nblocks is exchanged with the
	 * attribute fork block counts adjusted, since only the data forks
	 * were swapped above.
	 */
	tmp = (__uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	tmp = (__uint64_t) ip->i_d.di_nextents;
	ip->i_d.di_nextents = tip->i_d.di_nextents;
	tip->i_d.di_nextents = tmp;

	tmp = (__uint64_t) ip->i_d.di_format;
	ip->i_d.di_format = tip->i_d.di_format;
	tip->i_d.di_format = tmp;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	/* Work out which parts of @ip must be logged for its new format. */
	ilf_fields = XFS_ILOG_CORE;

	switch(ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			ifp->if_u1.if_extents =
				ifp->if_u2.if_inline_ext;
		}
		ilf_fields |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ilf_fields |= XFS_ILOG_DBROOT;
		break;
	}

	/* Same again for the temporary inode. */
	tilf_fields = XFS_ILOG_CORE;

	switch(tip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			tifp->if_u1.if_extents =
				tifp->if_u2.if_inline_ext;
		}
		tilf_fields |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		tilf_fields |= XFS_ILOG_DBROOT;
		break;
	}

	/*
	 * Join both inodes with their lock flags.  No explicit unlock follows
	 * on the success path; presumably the transaction releases the
	 * joined locks at commit - confirm against xfs_trans_ijoin_ref().
	 */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_trans_log_inode(tp, ip, ilf_fields);
	xfs_trans_log_inode(tp, tip, tilf_fields);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);
out:
	kmem_free(tempifp);
	return error;

out_unlock:
	/* Both the ILOCK and the IOLOCK are held on every path to here. */
	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	goto out;

out_trans_cancel:
	xfs_trans_cancel(tp, 0);
	goto out_unlock;
}
/*
 * Find the buffer that holds the on-disk copy of a dquot.
 *
 * On success *O_bpp is the locked buffer and *O_ddpp points at the dquot's
 * record inside it.  If the dquot's block in the quota inode is a hole, the
 * block is only allocated when XFS_QMOPT_DQALLOC is set in @flags; the
 * allocation goes through @tpp.
 *
 * Returns 0, ESRCH when this quota type is no longer on, ENOENT for a hole
 * without XFS_QMOPT_DQALLOC, or the error from mapping, allocating, reading
 * or repairing the dquot block.
 */
STATIC int
xfs_qm_dqtobp(
	xfs_trans_t		**tpp,
	xfs_dquot_t		*dqp,
	xfs_disk_dquot_t	**O_ddpp,
	xfs_buf_t		**O_bpp,
	uint			flags)
{
	struct xfs_bmbt_irec	extent;
	int			nextents = 1;
	int			rc;
	struct xfs_buf		*dqbuf;
	struct xfs_inode	*qip = xfs_dq_to_quota_inode(dqp);
	struct xfs_mount	*mp = dqp->q_mount;
	xfs_dqid_t		dqid = be32_to_cpu(dqp->q_core.d_id);
	struct xfs_trans	*trans = NULL;

	if (tpp)
		trans = *tpp;

	/* File offset (in fs blocks) of the chunk this dquot lives in. */
	dqp->q_fileoffset =
		(xfs_fileoff_t)dqid / mp->m_quotainfo->qi_dqperchunk;

	xfs_ilock(qip, XFS_ILOCK_SHARED);

	/*
	 * Quotas of this type may have been switched off before we got
	 * the quota inode lock; bail out if so.
	 */
	if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
		xfs_iunlock(qip, XFS_ILOCK_SHARED);
		return ESRCH;
	}

	/* Look the block up without allocating anything. */
	rc = xfs_bmapi_read(qip, dqp->q_fileoffset,
			    XFS_DQUOT_CLUSTER_SIZE_FSB, &extent, &nextents, 0);
	xfs_iunlock(qip, XFS_ILOCK_SHARED);
	if (rc)
		return rc;

	ASSERT(nextents == 1);
	ASSERT(extent.br_blockcount == 1);

	/* Byte offset of this dquot inside its fixed-size chunk. */
	dqp->q_bufoffset = (dqid % mp->m_quotainfo->qi_dqperchunk) *
				sizeof(xfs_dqblk_t);

	ASSERT(extent.br_startblock != DELAYSTARTBLOCK);

	if (extent.br_startblock != HOLESTARTBLOCK) {
		trace_xfs_dqtobp_read(dqp);

		/* Remember the disk address so later calls can skip the map. */
		dqp->q_blkno = XFS_FSB_TO_DADDR(mp, extent.br_startblock);

		rc = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
					dqp->q_blkno,
					mp->m_quotainfo->qi_dqchunklen,
					0, &dqbuf, &xfs_dquot_buf_ops);

		/* A corrupt chunk may be repaired in place when asked to. */
		if (rc == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
			xfs_dqid_t chunk_first_id;

			chunk_first_id = (xfs_dqid_t)extent.br_startoff *
					 mp->m_quotainfo->qi_dqperchunk;
			ASSERT(dqbuf == NULL);
			rc = xfs_qm_dqrepair(mp, trans, dqp, chunk_first_id,
					     &dqbuf);
		}

		if (rc) {
			ASSERT(dqbuf == NULL);
			return XFS_ERROR(rc);
		}
	} else {
		/* Holes are only filled on explicit request. */
		if (!(flags & XFS_QMOPT_DQALLOC))
			return ENOENT;

		ASSERT(trans);
		rc = xfs_qm_dqalloc(tpp, mp, dqp, qip,
				    dqp->q_fileoffset, &dqbuf);
		if (rc)
			return rc;
		trans = *tpp;
	}

	ASSERT(xfs_buf_islocked(dqbuf));
	*O_bpp = dqbuf;
	*O_ddpp = dqbuf->b_addr + dqp->q_bufoffset;

	return 0;
}
/*
 * Apply the attributes selected by @mask from @fa to inode @ip in a single
 * transaction.
 *
 * @ip:   inode to modify
 * @fa:   new values; only fields selected by @mask are consulted
 * @mask: any combination of FSX_PROJID, FSX_EXTSIZE and FSX_XFLAGS
 *
 * The function first runs a validation pass under XFS_ILOCK_EXCL and only
 * then joins the inode to the transaction and modifies it.
 *
 * Returns 0 on success or an error code:
 *   EROFS  - read-only mount
 *   EIO    - filesystem has been shut down
 *   EINVAL - a requested value is not acceptable (see the checks below)
 *   EPERM  - caller is neither owner nor sufficiently capable
 * plus errors from dquot allocation, transaction reservation, quota
 * reservation or transaction commit.
 */
STATIC int
xfs_ioctl_setattr(
	xfs_inode_t		*ip,
	struct fsxattr		*fa,
	int			mask)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	unsigned int		lock_flags = 0;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*olddquot = NULL;
	int			code;

	trace_xfs_ioctl_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);
	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Disallow 32bit project ids when projid32bit feature is not enabled.
	 */
	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
			!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
		return XFS_ERROR(EINVAL);

	/*
	 * If disk quotas is on, we make sure that the dquots do exist on disk,
	 * before we start any other transactions. Trying to do this later
	 * is messy. We don't care to take a readlock to look at the ids
	 * in inode here, because we can't hold it across the trans_reserve.
	 * If the IDs do change before we take the ilock, we're covered
	 * because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
					 ip->i_d.di_gid, fa->fsx_projid,
					 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
		if (code)
			return code;
	}

	/*
	 * For the other attributes, we acquire the inode lock and
	 * first do an error checking pass.
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
	code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (code)
		goto error_return;

	/* lock_flags stays 0 until here so error_return knows what to drop. */
	lock_flags = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_flags);

	/*
	 * CAP_FOWNER overrides the following restrictions:
	 *
	 * The user ID of the calling process must be equal
	 * to the file owner ID, except in cases where the
	 * CAP_FSETID capability is applicable.
	 */
	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
		code = XFS_ERROR(EPERM);
		goto error_return;
	}

	/*
	 * Do a quota reservation only if projid is actually going to change.
	 */
	if (mask & FSX_PROJID) {
		if (XFS_IS_QUOTA_RUNNING(mp) &&
		    XFS_IS_PQUOTA_ON(mp) &&
		    xfs_get_projid(ip) != fa->fsx_projid) {
			ASSERT(tp);
			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
						capable(CAP_FOWNER) ?
						XFS_QMOPT_FORCE_RES : 0);
			if (code)	/* out of quota */
				goto error_return;
		}
	}

	if (mask & FSX_EXTSIZE) {
		/*
		 * Can't change extent size if any extents are allocated.
		 */
		if (ip->i_d.di_nextents &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
		     fa->fsx_extsize)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Extent size must be a multiple of the appropriate block
		 * size, if set at all. It must also be smaller than the
		 * maximum extent size supported by the filesystem.
		 *
		 * Also, for non-realtime files, limit the extent size hint to
		 * half the size of the AGs in the filesystem so alignment
		 * doesn't result in extents larger than an AG.
		 */
		if (fa->fsx_extsize != 0) {
			xfs_extlen_t    size;
			xfs_fsblock_t   extsize_fsb;

			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
			if (extsize_fsb > MAXEXTLEN) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}

			/*
			 * "size" is the granularity in bytes: one realtime
			 * extent for realtime files (current or requested),
			 * otherwise one filesystem block.
			 */
			if (XFS_IS_REALTIME_INODE(ip) ||
			    ((mask & FSX_XFLAGS) &&
			    (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
				size = mp->m_sb.sb_rextsize <<
				       mp->m_sb.sb_blocklog;
			} else {
				size = mp->m_sb.sb_blocksize;
				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
					code = XFS_ERROR(EINVAL);
					goto error_return;
				}
			}

			if (fa->fsx_extsize % size) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
	}


	if (mask & FSX_XFLAGS) {
		/*
		 * Can't change realtime flag if any extents are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (XFS_IS_REALTIME_INODE(ip)) !=
		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * If realtime flag is set then must have realtime data.
		 */
		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
			if ((mp->m_sb.sb_rblocks == 0) ||
			    (mp->m_sb.sb_rextsize == 0) ||
			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}

		/*
		 * Can't modify an immutable/append-only file unless
		 * we have appropriate permission.
		 */
		if ((ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
		     (fa->fsx_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	/* Validation is done; from here on the inode is modified. */
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 */
	if (mask & FSX_PROJID) {
		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The set-user-ID and set-group-ID bits of a file will be
		 * cleared upon successful return from chown()
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID))
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);

		/*
		 * Change the ownerships and register quota modifications
		 * in the transaction.
		 */
		if (xfs_get_projid(ip) != fa->fsx_projid) {
			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
				olddquot = xfs_qm_vop_chown(tp, ip,
							&ip->i_gdquot, gdqp);
			}
			xfs_set_projid(ip, fa->fsx_projid);

			/*
			 * We may have to rev the inode as well as
			 * the superblock version number since projids didn't
			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
			 */
			if (ip->i_d.di_version == 1)
				xfs_bump_ino_vers2(tp, ip);
		}

	}

	/* The on-disk extent size hint is kept in filesystem blocks. */
	if (mask & FSX_EXTSIZE)
		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
	if (mask & FSX_XFLAGS) {
		xfs_set_diflags(ip, fa->fsx_xflags);
		xfs_diflags_to_linux(ip);
	}

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 * This is slightly sub-optimal in that truncates require
	 * two sync transactions instead of one for wsync filesystems.
	 * One for the truncate and one for the timestamps since we
	 * don't want to change the timestamps unless we're sure the
	 * truncate worked.  Truncates are less than 1% of the laddis
	 * mix so this probably isn't worth the trouble to optimize.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);
	code = xfs_trans_commit(tp, 0);
	xfs_iunlock(ip, lock_flags);

	/*
	 * Release any dquot(s) the inode had kept before chown.
	 */
	xfs_qm_dqrele(olddquot);
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);

	return code;

 error_return:
	/*
	 * Reached both before and after the ILOCK was taken; lock_flags is
	 * non-zero only once the lock is actually held.
	 */
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_trans_cancel(tp, 0);
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return code;
}
/*
 * Convert the unwritten extents backing the byte range [offset,
 * offset + count) of @ip into written ("real") extents.
 *
 * One transaction is run per converted extent until the whole range has
 * been covered.  Returns 0 on success, otherwise the error from the
 * reservation, the block mapping call, the commit, or from
 * xfs_cmn_err_fsblock_zero() when a non-realtime mapping starts at block 0.
 */
int
xfs_iomap_write_unwritten(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_iocore_t	*io = &ip->i_iocore;
	xfs_fileoff_t	offset_fsb;
	xfs_filblks_t	count_fsb;
	xfs_filblks_t	done_fsb;
	xfs_fsblock_t	first_block;
	xfs_bmbt_irec_t	extent;
	xfs_bmap_free_t	flist;
	xfs_trans_t	*trans;
	uint		resblks;
	int		nextents;
	int		committed;
	int		rc;

	xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
				&ip->i_iocore, offset, count);

	/* Turn the byte range into a block offset and a block count. */
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);

	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

	for (;;) {
		/*
		 * Each pass gets its own transaction and converts as much
		 * of the remaining range as a single mapping call returns.
		 */
		trans = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
		trans->t_flags |= XFS_TRANS_RESERVE;
		rc = xfs_trans_reserve(trans, resblks,
				XFS_WRITE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES,
				XFS_WRITE_LOG_COUNT);
		if (rc) {
			xfs_trans_cancel(trans, 0);
			return XFS_ERROR(rc);
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(trans, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(trans, ip);

		/* Flip the extent state from unwritten to written. */
		XFS_BMAP_INIT(&flist, &first_block);
		nextents = 1;
		rc = XFS_BMAPI(mp, trans, io, offset_fsb, count_fsb,
			       XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &first_block,
			       1, &extent, &nextents, &flist, NULL);
		if (!rc)
			rc = xfs_bmap_finish(&trans, &flist, &committed);
		if (rc)
			goto cancel_trans;

		rc = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (rc)
			return XFS_ERROR(rc);

		if (unlikely(!extent.br_startblock &&
			     !(io->io_flags & XFS_IOCORE_RT)))
			return xfs_cmn_err_fsblock_zero(ip, &extent);

		done_fsb = extent.br_blockcount;
		if (done_fsb == 0) {
			/*
			 * A zero-length result would never shrink the
			 * remaining range, so stop instead of spinning.
			 */
			ASSERT(extent.br_blockcount);
			break;
		}

		offset_fsb += done_fsb;
		count_fsb -= done_fsb;
		if (!(count_fsb > 0))
			break;
	}

	return 0;

cancel_trans:
	xfs_bmap_cancel(&flist);
	xfs_trans_cancel(trans, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return XFS_ERROR(rc);
}
/*
 * Make a just-completed synchronous write durable in the log.
 *
 * Two strategies are used, chosen from the mount flags and from whether
 * the inode has a pending size update:
 *
 *  - XFS_MOUNT_OSYNCISOSYNC set, or ip->i_update_size set: log the inode
 *    core in a synchronous transaction.
 *  - otherwise: only force the log, and only if there is something
 *    outstanding against the inode.
 *
 * Returns 0 or the error from the log force / transaction.
 */
int
xfs_write_sync_logforce(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip)
{
	xfs_inode_log_item_t	*log_item;
	xfs_trans_t		*trans;
	int			rc;

	if ((mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) || ip->i_update_size) {
		/*
		 * O_SYNC, and O_DSYNC with a size update, take the same
		 * route: the inode (timestamps, and a size that may have
		 * been changed non-transactionally) has to reach the log,
		 * so log the core in a synchronous transaction.  Slow, but
		 * required.
		 */
		trans = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
		rc = xfs_trans_reserve(trans, 0,
				       XFS_SWRITE_LOG_RES(mp), 0, 0, 0);
		if (rc) {
			/* Reservation failed: nothing else to undo. */
			xfs_trans_cancel(trans, 0);
			return rc;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(trans, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(trans, ip);
		xfs_trans_log_inode(trans, ip, XFS_ILOG_CORE);
		xfs_trans_set_sync(trans);
		rc = xfs_trans_commit(trans, 0);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		return rc;
	}

	/*
	 * O_DSYNC without a size update.  An allocation transaction may
	 * still have run without extending the size, and a racing direct
	 * I/O write may have grown the file non-transactionally, so the log
	 * must be forced far enough to make everything touching this inode
	 * permanent: up to the inode's last LSN if it has one, or entirely
	 * if the inode is merely pinned by somebody else's commit.
	 */
	log_item = ip->i_itemp;
	if (log_item && log_item->ili_last_lsn)
		return _xfs_log_force(mp, log_item->ili_last_lsn,
				      XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);

	if (xfs_ipincount(ip) > 0)
		return _xfs_log_force(mp, (xfs_lsn_t)0,
				      XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);

	return 0;
}
/*
 * Choose a new allocation group for the file described by @ap.
 *
 * @ap:  allocation arguments; ap->ip is the file, ap->length the minimum
 *       length wanted, ap->userdata and ap->flist->xbf_low select pick flags
 * @agp: out: the chosen AG.  Set to 0 when no AG could be picked.
 *
 * If the file has an item in the filestream MRU cache and its parent
 * directory's item names a different AG, the file is simply moved to the
 * directory's AG.  Otherwise a fresh AG is picked, starting the search at
 * the AG after the file's previous one, and both the parent directory and
 * the file are updated to it.
 *
 * Returns 0 or the error from the cache insert / pick / update helpers.
 */
int
xfs_filestream_new_ag(
	xfs_bmalloca_t	*ap,
	xfs_agnumber_t	*agp)
{
	int		flags, err;
	xfs_inode_t	*ip, *pip = NULL;
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	xfs_extlen_t	minlen;
	fstrm_item_t	*dir, *file;
	xfs_agnumber_t	ag = NULLAGNUMBER;

	ip = ap->ip;
	mp = ip->i_mount;
	cache = mp->m_filestream;
	minlen = ap->length;
	*agp = NULLAGNUMBER;

	/*
	 * Take the file's item out of the cache while it is examined; it is
	 * re-inserted below.
	 */
	file = xfs_mru_cache_remove(cache, ip->i_ino);
	if (file) {
		ASSERT(ip == file->ip);

		/* Remember the parent inode and the AG the file used so far. */
		pip = file->pip;
		ag = file->ag;

		/* Has the parent directory already moved to another AG? */
		dir = xfs_mru_cache_lookup(cache, pip->i_ino);
		if (dir) {
			ASSERT(pip == dir->ip);

			/*
			 * If so, follow it: drop the reference on the old AG,
			 * take one on the directory's AG and report that AG.
			 */
			if (dir->ag != file->ag) {
				xfs_filestream_put_ag(mp, file->ag);
				xfs_filestream_get_ag(mp, dir->ag);
				*agp = file->ag = dir->ag;
			}

			/* Pairs with the successful lookup above. */
			xfs_mru_cache_done(cache);
		}

		/*
		 * Put the (possibly updated) item back.  On failure the item
		 * is handed to the cache's free function and the error is
		 * returned.
		 */
		err = xfs_mru_cache_insert(cache, ip->i_ino, file);
		if (err) {
			xfs_fstrm_free_func(ip->i_ino, file);
			return err;
		}

		/* The file followed its directory; nothing more to do. */
		if (*agp != NULLAGNUMBER) {
			TRACE_MOVEAG(mp, ip, pip,
					ag, xfs_filestream_peek_ag(mp, ag),
					*agp, xfs_filestream_peek_ag(mp, *agp));
			return 0;
		}
	}

	/*
	 * A new AG has to be picked.  When the parent is known, hold its
	 * iolock exclusively for the rest of the function; it is released
	 * at "exit".
	 */
	if (pip)
		xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);

	/* Start at AG 0, or at the AG after the previous one (wrapping). */
	ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
	        (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);

	err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
	if (err || *agp == NULLAGNUMBER)
		goto exit;

	/* Without a known parent there is no cache state to update. */
	if (!pip) {
		TRACE_ORPHAN(mp, ip, *agp);
		goto exit;
	}

	/* Associate the parent directory, then the file, with the new AG. */
	err = _xfs_filestream_update_ag(pip, NULL, *agp);
	if (err)
		goto exit;

	err = _xfs_filestream_update_ag(ip, pip, *agp);
	if (err)
		goto exit;

	TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
			*agp, xfs_filestream_peek_ag(mp, *agp));

exit:
	/*
	 * Drop a reference on the picked AG - presumably the one taken by
	 * _xfs_filestream_pick_ag(); confirm against that helper.  If no AG
	 * was picked, report AG 0 to the caller instead of NULLAGNUMBER.
	 */
	if (*agp != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, *agp);
	else
		*agp = 0;

	if (pip)
		xfs_iunlock(pip, XFS_IOLOCK_EXCL);

	return err;
}
/*
 * Allocate real blocks for the byte range [offset, offset + count) of @ip
 * inside a transaction and return the resulting mapping.
 *
 * @ip:       inode being written
 * @offset:   byte offset of the write
 * @count:    length of the write in bytes
 * @flags:    BMAPI_* flags; BMAPI_DIRECT influences preallocation
 * @ret_imap: in: the mapping found by the caller (consulted when @found);
 *            out: the newly allocated mapping
 * @nmaps:    out: 1 on success, 0 when the transaction had to be cancelled
 * @found:    non-zero if @ret_imap holds a valid mapping on entry
 *
 * The function drops XFS_ILOCK_EXCL around the transaction reservation and
 * takes it again before returning, so the caller is presumably expected to
 * hold that lock on entry and still holds it on return.
 *
 * Returns 0 or an error code; ENOSPC when no mapping was produced.
 */
int
xfs_iomap_write_direct(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count,
	int		flags,
	xfs_bmbt_irec_t *ret_imap,
	int		*nmaps,
	int		found)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_iocore_t	*io = &ip->i_iocore;
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	count_fsb, resaligned;
	xfs_fsblock_t	firstfsb;
	xfs_extlen_t	extsz, temp;
	xfs_fsize_t	isize;
	int		nimaps;
	int		bmapi_flag;
	int		quota_flag;
	int		rt;
	xfs_trans_t	*tp;
	xfs_bmbt_irec_t imap;
	xfs_bmap_free_t free_list;
	uint		qblocks, resblks, resrtextents;
	int		committed;
	int		error;

	/*
	 * Make sure that the dquots are there. This doesn't hold
	 * the ilock across a disk read.
	 */
	error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
	if (error)
		return XFS_ERROR(error);

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	/* Use the larger of the current size and a pending new size. */
	isize = ip->i_size;
	if (io->io_new_size > isize)
		isize = io->io_new_size;

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
	if ((offset + count) > isize) {
		/* Write extends the file: align the end of the allocation. */
		error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz,
							&last_fsb);
		if (error)
			goto error_out;
	} else {
		/* Writing into a known hole: do not allocate beyond it. */
		if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
					ret_imap->br_blockcount +
					ret_imap->br_startoff);
	}
	count_fsb = last_fsb - offset_fsb;
	ASSERT(count_fsb > 0);

	/*
	 * With an extent size hint, pad the reservation: add the start
	 * offset's remainder modulo extsz, then round the total up to a
	 * multiple of extsz.
	 */
	resaligned = count_fsb;
	if (unlikely(extsz)) {
		if ((temp = do_mod(offset_fsb, extsz)))
			resaligned += temp;
		if ((temp = do_mod(resaligned, extsz)))
			resaligned += extsz - temp;
	}

	/*
	 * Realtime files reserve realtime extents for the data; regular
	 * files reserve filesystem blocks.  qblocks is the amount charged
	 * to quota.
	 */
	if (unlikely(rt)) {
		resrtextents = qblocks = resaligned;
		resrtextents /= mp->m_sb.sb_rextsize;
		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		quota_flag = XFS_QMOPT_RES_RTBLKS;
	} else {
		resrtextents = 0;
		resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
		quota_flag = XFS_QMOPT_RES_REGBLKS;
	}

	/*
	 * Allocate and setup the transaction.  The ilock is dropped while
	 * the reservation is made.
	 */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
	error = xfs_trans_reserve(tp, resblks,
			XFS_WRITE_LOG_RES(mp), resrtextents,
			XFS_TRANS_PERM_LOG_RES,
			XFS_WRITE_LOG_COUNT);
	/*
	 * Check for running out of space, note: need lock to return
	 */
	if (error)
		xfs_trans_cancel(tp, 0);
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (error)
		goto error_out;

	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
					      qblocks, 0, quota_flag);
	if (error)
		goto error1;

	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	/*
	 * For direct I/O, request XFS_BMAPI_PREALLOC when writing inside
	 * the current file size or when an extent size hint is set.
	 */
	bmapi_flag = XFS_BMAPI_WRITE;
	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
		bmapi_flag |= XFS_BMAPI_PREALLOC;

	/*
	 * Issue the xfs_bmapi() call to allocate the blocks
	 */
	XFS_BMAP_INIT(&free_list, &firstfsb);
	nimaps = 1;
	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag,
		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
	if (error)
		goto error0;

	/*
	 * Complete the transaction
	 */
	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto error0;
	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto error_out;

	/*
	 * Copy any maps to caller's array and return any error.
	 */
	if (nimaps == 0) {
		error = ENOSPC;
		goto error_out;
	}

	if (unlikely(!imap.br_startblock && !(io->io_flags & XFS_IOCORE_RT))) {
		error = xfs_cmn_err_fsblock_zero(ip, &imap);
		goto error_out;
	}

	*ret_imap = imap;
	*nmaps = 1;
	return 0;

	/*
	 * The labels below fall through into each other: error0 also
	 * performs the error1 work, and both end in error_out.
	 */
error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
	xfs_bmap_cancel(&free_list);
	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);

error1:	/* Just cancel transaction */
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	*nmaps = 0;	/* nothing set-up here */

error_out:
	return XFS_ERROR(error);
}
/*
 * Get a layout for the pNFS client.
 *
 * @inode:             file the layout is requested for
 * @offset:            byte offset of the requested range
 * @length:            length of the requested range; clamped to the limit
 * @iomap:             out: mapping handed to the client
 * @write:             true if the client intends to write; holes are then
 *                     allocated before the mapping is returned
 * @device_generation: out: mp->m_generation at the time of the call
 *
 * XFS_IOLOCK_EXCL is taken for the duration of the call and released on
 * every return path after it has been acquired.
 *
 * Returns 0 on success or a negative errno: -EIO after a shutdown, -ENXIO
 * for realtime or reflink inodes, -EINVAL when @offset is beyond the limit,
 * or the error from flushing, invalidating, mapping or allocating.
 */
int
xfs_fs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	u64			length,
	struct iomap		*iomap,
	bool			write,
	u32			*device_generation)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		offset_fsb, end_fsb;
	loff_t			limit;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;
	uint			lock_flags;
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * We can't export inodes residing on the realtime device.  The realtime
	 * device doesn't have a UUID to identify it, so the client has no way
	 * to find it.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		return -ENXIO;

	/*
	 * The pNFS block layout spec actually supports reflink like
	 * functionality, but the Linux pNFS server doesn't implement it yet.
	 */
	if (xfs_is_reflink_inode(ip))
		return -ENXIO;

	/*
	 * Lock out any other I/O before we flush and invalidate the pagecache,
	 * and then hand out a layout to the remote system.  This is very
	 * similar to direct I/O, except that the synchronization is much more
	 * complicated.  See the comment near xfs_break_layouts for a detailed
	 * explanation.
	 */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);

	/*
	 * Reject offsets beyond the limit and trim the length so that the
	 * range ends at the limit.  For reads the limit is raised to the
	 * block-rounded file size if that is larger than s_maxbytes.
	 */
	error = -EINVAL;
	limit = mp->m_super->s_maxbytes;
	if (!write)
		limit = max(limit, round_up(i_size_read(inode),
				     inode->i_sb->s_blocksize));
	if (offset > limit)
		goto out_unlock;
	if (offset > limit - length)
		length = limit - offset;

	error = filemap_write_and_wait(inode->i_mapping);
	if (error)
		goto out_unlock;
	error = invalidate_inode_pages2(inode->i_mapping);
	/*
	 * The IOLOCK is held here, so a failure must leave through
	 * out_unlock like every other error path; returning directly would
	 * leak the lock.
	 */
	if (WARN_ON_ONCE(error))
		goto out_unlock;

	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	lock_flags = xfs_ilock_data_map_shared(ip);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, lock_flags);

	if (error)
		goto out_unlock;

	if (write) {
		enum xfs_prealloc_flags	flags = 0;

		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);

		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
			/*
			 * xfs_iomap_write_direct() expects to take ownership of
			 * the shared ilock.
			 */
			xfs_ilock(ip, XFS_ILOCK_SHARED);
			error = xfs_iomap_write_direct(ip, offset, length,
						       &imap, nimaps);
			if (error)
				goto out_unlock;

			/*
			 * Ensure the next transaction is committed
			 * synchronously so that the blocks allocated and
			 * handed out to the client are guaranteed to be
			 * present even after a server crash.
			 */
			flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
		}

		error = xfs_update_prealloc_flags(ip, flags);
		if (error)
			goto out_unlock;
	}
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	xfs_bmbt_to_iomap(ip, iomap, &imap);
	*device_generation = mp->m_generation;
	return error;
out_unlock:
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;
}
/*
 * Map (and, depending on @flags, allocate) the blocks backing the byte
 * range [offset, offset + count) of a regular file.
 *
 * @ip:      inode to map
 * @offset:  byte offset of the range
 * @count:   length of the range; clamped so that it ends at m_maxioffset
 * @flags:   exactly one of BMAPI_READ, BMAPI_WRITE or BMAPI_ALLOCATE,
 *           optionally combined with modifier flags such as BMAPI_TRYLOCK,
 *           BMAPI_IGNSTATE, BMAPI_DIRECT and BMAPI_MMAP
 * @iomapp:  out: array receiving the converted mappings
 * @niomaps: in: passed to xfs_imap_to_bmap() alongside @iomapp;
 *           out: value returned by xfs_imap_to_bmap(), or 0 if nothing
 *           was mapped
 *
 * Returns 0 or an error code; EIO after a shutdown and EAGAIN when
 * BMAPI_TRYLOCK was requested and the ilock could not be taken.
 */
int
xfs_iomap(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	ssize_t		count,
	int		flags,
	xfs_iomap_t	*iomapp,
	int		*niomaps)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb, end_fsb;
	int		error = 0;
	int		lockmode = 0;
	xfs_bmbt_irec_t	imap;
	int		nimaps = 1;
	int		bmapi_flags = 0;
	int		iomap_flags = 0;

	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Take the ilock in the mode appropriate for the operation.
	 * lockmode records what has to be dropped at "out"; it is zero
	 * whenever no lock is held.
	 */
	switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
	case BMAPI_READ:
		xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count);
		lockmode = xfs_ilock_map_shared(ip);
		bmapi_flags = XFS_BMAPI_ENTIRE;
		break;
	case BMAPI_WRITE:
		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count);
		lockmode = XFS_ILOCK_EXCL;
		if (flags & BMAPI_IGNSTATE)
			bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
		xfs_ilock(ip, lockmode);
		break;
	case BMAPI_ALLOCATE:
		xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count);
		lockmode = XFS_ILOCK_SHARED;
		bmapi_flags = XFS_BMAPI_ENTIRE;

		/* Attempt non-blocking lock */
		if (flags & BMAPI_TRYLOCK) {
			if (!xfs_ilock_nowait(ip, lockmode))
				return XFS_ERROR(EAGAIN);
		} else {
			xfs_ilock(ip, lockmode);
		}
		break;
	default:
		BUG();
	}

	/* Clamp the range to the maximum supported file offset. */
	ASSERT(offset <= mp->m_maxioffset);
	if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
		count = mp->m_maxioffset - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	/* Look up what is currently mapped; no transaction, no allocation. */
	error = xfs_bmapi(NULL, ip, offset_fsb,
			(xfs_filblks_t)(end_fsb - offset_fsb),
			bmapi_flags,  NULL, 0, &imap,
			&nimaps, NULL, NULL);

	if (error)
		goto out;

	/* BMAPI_READ has no case here: the looked-up mapping is returned. */
	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
	case BMAPI_WRITE:
		/* If we found an extent, return it */
		if (nimaps &&
		    (imap.br_startblock != HOLESTARTBLOCK) &&
		    (imap.br_startblock != DELAYSTARTBLOCK)) {
			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
					offset, count, iomapp, &imap, flags);
			break;
		}

		/*
		 * Hole or delayed allocation: allocate real blocks for
		 * direct I/O and mmap, reserve delayed blocks otherwise.
		 */
		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
			error = xfs_iomap_write_direct(ip, offset, count, flags,
						       &imap, &nimaps, nimaps);
		} else {
			error = xfs_iomap_write_delay(ip, offset, count, flags,
						      &imap, &nimaps);
		}
		if (!error) {
			xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip,
					offset, count, iomapp, &imap, flags);
		}
		/* Set even when the allocation above failed. */
		iomap_flags = IOMAP_NEW;
		break;
	case BMAPI_ALLOCATE:
		/*
		 * The ilock is dropped before anything else happens in this
		 * case; clearing lockmode keeps "out" from unlocking twice.
		 */
		xfs_iunlock(ip, lockmode);
		lockmode = 0;

		/* If we found an extent, return it */
		if (nimaps && !isnullstartblock(imap.br_startblock)) {
			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
					offset, count, iomapp, &imap, flags);
			break;
		}

		error = xfs_iomap_write_allocate(ip, offset, count,
						 &imap, &nimaps);
		break;
	}

	/*
	 * Convert whatever was mapped, even if "error" is set.
	 * NOTE(review): the first branch dereferences niomaps without the
	 * NULL check the second branch has - confirm callers never pass
	 * NULL.
	 */
	if (nimaps) {
		*niomaps = xfs_imap_to_bmap(ip, offset, &imap,
					    iomapp, nimaps, *niomaps, iomap_flags);
	} else if (niomaps) {
		*niomaps = 0;
	}

out:
	if (lockmode)
		xfs_iunlock(ip, lockmode);
	return XFS_ERROR(error);
}
/*
 * Look up @name in directory @dp.
 *
 * The lookup routine is chosen from the directory's on-disk form:
 * shortform (local format), single block, leaf or node.  On success *inum
 * receives the inode number found; if @ci_name is non-NULL a
 * case-insensitive lookup is requested and @ci_name is filled in with the
 * value/length the lookup left in the da_args.
 *
 * A lookup result of -EEXIST is treated as success.  Returns 0 or the error
 * from the format probe / lookup routine.
 */
int
xfs_dir_lookup(
	xfs_trans_t	*tp,
	xfs_inode_t	*dp,
	struct xfs_name	*name,
	xfs_ino_t	*inum,		/* out: inode number */
	struct xfs_name *ci_name)	/* out: actual name if CI match */
{
	struct xfs_da_args *da;
	int		error;
	int		is_fmt;		/* result of the format probes */
	int		ilock_mode;

	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
	XFS_STATS_INC(dp->i_mount, xs_dir_lookup);

	/*
	 * KM_NOFS is used so that lockdep does not report false positive
	 * deadlock warnings on the non-transactional lookup path.  Recursing
	 * into inode reclaim would be safe there, but lockdep cannot easily
	 * be taught that; KM_NOFS avoids having to add a bunch of lockdep
	 * class annotations to the reclaim path for the ilock.
	 */
	da = kmem_zalloc(sizeof(*da), KM_SLEEP | KM_NOFS);
	da->geo = dp->i_mount->m_dir_geo;
	da->name = name->name;
	da->namelen = name->len;
	da->filetype = name->type;
	da->hashval = dp->i_mount->m_dirnameops->hashname(name);
	da->dp = dp;
	da->whichfork = XFS_DATA_FORK;
	da->trans = tp;
	da->op_flags = XFS_DA_OP_OKNOENT;
	if (ci_name)
		da->op_flags |= XFS_DA_OP_CILOOKUP;

	ilock_mode = xfs_ilock_data_map_shared(dp);

	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
		error = xfs_dir2_sf_lookup(da);
	} else {
		error = xfs_dir2_isblock(da, &is_fmt);
		if (error)
			goto out_unlock;

		if (is_fmt) {
			error = xfs_dir2_block_lookup(da);
		} else {
			error = xfs_dir2_isleaf(da, &is_fmt);
			if (error)
				goto out_unlock;

			if (is_fmt)
				error = xfs_dir2_leaf_lookup(da);
			else
				error = xfs_dir2_node_lookup(da);
		}
	}

	/* -EEXIST from the lookup routines is reported as success. */
	if (error == -EEXIST)
		error = 0;

	if (error == 0) {
		*inum = da->inumber;
		if (ci_name) {
			ci_name->name = da->value;
			ci_name->len = da->valuelen;
		}
	}

out_unlock:
	xfs_iunlock(dp, ilock_mode);
	kmem_free(da);
	return error;
}
/* * Make sure the blocks described by maps are stable on disk. This includes * converting any unwritten extents, flushing the disk cache and updating the * time stamps. * * Note that we rely on the caller to always send us a timestamp update so that * we always commit a transaction here. If that stops being true we will have * to manually flush the cache here similar to what the fsync code path does * for datasyncs on files that have no dirty metadata. */ int xfs_fs_commit_blocks( struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; bool update_isize = false; int error, i; loff_t size; ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); xfs_ilock(ip, XFS_IOLOCK_EXCL); size = i_size_read(inode); if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { update_isize = true; size = iattr->ia_size; } for (i = 0; i < nr_maps; i++) { u64 start, length, end; start = maps[i].offset; if (start > size) continue; end = start + maps[i].length; if (end > size) end = size; length = end - start; if (!length) continue; /* * Make sure reads through the pagecache see the new data. */ error = invalidate_inode_pages2_range(inode->i_mapping, start >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); error = xfs_iomap_write_unwritten(ip, start, length); if (error) goto out_drop_iolock; } if (update_isize) { error = xfs_pnfs_validate_isize(ip, size); if (error) goto out_drop_iolock; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) goto out_drop_iolock; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_setattr_time(ip, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_d.di_size = iattr->ia_size; } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; }
/*
 * Reclaim a single inode: remove it from the per-AG radix tree and free it,
 * or leave it queued for a later reclaim pass.
 *
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
 * lists the inode states and the reclaim actions necessary for non-blocking
 * reclaim:
 *
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, delwri ok	0		requeue
 *	dirty, delwri blocked	EAGAIN		requeue
 *	dirty, sync flush	0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * As can be seen from the table, the return value of xfs_iflush() is not
 * sufficient to correctly decide the reclaim action here. The checks in
 * xfs_iflush() might look like duplicates, but they are not.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean. The clean inode check needs to be done before flushing
 * the inode delwri otherwise we would loop forever requeuing clean inodes as
 * we cannot tell apart a successful delwri flush and a clean inode from the
 * return value of xfs_iflush().
 *
 * Note that because the inode is flushed delayed write by background
 * writeback, the flush lock may already be held here and waiting on it can
 * result in very long latencies. Hence for sync reclaims, where we wait on the
 * flush lock, the caller should push out delayed write inodes first before
 * trying to reclaim them to minimise the amount of time spent waiting. For
 * background reclaim, we just requeue the inode for the next pass.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, delwri	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, delwri	=> flush and requeue
 *	dirty, sync	=> flush, wait and reclaim
 *
 * Return value: the requeue path (label "out") always returns 0; the reclaim
 * path returns the value left in "error", which is 0 unless xfs_iflush() was
 * called and failed.
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	int	error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		/* Non-blocking reclaim: leave the inode for a later pass. */
		if (!(sync_mode & SYNC_WAIT))
			goto out;

		/*
		 * If we only have a single dirty inode in a cluster there is
		 * a fair chance that the AIL push may have pushed it into
		 * the buffer, but xfsbufd won't touch it until 30 seconds
		 * from now, and thus we will lock up here.
		 *
		 * Promote the inode buffer to the front of the delwri list
		 * and wake up xfsbufd now.
		 */
		xfs_promote_inode(ip);
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		/* Pinned: requeue unless the caller is willing to wait. */
		if (!(sync_mode & SYNC_WAIT)) {
			xfs_ifunlock(ip);
			goto out;
		}
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
	 * reclaim as we can deadlock with inode cluster removal.
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here. As a result,
	 * doing a blocking xfs_itobp() to get the cluster buffer will result
	 * in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
	 * just unlock the inode, back off and try again. Hopefully the next
	 * pass through will see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
	if (sync_mode & SYNC_WAIT) {
		if (error == EAGAIN) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			/* backoff longer than in xfs_ifree_cluster */
			delay(2);
			goto restart;
		}
		xfs_iflock(ip);
		goto reclaim;
	}

	/*
	 * When we have to flush an inode but don't have SYNC_WAIT set, we
	 * flush the inode out using a delwri buffer and wait for the next
	 * call into reclaim to find it in a clean state instead of waiting for
	 * it now. We also don't return errors here - if the error is transient
	 * then the next reclaim pass will flush the inode, and if the error
	 * is permanent then the next sync reclaim will reclaim the inode and
	 * pass on the error.
	 */
	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_warn(ip->i_mount,
			"inode 0x%llx background reclaim flush failed with %d",
			(long long)ip->i_ino, error);
	}
out:
	/* Requeue: drop the in-progress flag and the ilock, keep the inode. */
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and xfssyncd never goes back to the idle
	 * state. Instead, return 0 to let the next scheduled background reclaim
	 * attempt to reclaim the inode again.
	 */
	return 0;

reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.  We get
	 * both the ilock and the iolock because the code may need to drop the
	 * ilock one but will still hold the iolock.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_inode_free(ip);
	return error;
}
/*
 * Choose a new allocation group for the file being allocated to (ap->ip)
 * and for its file stream.  Per the original note, this is called by
 * xfs_bmap_filestreams() with the mount point's per-ag lock held.
 *
 * If the file's cache entry is found and its parent directory has already
 * moved to another AG, the file simply follows the directory.  Otherwise a
 * new AG is picked starting after the file's old one (or at AG 0 when the
 * file was not cached) and, when the parent is known, both parent and file
 * are associated with it.
 *
 * On return *agp holds the chosen AG; if none could be picked it is set
 * to 0.  Returns 0 or the error from the cache / pick / update helpers.
 */
int
xfs_filestream_new_ag(
	xfs_bmalloca_t	*ap,
	xfs_agnumber_t	*agp)
{
	xfs_inode_t	*ip = ap->ip;
	xfs_inode_t	*pip = NULL;
	xfs_mount_t	*mp = ip->i_mount;
	xfs_mru_cache_t	*cache = mp->m_filestream;
	xfs_extlen_t	minlen = ap->alen;
	xfs_agnumber_t	ag = NULLAGNUMBER;
	fstrm_item_t	*dir_item, *file_item;
	int		pick_flags;
	int		err;

	*agp = NULLAGNUMBER;

	/*
	 * Pull the file's entry out of the cache (if present) so that it can
	 * be held across the directory lookup below.
	 */
	file_item = xfs_mru_cache_remove(cache, ip->i_ino);
	if (file_item) {
		ASSERT(ip == file_item->ip);

		/* Remember the parent inode and the AG the file used so far. */
		pip = file_item->pip;
		ag = file_item->ag;

		dir_item = xfs_mru_cache_lookup(cache, pip->i_ino);
		if (dir_item) {
			ASSERT(pip == dir_item->ip);

			/*
			 * The directory has already migrated: follow it and
			 * move the AG reference from the old AG to the new
			 * one.
			 */
			if (dir_item->ag != file_item->ag) {
				xfs_filestream_put_ag(mp, file_item->ag);
				xfs_filestream_get_ag(mp, dir_item->ag);
				file_item->ag = dir_item->ag;
				*agp = file_item->ag;
			}

			xfs_mru_cache_done(cache);
		}

		/*
		 * Re-insert the file's entry.  On failure the entry must be
		 * torn down exactly as if it had expired from the cache.
		 */
		err = xfs_mru_cache_insert(cache, ip->i_ino, file_item);
		if (err) {
			xfs_fstrm_free_func(ip->i_ino, file_item);
			return err;
		}

		/* The file followed its directory; nothing more to do. */
		if (*agp != NULLAGNUMBER) {
			TRACE_MOVEAG(mp, ip, pip,
					ag, xfs_filestream_peek_ag(mp, ag),
					*agp, xfs_filestream_peek_ag(mp, *agp));
			return 0;
		}
	}

	/*
	 * With a known parent, hold its iolock exclusively so that sibling
	 * files cannot race each other to migrate themselves and the parent
	 * to different AGs.
	 */
	if (pip)
		xfs_ilock(pip, XFS_IOLOCK_EXCL);

	/*
	 * Start the search at the AG after the old one, wrapping at the AG
	 * count, or at AG 0 if the file had no cached AG.
	 */
	if (ag == NULLAGNUMBER)
		ag = 0;
	else
		ag = (ag + 1) % mp->m_sb.sb_agcount;

	pick_flags = 0;
	if (ap->userdata)
		pick_flags |= XFS_PICK_USERDATA;
	if (ap->low)
		pick_flags |= XFS_PICK_LOWSPACE;

	err = _xfs_filestream_pick_ag(mp, ag, agp, pick_flags, minlen);
	if (err || *agp == NULLAGNUMBER)
		goto exit;

	/*
	 * No cache entry means the parent directory is unknown: the file
	 * either pre-dates the stream or its entry has expired.  That usage
	 * is not what the filestreams allocator optimises for, so the new AG
	 * is not recorded in the filestream data structures.
	 */
	if (!pip) {
		TRACE_ORPHAN(mp, ip, *agp);
		goto exit;
	}

	/* Parent first, then the file itself. */
	err = _xfs_filestream_update_ag(pip, NULL, *agp);
	if (err)
		goto exit;

	err = _xfs_filestream_update_ag(ip, pip, *agp);
	if (err)
		goto exit;

	TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
			*agp, xfs_filestream_peek_ag(mp, *agp));

exit:
	/*
	 * Drop the reference _xfs_filestream_pick_ag() took on a valid AG;
	 * the file and directory hold their own if they were cached.  With no
	 * valid AG, fall back to AG 0.
	 */
	if (*agp != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, *agp);
	else
		*agp = 0;

	if (pip)
		xfs_iunlock(pip, XFS_IOLOCK_EXCL);

	return err;
}
/*
 * xfs_filestream_associate() should only be called to associate a regular file
 * with its parent directory.  Calling it with a child directory isn't
 * appropriate because filestreams don't apply to entire directory hierarchies.
 * Creating a file in a child directory of an existing filestream directory
 * starts a new filestream with its own allocation group association.
 *
 * @pip: parent directory inode
 * @ip:  regular file inode to associate with @pip's stream
 *
 * Returns < 0 on error, 0 if successful association occurred, > 0 if
 * we failed to get an association because of locking issues.
 */
int
xfs_filestream_associate(
	xfs_inode_t	*pip,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	fstrm_item_t	*item;
	xfs_agnumber_t	ag, rotorstep, startag;
	int		err = 0;

	/*
	 * The file type must be compared against the whole S_IFMT field.
	 * Testing single bits (mode & S_IFDIR, mode & S_IFREG) is wrong
	 * because the type values overlap: S_IFBLK contains the S_IFDIR bit
	 * and S_IFLNK/S_IFSOCK contain the S_IFREG bit, so such inodes would
	 * be accepted.  S_ISDIR()/S_ISREG() perform the correct comparison.
	 */
	ASSERT(S_ISDIR(pip->i_d.di_mode));
	ASSERT(S_ISREG(ip->i_d.di_mode));
	if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
		return -EINVAL;

	mp = pip->i_mount;
	cache = mp->m_filestream;
	anon_down_read(&mp->m_peraglock);

	/*
	 * We have a problem, Houston.
	 *
	 * Taking the iolock here violates inode locking order - we already
	 * hold the ilock. Hence if we block getting this lock we may never
	 * wake. Unfortunately, that means if we can't get the lock, we're
	 * screwed in terms of getting a stream association - we can't spin
	 * waiting for the lock because someone else is waiting on the lock we
	 * hold and we cannot drop that as we are in a transaction here.
	 *
	 * Lucky for us, this inversion is not a problem because it's a
	 * directory inode that we are trying to lock here.
	 *
	 * So, if we can't get the iolock without sleeping then just give up
	 */
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
		anon_up_read(&mp->m_peraglock);
		return 1;
	}

	/* If the parent directory is already in the cache, use its AG. */
	item = xfs_mru_cache_lookup(cache, pip->i_ino);
	if (item) {
		ASSERT(item->ip == pip);
		ag = item->ag;
		xfs_mru_cache_done(cache);

		TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
		err = _xfs_filestream_update_ag(ip, pip, ag);

		goto exit;
	}

	/*
	 * Set the starting AG using the rotor for inode32, otherwise
	 * use the directory inode's AG.
	 */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		rotorstep = xfs_rotorstep;
		startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
		mp->m_agfrotor = (mp->m_agfrotor + 1) %
		                 (mp->m_sb.sb_agcount * rotorstep);
	} else
		startag = XFS_INO_TO_AGNO(mp, pip->i_ino);

	/* Pick a new AG for the parent inode starting at startag. */
	err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
	if (err || ag == NULLAGNUMBER)
		goto exit_did_pick;

	/* Associate the parent inode with the AG. */
	err = _xfs_filestream_update_ag(pip, NULL, ag);
	if (err)
		goto exit_did_pick;

	/* Associate the file inode with the AG. */
	err = _xfs_filestream_update_ag(ip, pip, ag);
	if (err)
		goto exit_did_pick;

	TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));

exit_did_pick:
	/*
	 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
	 * reference it took on it, since the file and directory will have taken
	 * their own now if they were successfully cached.
	 */
	if (ag != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, ag);

exit:
	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	anon_up_read(&mp->m_peraglock);
	/* Helper errors are positive here; callers expect them negated. */
	return -err;
}
/*
 * Apply the fsxattr fields selected by @mask (FSX_PROJID, FSX_EXTSIZE,
 * FSX_XFLAGS) to inode @ip inside a single transaction.
 *
 * The function first validates everything under the exclusive ilock and
 * only then joins the inode to the transaction and modifies it, so a
 * failed check leaves the inode untouched.  Returns 0 on success or an
 * error code: EROFS on a read-only mount, EIO after a forced shutdown,
 * EPERM for insufficient privilege, EINVAL for an unacceptable extent size
 * or realtime flag change, or whatever the quota / transaction helpers
 * return.
 */
STATIC int
xfs_ioctl_setattr(
	xfs_inode_t		*ip,
	struct fsxattr		*fa,
	int			mask)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
	struct xfs_dquot	*prev_dq = NULL;
	unsigned int		lock_flags = 0;
	int			error;

	xfs_itrace_entry(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);
	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * With quotas enabled, make sure the dquots exist on disk before any
	 * transaction is started; doing it later is messy.  The ids are read
	 * without the ilock because it cannot be held across the transaction
	 * reservation.  If the ids change before the ilock is taken we are
	 * still covered, as the i_*dquot fields get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
		error = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid,
					  ip->i_d.di_gid, fa->fsx_projid,
					  XFS_QMOPT_PQUOTA, &udqp, &gdqp);
		if (error)
			return error;
	}

	/*
	 * Reserve the transaction, then lock the inode and run the
	 * error-checking pass.
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (error)
		goto error_return;

	lock_flags = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_flags);

	/*
	 * The caller must own the file unless it has CAP_FOWNER.
	 */
	if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
		error = XFS_ERROR(EPERM);
		goto error_return;
	}

	/*
	 * Reserve quota only if the project id is really going to change.
	 */
	if ((mask & FSX_PROJID) &&
	    XFS_IS_PQUOTA_ON(mp) &&
	    ip->i_d.di_projid != fa->fsx_projid) {
		ASSERT(tp);
		error = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
					capable(CAP_FOWNER) ?
					XFS_QMOPT_FORCE_RES : 0);
		if (error)	/* out of quota */
			goto error_return;
	}

	if (mask & FSX_EXTSIZE) {
		/*
		 * The extent size cannot change once extents are allocated.
		 */
		if (ip->i_d.di_nextents &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
						fa->fsx_extsize)) {
			error = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * A non-zero extent size must be a multiple of the realtime
		 * extent size (realtime inode, or realtime being requested)
		 * or of the filesystem block size.
		 */
		if (fa->fsx_extsize != 0) {
			xfs_extlen_t	granule;

			if (XFS_IS_REALTIME_INODE(ip) ||
			    ((mask & FSX_XFLAGS) &&
			    (fa->fsx_xflags & XFS_XFLAG_REALTIME)))
				granule = mp->m_sb.sb_rextsize <<
					  mp->m_sb.sb_blocklog;
			else
				granule = mp->m_sb.sb_blocksize;

			if (fa->fsx_extsize % granule) {
				error = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
	}

	if (mask & FSX_XFLAGS) {
		/*
		 * The realtime flag cannot change once extents (real or
		 * delayed) are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (XFS_IS_REALTIME_INODE(ip)) !=
		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
			error = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Requesting realtime needs a realtime section, and the
		 * current extent size must fit the realtime extent size.
		 */
		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME) &&
		    ((mp->m_sb.sb_rblocks == 0) ||
		     (mp->m_sb.sb_rextsize == 0) ||
		     (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))) {
			error = XFS_ERROR(EINVAL);
			goto error_return;
		}

		/*
		 * Touching a file that is, or is about to become,
		 * immutable/append-only needs CAP_LINUX_IMMUTABLE.
		 */
		if (((ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND)) ||
		     (fa->fsx_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			error = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	/* All checks passed: from here on the inode is modified. */
	xfs_trans_ijoin(tp, ip, lock_flags);
	xfs_trans_ihold(tp, ip);

	if (mask & FSX_PROJID) {
		/*
		 * Clear set-user-ID / set-group-ID unless the caller has
		 * CAP_FSETID.
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID))
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);

		/*
		 * Switch the project id and register the quota change in the
		 * transaction.
		 */
		if (ip->i_d.di_projid != fa->fsx_projid) {
			if (XFS_IS_PQUOTA_ON(mp)) {
				prev_dq = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_projid = fa->fsx_projid;

			/*
			 * Project ids did not exist before DINODE_VERSION_2
			 * and SB_VERSION_NLINK, so the inode (and possibly
			 * the superblock version) may need to be bumped.
			 */
			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
				xfs_bump_ino_vers2(tp, ip);
		}
	}

	if (mask & FSX_EXTSIZE)
		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;

	if (mask & FSX_XFLAGS) {
		xfs_set_diflags(ip, fa->fsx_xflags);
		xfs_diflags_to_linux(ip);
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * On a wsync mount the transaction must reach disk before we return
	 * to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp, 0);
	xfs_iunlock(ip, lock_flags);

	/*
	 * Release the dquot the inode held before the change, plus the ones
	 * obtained above.
	 */
	XFS_QM_DQRELE(mp, prev_dq);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	if (error)
		return error;

	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
		XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
				NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
				(mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
	}

	return 0;

 error_return:
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);
	xfs_trans_cancel(tp, 0);
	/* lock_flags is still 0 if we failed before taking the ilock. */
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;
}
/* * Pass in a delayed allocate extent, convert it to real extents; * return to the caller the extent we create which maps on top of * the originating callers request. * * Called without a lock on the inode. * * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, xfs_bmbt_irec_t *map, int *retmap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t end_fsb, map_start_fsb; xfs_fsblock_t first_block; xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; xfs_bmbt_irec_t imap; xfs_trans_t *tp; int nimaps, committed; int error = 0; int nres; *retmap = 0; /* * Make sure that the dquots are there. */ if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return XFS_ERROR(error); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = map->br_blockcount; map_start_fsb = map->br_startoff; XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); while (count_fsb != 0) { /* * Set up a transaction with which to allocate the * backing store for the file. Do allocations in a * loop until we get some space in the range we are * interested in. The other space that might be allocated * is in the delayed allocation extent on which we sit * but before our buffer starts. */ nimaps = 0; while (nimaps == 0) { tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_reserve(tp, nres, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_bmap_init(&free_list, &first_block); /* * it is possible that the extents have changed since * we did the read call as we dropped the ilock for a * while. 
We have to be careful about truncates or hole * punchs here - we are not allowed to allocate * non-delalloc blocks here. * * The only protection against truncation is the pages * for the range we are being asked to convert are * locked and hence a truncate will block on them * first. * * As a result, if we go beyond the range we really * need and hit an delalloc extent boundary followed by * a hole while we have excess blocks in the map, we * will fill the hole incorrectly and overrun the * transaction reservation. * * Using a single map prevents this as we are forced to * check each map we look for overlap with the desired * range and abort as soon as we find it. Also, given * that we only return a single map, having one beyond * what we can return is probably a bit silly. * * We also need to check that we don't go beyond EOF; * this is a truncate optimisation as a truncate sets * the new file size before block on the pages we * currently have locked under writeback. Because they * are about to be tossed, we don't need to write them * back.... 
*/ nimaps = 1; end_fsb = XFS_B_TO_FSB(mp, ip->i_size); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) goto trans_cancel; last_block = XFS_FILEOFF_MAX(last_block, end_fsb); if ((map_start_fsb + count_fsb) > last_block) { count_fsb = last_block - map_start_fsb; if (count_fsb == 0) { error = EAGAIN; goto trans_cancel; } } /* Go get the actual blocks */ error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, XFS_BMAPI_WRITE, &first_block, 1, &imap, &nimaps, &free_list, NULL); if (error) goto trans_cancel; error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto trans_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) goto error0; xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* * See if we were able to allocate an extent that * covers at least part of the callers request */ if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap); if ((offset_fsb >= imap.br_startoff) && (offset_fsb < (imap.br_startoff + imap.br_blockcount))) { *map = imap; *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; } /* * So far we have not mapped the requested part of the * file, just surrounding data, try again. */ count_fsb -= imap.br_blockcount; map_start_fsb = imap.br_startoff + imap.br_blockcount; } trans_cancel: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return XFS_ERROR(error); }
/*
 * Associate regular file @ip with the file stream of its parent directory
 * @pip.
 *
 * If the directory already has a cached stream the file joins its AG.
 * Otherwise a new AG is picked (starting from the AG rotor on inode32
 * mounts, or from the directory inode's own AG) and both the directory and
 * the file are associated with it.
 *
 * Returns -EINVAL if @pip is not a directory or @ip is not a regular file,
 * 1 if the directory's iolock could not be taken without sleeping, and
 * otherwise the negated result of the helper calls (0 on success).
 */
int
xfs_filestream_associate(
	xfs_inode_t	*pip,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	fstrm_item_t	*dir_item;
	xfs_agnumber_t	ag, rotorstep, first_ag;
	int		error = 0;

	ASSERT(S_ISDIR(pip->i_d.di_mode));
	ASSERT(S_ISREG(ip->i_d.di_mode));
	if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
		return -EINVAL;

	mp = pip->i_mount;
	cache = mp->m_filestream;

	/* Never sleep on the directory iolock; report "try again" instead. */
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
		return 1;

	/* A cached directory already has an AG: just join it. */
	dir_item = xfs_mru_cache_lookup(cache, pip->i_ino);
	if (dir_item) {
		ASSERT(dir_item->ip == pip);
		ag = dir_item->ag;
		xfs_mru_cache_done(cache);

		TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
		error = _xfs_filestream_update_ag(ip, pip, ag);

		xfs_iunlock(pip, XFS_IOLOCK_EXCL);
		return -error;
	}

	/*
	 * Choose where the AG search starts: the rotor for inode32 mounts,
	 * otherwise the AG holding the directory inode.
	 */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		rotorstep = xfs_rotorstep;
		first_ag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
		mp->m_agfrotor = (mp->m_agfrotor + 1) %
				 (mp->m_sb.sb_agcount * rotorstep);
	} else {
		first_ag = XFS_INO_TO_AGNO(mp, pip->i_ino);
	}

	error = _xfs_filestream_pick_ag(mp, first_ag, &ag, 0, 0);
	if (!error && ag != NULLAGNUMBER) {
		/* Directory first, then the file. */
		error = _xfs_filestream_update_ag(pip, NULL, ag);
		if (!error)
			error = _xfs_filestream_update_ag(ip, pip, ag);
		if (!error) {
			TRACE_ASSOCIATE(mp, ip, pip, ag,
					xfs_filestream_peek_ag(mp, ag));
		}
	}

	/*
	 * Drop the reference the pick took on a valid AG; the directory and
	 * file hold their own if they were cached successfully.
	 */
	if (ag != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, ag);

	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	return -error;
}
/*
 * Convert the unwritten extents backing the byte range starting at @offset
 * (length @count) of @ip into written extents.
 *
 * The range is processed one extent per transaction: each pass converts
 * whatever xfs_bmapi() returns for the remaining range and then advances
 * past it, until the whole range has been covered.  Returns 0 on success
 * or an error code.
 */
int
xfs_iomap_write_unwritten(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	cur_fsb;
	xfs_filblks_t	left_fsb;
	xfs_filblks_t	done_fsb;
	xfs_fsblock_t	firstfsb;
	xfs_bmbt_irec_t	imap;
	xfs_bmap_free_t	free_list;
	xfs_trans_t	*tp;
	uint		resblks;
	int		nimaps;
	int		committed;
	int		error;

	xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count);

	/* Block range: round the start down and the end up. */
	cur_fsb = XFS_B_TO_FSBT(mp, offset);
	left_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	left_fsb = (xfs_filblks_t)(left_fsb - cur_fsb);

	/*
	 * Reserve enough blocks in this transaction for two complete extent
	 * btree splits.  We may be converting the middle part of an unwritten
	 * extent and in this case we will insert two new extents in the btree
	 * each of which could cause a full split.
	 *
	 * This reservation amount will be used in the first call to
	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
	 * rest of the operation.
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

	do {
		/*
		 * One transaction per pass: convert what we can of the
		 * remaining range from unwritten to real.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
		tp->t_flags |= XFS_TRANS_RESERVE;
		error = xfs_trans_reserve(tp, resblks,
				XFS_WRITE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES,
				XFS_WRITE_LOG_COUNT);
		if (error) {
			xfs_trans_cancel(tp, 0);
			return XFS_ERROR(error);
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * Flip the unwritten state of the extent at the current
		 * position.
		 */
		xfs_bmap_init(&free_list, &firstfsb);
		nimaps = 1;
		error = xfs_bmapi(tp, ip, cur_fsb, left_fsb,
				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
				  1, &imap, &nimaps, &free_list, NULL);
		if (error)
			goto out_cancel;

		error = xfs_bmap_finish(&(tp), &(free_list), &committed);
		if (error)
			goto out_cancel;

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return XFS_ERROR(error);

		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
			return xfs_cmn_err_fsblock_zero(ip, &imap);

		done_fsb = imap.br_blockcount;
		if (done_fsb == 0) {
			/*
			 * Every pass must make progress, otherwise the loop
			 * would never terminate.
			 */
			ASSERT(imap.br_blockcount);
			break;
		}

		cur_fsb += done_fsb;
		left_fsb -= done_fsb;
	} while (left_fsb > 0);

	return 0;

out_cancel:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return XFS_ERROR(error);
}
/*
 * Inode cache miss: allocate a new in-core inode for @ino, read it in via
 * xfs_iread() and insert it into the per-AG radix tree.
 *
 * On success the new inode is stored in *ipp, locked with @lock_flags when
 * those are non-zero, and has XFS_INEW set.
 *
 * Returns 0 on success, ENOMEM if the inode cannot be allocated, ENOENT if
 * the inode has a zero mode and XFS_IGET_CREATE was not requested, EAGAIN if
 * the radix tree preload or insertion fails, or the error from xfs_iread().
 */
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*new_ip;
	xfs_agino_t		agino;
	int			rc;

	new_ip = xfs_inode_alloc(mp, ino);
	if (!new_ip)
		return ENOMEM;

	rc = xfs_iread(mp, tp, new_ip, flags);
	if (rc)
		goto out_destroy;

	trace_xfs_iget_miss(new_ip);

	/* A zero mode is only acceptable when the caller is creating. */
	if (new_ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
		rc = ENOENT;
		goto out_destroy;
	}

	/*
	 * Preload the radix tree so the insert under the spinlock below
	 * cannot fail for lack of memory.  Sleeping is not allowed inside
	 * the preload region.
	 */
	if (radix_tree_preload(GFP_KERNEL)) {
		rc = EAGAIN;
		goto out_destroy;
	}

	/*
	 * The inode is not in the radix tree yet, so nobody else can find
	 * it and the non-sleeping lock attempt must succeed.
	 */
	if (lock_flags && !xfs_ilock_nowait(new_ip, lock_flags))
		BUG();

	/*
	 * Everything below must be set up before the insert: as soon as the
	 * inode is in the tree a concurrent (RCU) lookup can find it, and
	 * that lookup has to see XFS_INEW, i.e. an inode still under
	 * construction.  The i_flags_lock protecting XFS_INEW provides the
	 * memory barrier that makes this detection work at lookup time.
	 */
	new_ip->i_gdquot = NULL;
	new_ip->i_udquot = NULL;
	xfs_iflags_set(new_ip, XFS_INEW);

	agino = XFS_INO_TO_AGINO(mp, ino);

	/* Publish the new inode. */
	spin_lock(&pag->pag_ici_lock);
	rc = radix_tree_insert(&pag->pag_ici_root, agino, new_ip);
	if (unlikely(rc)) {
		/* Anything other than a duplicate entry is unexpected. */
		WARN_ON(rc != -EEXIST);
		XFS_STATS_INC(xs_ig_dup);
		rc = EAGAIN;

		spin_unlock(&pag->pag_ici_lock);
		radix_tree_preload_end();
		if (lock_flags)
			xfs_iunlock(new_ip, lock_flags);
		goto out_destroy;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = new_ip;
	return 0;

out_destroy:
	__destroy_inode(VFS_I(new_ip));
	xfs_inode_free(new_ip);
	return rc;
}
/*
 * fsync/fdatasync for an XFS file.
 *
 * Writes back and waits on the page cache range [start, end], then forces
 * the log up to the last LSN that touched the inode (if it is pinned), and
 * issues block device cache flushes where the mount has barriers enabled.
 *
 * The error from filemap_write_and_wait_range() is returned unchanged;
 * internal (positive) XFS errors are negated before being returned.
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		force_lsn = 0;
	int			flushed = 0;
	int			error = 0;

	trace_xfs_file_fsync(ip);

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * With an RT and/or external log subvolume the device holding the
	 * file data must have its write cache flushed first, so that newly
	 * written data reaches disk before the new inode size is logged for
	 * an extending write.
	 */
	if (mp->m_flags & XFS_MOUNT_BARRIER) {
		if (XFS_IS_REALTIME_INODE(ip)) {
			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
		} else if (mp->m_logdev_targp != mp->m_ddev_targp) {
			xfs_blkdev_issue_flush(mp->m_ddev_targp);
		}
	}

	/*
	 * All metadata updates are logged, so it is enough to force the log
	 * up to the latest LSN that touched the inode.  For datasync a
	 * timestamp-only change does not require a log force.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip) &&
	    (!datasync || (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)))
		force_lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (force_lsn)
		error = _xfs_log_force_lsn(mp, force_lsn, XFS_LOG_SYNC,
					   &flushed);

	/*
	 * On a single-device filesystem, if the log force above was a no-op
	 * the data device cache may still need flushing here.  That can only
	 * happen for fdatasync/O_DSYNC when overwriting already allocated
	 * blocks, where there is no metadata to commit.
	 */
	if (mp->m_flags & XFS_MOUNT_BARRIER) {
		if (mp->m_logdev_targp == mp->m_ddev_targp &&
		    !XFS_IS_REALTIME_INODE(ip) &&
		    !flushed)
			xfs_blkdev_issue_flush(mp->m_ddev_targp);
	}

	return -error;
}
/*
 * Pass in a delayed allocate extent, convert it to real extents;
 * return to the caller the extent we create which maps on top of
 * the originating callers request.
 *
 * Called without a lock on the inode.
 *
 * Parameters:
 *   ip     - inode whose delayed allocation is being converted
 *   offset - byte offset of the caller's request; converted to offset_fsb
 *   count  - not referenced by this function body
 *   map    - in: the delayed allocate extent (br_startoff/br_blockcount are
 *            read); out: the allocated extent covering offset_fsb
 *   retmap - set to 0 on entry and to 1 when *map has been filled in
 *
 * Returns 0 with *retmap == 1 on success.  Error returns include the result
 * of XFS_QM_DQATTACH(), xfs_trans_reserve(), XFS_BMAPI(), xfs_bmap_finish()
 * and xfs_trans_commit(), EAGAIN when the range has been clamped to nothing
 * by the end-of-file check, and the result of xfs_cmn_err_fsblock_zero()
 * when a zero start block is returned for a non-realtime file.
 */
int
xfs_iomap_write_allocate(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count,
	xfs_bmbt_irec_t *map,
	int		*retmap)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_iocore_t    *io = &ip->i_iocore;
	xfs_fileoff_t	offset_fsb, last_block;
	xfs_fileoff_t	end_fsb, map_start_fsb;
	xfs_fsblock_t	first_block;
	xfs_bmap_free_t	free_list;
	xfs_filblks_t	count_fsb;
	xfs_bmbt_irec_t	imap[XFS_STRAT_WRITE_IMAPS];
	xfs_trans_t	*tp;
	int		i, nimaps, committed;
	int		error = 0;
	int		nres;

	*retmap = 0;

	/*
	 * Make sure that the dquots are there.
	 */
	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return XFS_ERROR(error);

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	count_fsb = map->br_blockcount;
	map_start_fsb = map->br_startoff;

	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));

	/*
	 * Outer loop: keep allocating until an extent covering offset_fsb
	 * has been produced or the remaining block count reaches zero.
	 */
	while (count_fsb != 0) {
		/*
		 * Set up a transaction with which to allocate the
		 * backing store for the file.  Do allocations in a
		 * loop until we get some space in the range we are
		 * interested in.  The other space that might be allocated
		 * is in the delayed allocation extent on which we sit
		 * but before our buffer starts.
		 */

		nimaps = 0;
		/* Inner loop: retry until XFS_BMAPI() returns at least one map. */
		while (nimaps == 0) {
			tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
			tp->t_flags |= XFS_TRANS_RESERVE;
			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
			error = xfs_trans_reserve(tp, nres,
					XFS_WRITE_LOG_RES(mp),
					0, XFS_TRANS_PERM_LOG_RES,
					XFS_WRITE_LOG_COUNT);
			if (error) {
				/* The inode is not locked yet on this path. */
				xfs_trans_cancel(tp, 0);
				return XFS_ERROR(error);
			}
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			xfs_trans_ihold(tp, ip);

			XFS_BMAP_INIT(&free_list, &first_block);

			nimaps = XFS_STRAT_WRITE_IMAPS;
			/*
			 * Ensure we don't go beyond eof - it is possible
			 * the extents changed since we did the read call,
			 * we dropped the ilock in the interim.
			 */

			end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
			xfs_bmap_last_offset(NULL, ip, &last_block,
				XFS_DATA_FORK);
			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
			if ((map_start_fsb + count_fsb) > last_block) {
				count_fsb = last_block - map_start_fsb;
				if (count_fsb == 0) {
					error = EAGAIN;
					goto trans_cancel;
				}
			}

			/* Go get the actual blocks */
			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
					XFS_BMAPI_WRITE, &first_block, 1,
					imap, &nimaps, &free_list, NULL);
			if (error)
				goto trans_cancel;

			error = xfs_bmap_finish(&tp, &free_list, &committed);
			if (error)
				goto trans_cancel;

			/*
			 * A failed commit skips the cancel and only drops
			 * the inode lock (error0).
			 */
			error = xfs_trans_commit(tp,
					XFS_TRANS_RELEASE_LOG_RES);
			if (error)
				goto error0;

			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		/*
		 * See if we were able to allocate an extent that
		 * covers at least part of the callers request
		 */
		for (i = 0; i < nimaps; i++) {
			if (unlikely(!imap[i].br_startblock &&
				     !(io->io_flags & XFS_IOCORE_RT)))
				return xfs_cmn_err_fsblock_zero(ip, &imap[i]);
			if ((offset_fsb >= imap[i].br_startoff) &&
			    (offset_fsb < (imap[i].br_startoff +
					   imap[i].br_blockcount))) {
				*map = imap[i];
				*retmap = 1;
				XFS_STATS_INC(xs_xstrat_quick);
				return 0;
			}
			count_fsb -= imap[i].br_blockcount;
		}

		/* So far we have not mapped the requested part of the
		 * file, just surrounding data, try again.
		 */
		nimaps--;
		map_start_fsb = imap[nimaps].br_startoff +
				imap[nimaps].br_blockcount;
	}

	/*
	 * NOTE(review): if the outer loop ends because count_fsb reached
	 * zero, control falls through into trans_cancel with a transaction
	 * that has already been committed and with the ILOCK already
	 * dropped, and error == 0 is returned with *retmap still 0.
	 * Presumably this is not expected to happen - TODO confirm.
	 */
trans_cancel:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
error0:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return XFS_ERROR(error);
}
/*
 * Zero the on-disk space lying between the current EOF and a new, larger
 * EOF.
 *
 * Two cases are covered: the common one of zeroing the tail of the last
 * block of the file, and the unusual one of zeroing whole blocks beyond the
 * file size.  The latter only occurs with fixed size extents when the system
 * crashed after blocks were allocated but before the inode size was updated.
 *
 * The caller must hold the iolock exclusively; the ilock is taken and
 * dropped internally around each extent lookup.
 */
int					/* error (positive) */
xfs_zero_eof(
	struct xfs_inode	*ip,
	xfs_off_t		offset,		/* starting I/O offset */
	xfs_fsize_t		isize)		/* current inode size */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		last_fsb;
	xfs_fileoff_t		start_zero_fsb;
	xfs_fileoff_t		end_zero_fsb;
	xfs_fileoff_t		zero_count_fsb;
	xfs_fileoff_t		zero_off;
	xfs_fsize_t		zero_len;
	int			nimaps;
	int			error = 0;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	/*
	 * The block containing isize is only partially zeroed, so it gets
	 * its own helper and is dealt with first.
	 */
	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
		error = xfs_zero_last_block(ip, offset, isize);
		if (error)
			return error;
	}

	/*
	 * Work out the block range between the old and the new size in which
	 * blocks needing to be zeroed may exist.
	 *
	 * The block holding the last byte of the file is found by taking
	 * size - 1 and truncating to a block boundary; the subtraction
	 * handles a size that sits exactly on a block boundary.
	 */
	if (isize)
		last_fsb = XFS_B_TO_FSBT(mp, isize - 1);
	else
		last_fsb = (xfs_fileoff_t)-1;
	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);

	/*
	 * The size only grew within its last block, which was handled
	 * above, so there is nothing more to do.
	 */
	if (last_fsb == end_zero_fsb)
		return 0;

	ASSERT(start_zero_fsb <= end_zero_fsb);
	while (start_zero_fsb <= end_zero_fsb) {
		nimaps = 1;
		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
				       &imap, &nimaps, 0);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		ASSERT(nimaps > 0);

		/*
		 * Holes and unwritten extents need no zeroing; anything else
		 * has real blocks that must be zeroed up to the new offset.
		 */
		if (imap.br_state != XFS_EXT_UNWRITTEN &&
		    imap.br_startblock != HOLESTARTBLOCK) {
			zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
			zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);

			if ((zero_off + zero_len) > offset)
				zero_len = offset - zero_off;

			error = xfs_iozero(ip, zero_off, zero_len);
			if (error)
				return error;
		}

		/* Step past the extent that was just examined. */
		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
	}

	return 0;
}
/*
 * Look up the extent mapping for the block at @offset of @inode and return
 * it in @imap.
 *
 * For XFS_IO_DELALLOC requests where no mapping is found, or the mapping
 * found has a null start block, the range is handed to
 * xfs_iomap_write_allocate().
 *
 * Returns 0 on success, -EIO if the filesystem has been shut down, -EAGAIN
 * if @nonblocking is set and the ilock cannot be taken without blocking, or
 * the error from xfs_bmapi_read() / xfs_iomap_write_allocate().
 */
STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			bmapi_flags;
	int			nimaps = 1;
	int			need_alloc;
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* Unwritten I/O additionally ignores the extent state. */
	bmapi_flags = XFS_BMAPI_ENTIRE;
	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	/* Try the lock without blocking; only sleep on it when permitted. */
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -EAGAIN;
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	/* Clamp the range so it does not run past s_maxbytes. */
	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	if (error)
		return error;

	need_alloc = (type == XFS_IO_DELALLOC) &&
		     (!nimaps || isnullstartblock(imap->br_startblock));
	if (need_alloc) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type,
						   imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}
int xfs_setattr_nonsize( struct xfs_inode *ip, struct iattr *iattr, int flags) { xfs_mount_t *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_trans_t *tp; int error; kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; trace_xfs_setattr(ip); /* If acls are being inherited, we already have this checked */ if (!(flags & XFS_ATTR_NOACL)) { if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); error = -inode_change_ok(inode, iattr); if (error) return XFS_ERROR(error); } ASSERT((mask & ATTR_SIZE) == 0); /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids * in inode here, because we can't hold it across the trans_reserve. * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; } /* * We take a reference when we initialize udqp and gdqp, * so it is important that we never blindly double trip on * the same variable. See xfs_create() for an example. 
*/ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), xfs_kgid_to_gid(gid), xfs_get_projid(ip), qflags, &udqp, &gdqp, NULL); if (error) return error; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); /* * Change file ownership. Must be the owner or privileged. */ if (mask & (ATTR_UID|ATTR_GID)) { /* * These IDs could have changed since we last looked at them. * But, we're assured that if the ownership did change * while we didn't have the inode locked, inode's dquot(s) * would have changed also. */ iuid = inode->i_uid; igid = inode->i_gid; gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; /* * Do a quota reservation only if uid/gid is actually * going to change. */ if (XFS_IS_QUOTA_RUNNING(mp) && ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { ASSERT(tp); error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ goto out_trans_cancel; } } xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. */ if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* * Change the ownerships and register quota modifications * in the transaction. 
*/ if (!uid_eq(iuid, uid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } /* * Change file access modes. */ if (mask & ATTR_MODE) xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. */ if (mask & ATTR_ATIME) { inode->i_atime = iattr->ia_atime; ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot1); xfs_qm_dqrele(olddquot2); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); if (error) return XFS_ERROR(error); /* * XXX(hch): Updating the ACL entries is not atomic vs the i_mode * update. We could avoid this with linked transactions * and passing down the transaction pointer all the way * to attr_set. No previous user of the generic * Posix ACL code seems to care about this issue either. 
*/ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { error = -xfs_acl_chmod(inode); if (error) return XFS_ERROR(error); } return 0; out_trans_cancel: xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; }
/* * Readdir for block directories. */ STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, struct dir_context *ctx) { struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ char *endptr; /* end of the data entries */ int error; /* error return value */ char *ptr; /* current data entry */ int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; /* * If the block number in the offset is out of range, we're done. */ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(NULL, dp, &bp); xfs_iunlock(dp, lock_mode); if (error) return error; /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. */ btp = xfs_dir2_block_tail_p(geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ while (ptr < endptr) { __uint8_t filetype; dup = (xfs_dir2_data_unused_t *)ptr; /* * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ptr += be16_to_cpu(dup->length); continue; } dep = (xfs_dir2_data_entry_t *)ptr; /* * Bump pointer for the next iteration. */ ptr += dp->d_ops->data_entsize(dep->namelen); /* * The entry is before the desired starting point, skip it. 
*/ if ((char *)dep - (char *)hdr < wantoff) continue; cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, (char *)dep - (char *)hdr); ctx->pos = cook & 0x7fffffff; filetype = dp->d_ops->data_get_ftype(dep); /* * If it didn't fit, set the final offset to here & return. */ if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) { xfs_trans_brelse(NULL, bp); return 0; } } /* * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; xfs_trans_brelse(NULL, bp); return 0; }
/*
 * Truncate file.  Must have write permission and not be a directory.
 *
 * Changes the size of @ip to iattr->ia_size, growing or shrinking the file,
 * and applies any mode/ctime/mtime changes selected in iattr->ia_valid in
 * the same transaction.
 *
 * The caller must hold the iolock exclusively (asserted below); the ilock is
 * taken here once the transaction has been reserved.
 *
 * Returns 0 on success or an error code.  Error values coming from VFS
 * helpers (inode_change_ok, filemap_write_and_wait_range,
 * block_truncate_page) are negated before being returned.
 */
int
xfs_setattr_size(
	struct xfs_inode	*ip,
	struct iattr		*iattr)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			mask = iattr->ia_valid;
	xfs_off_t		oldsize, newsize;
	struct xfs_trans	*tp;
	int			error;
	uint			lock_flags = 0;		/* locks to drop at out_unlock */
	uint			commit_flags = 0;	/* flags for xfs_trans_cancel() */

	trace_xfs_setattr(ip);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = -inode_change_ok(inode, iattr);
	if (error)
		return XFS_ERROR(error);

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(S_ISREG(ip->i_d.di_mode));
	ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
			ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);

	oldsize = inode->i_size;
	newsize = iattr->ia_size;

	/*
	 * Short circuit the truncate case for zero length files.
	 */
	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
		if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
			return 0;

		/*
		 * Use the regular setattr path to update the timestamps.
		 */
		iattr->ia_valid &= ~ATTR_SIZE;
		return xfs_setattr_nonsize(ip, iattr, 0);
	}

	/*
	 * Make sure that the dquots are attached to the inode.
	 */
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	/*
	 * Now we can make the changes.  Before we join the inode to the
	 * transaction, take care of the part of the truncation that must be
	 * done without the inode lock.  This needs to be done before joining
	 * the inode to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (newsize > oldsize) {
		/*
		 * Do the first part of growing a file: zero any data in the
		 * last block that is beyond the old EOF.  We need to do this
		 * before the inode is joined to the transaction to modify
		 * i_size.
		 */
		error = xfs_zero_eof(ip, newsize, oldsize);
		if (error)
			return error;
	}

	/*
	 * We are going to log the inode size change in this transaction so
	 * any previous writes that are beyond the on disk EOF and the new
	 * EOF that have not been written out need to be written here.  If we
	 * do not write the data out, we expose ourselves to the null files
	 * problem.
	 *
	 * Only flush from the on disk size to the smaller of the in memory
	 * file size or the new size as that's the range we really care about
	 * here and prevents waiting for other data not within the range we
	 * care about here.
	 */
	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      ip->i_d.di_size, newsize);
		if (error)
			return error;
	}

	/*
	 * Wait for all direct I/O to complete.
	 */
	inode_dio_wait(inode);

	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
	if (error)
		return error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
	if (error)
		goto out_trans_cancel;	/* lock_flags and commit_flags are still 0 here */

	truncate_setsize(inode, newsize);

	/*
	 * From here on the error paths must release the log reservation and
	 * drop the ilock.
	 */
	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
	lock_flags |= XFS_ILOCK_EXCL;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it.  This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		mask |= ATTR_CTIME | ATTR_MTIME;
	}

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	ip->i_d.di_size = newsize;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (newsize <= oldsize) {
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
		if (error)
			goto out_trans_abort;

		/*
		 * Truncated "down", so we're removing references to old data
		 * here - if we delay flushing for a long time, we expose
		 * ourselves unduly to the notorious NULL files problem.  So,
		 * we mark this inode and flush it when the file is closed,
		 * and do not wait the usual (long) time for writeout.
		 */
		xfs_iflags_set(ip, XFS_ITRUNCATED);

		/* A truncate down always removes post-EOF blocks. */
		xfs_inode_clear_eofblocks_tag(ip);
	}

	/*
	 * Change file access modes.
	 */
	if (mask & ATTR_MODE)
		xfs_setattr_mode(tp, ip, iattr);

	if (mask & ATTR_CTIME) {
		inode->i_ctime = iattr->ia_ctime;
		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
	}
	if (mask & ATTR_MTIME) {
		inode->i_mtime = iattr->ia_mtime;
		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	XFS_STATS_INC(xs_ig_attrchg);

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
out_unlock:
	/* lock_flags is only non-zero once the ilock has been taken above. */
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
	return error;

out_trans_abort:
	commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
	xfs_trans_cancel(tp, commit_flags);
	goto out_unlock;
}
/* * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a * a locked dquot, doing an allocation (if requested) as needed. * When both an inode and an id are given, the inode's id takes precedence. * That is, if the id changes while we don't hold the ilock inside this * function, the new dquot is returned, not necessarily the one requested * in the id argument. */ int xfs_qm_dqget( xfs_mount_t *mp, xfs_inode_t *ip, /* locked inode (optional) */ xfs_dqid_t id, /* uid/projid/gid depending on type */ uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { return (ESRCH); } #ifdef DEBUG if (xfs_do_dqerror) { if ((xfs_dqerror_target == mp->m_ddev_targp) && (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { xfs_debug(mp, "Returning error in dqget"); return (EIO); } } ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); if (ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(xfs_inode_dquot(ip, type) == NULL); } #endif restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); if (dqp) { xfs_dqlock(dqp); if (dqp->dq_flags & XFS_DQ_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); XFS_STATS_INC(xs_qm_dqcachehits); *O_dqpp = dqp; return 0; } mutex_unlock(&qi->qi_tree_lock); XFS_STATS_INC(xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across * a (potential) disk read. 
Also we don't want to deal with the lock * ordering between quotainode and this inode. OTOH, dropping the inode * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ if (ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, flags, &dqp); if (ip) xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; if (ip) { /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ if (xfs_this_quota_on(mp, type)) { struct xfs_dquot *dqp1; dqp1 = xfs_inode_dquot(ip, type); if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; xfs_dqlock(dqp); goto dqret; } } else { /* inode stays locked on return */ xfs_qm_dqdestroy(dqp); return XFS_ERROR(ESRCH); } } mutex_lock(&qi->qi_tree_lock); error = -radix_tree_insert(tree, id, dqp); if (unlikely(error)) { WARN_ON(error != EEXIST); /* * Duplicate found. Just throw away the new dquot and start * over. */ mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); XFS_STATS_INC(xs_qm_dquot_dups); goto restart; } /* * We return a locked dquot to the caller, with a reference taken */ xfs_dqlock(dqp); dqp->q_nrefs = 1; qi->qi_dquots++; mutex_unlock(&qi->qi_tree_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return (0); }
/*
 * Turn a userspace file handle into an inode.
 *
 * Every fsop_handlereq ioctl passes a structure whose first member is an
 * xfs_fsop_handlereq_t, so callers hand that leading sub-structure to this
 * shared routine.
 *
 * On success *inode holds the resulting inode and the caller must iput it.
 * Returns 0, or ENOTDIR / EINVAL / EFAULT / EIO / ENOENT or the error from
 * xfs_iget() on failure.
 */
STATIC int
xfs_vget_fsop_handlereq(
	xfs_mount_t		*mp,
	struct inode		*parinode,	/* parent inode pointer    */
	xfs_fsop_handlereq_t	*hreq,
	struct inode		**inode)
{
	xfs_handle_t		handle;
	xfs_fid_t		*xfid;
	xfs_inode_t		*ip;
	xfs_ino_t		ino;
	size_t			hlen;
	__u32			igen;
	int			error;

	/*
	 * Only allow handle opens under a directory.
	 */
	if (!S_ISDIR(parinode->i_mode))
		return XFS_ERROR(ENOTDIR);

	/* The handle must hold at least an fsid and fit in xfs_handle_t. */
	hlen = hreq->ihandlen;
	if (hlen < sizeof(handle.ha_fsid) || hlen > sizeof(handle))
		return XFS_ERROR(EINVAL);

	if (copy_from_user(&handle, hreq->ihandle, hlen))
		return XFS_ERROR(EFAULT);

	/* Zero whatever part of the handle userspace did not supply. */
	if (hlen < sizeof(handle))
		memset(((char *)&handle) + hlen, 0, sizeof(handle) - hlen);

	/* When a fid follows the fsid its length and padding must be sane. */
	if (hlen > sizeof(handle.ha_fsid)) {
		if (handle.ha_fid.fid_len !=
		    (hlen - sizeof(handle.ha_fsid) -
			    sizeof(handle.ha_fid.fid_len)) ||
		    handle.ha_fid.fid_pad)
			return XFS_ERROR(EINVAL);
	}

	/*
	 * Crack the handle, obtain the inode # & generation #
	 */
	xfid = (struct xfs_fid *)&handle.ha_fid;
	if (xfid->fid_len != sizeof(*xfid) - sizeof(xfid->fid_len))
		return XFS_ERROR(EINVAL);
	ino = xfid->fid_ino;
	igen = xfid->fid_gen;

	/*
	 * Get the XFS inode, building a Linux inode to go with it.
	 */
	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
	if (error)
		return error;
	if (ip == NULL)
		return XFS_ERROR(EIO);

	/* A generation mismatch means the handle refers to a stale inode. */
	if (ip->i_d.di_gen != igen) {
		xfs_iput_new(ip, XFS_ILOCK_SHARED);
		return XFS_ERROR(ENOENT);
	}

	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	*inode = VFS_I(ip);
	return 0;
}
/* * Allocate a block and fill it with dquots. * This is called when the bmapi finds a hole. */ STATIC int xfs_qm_dqalloc( xfs_trans_t **tpp, xfs_mount_t *mp, xfs_dquot_t *dqp, xfs_inode_t *quotip, xfs_fileoff_t offset_fsb, xfs_buf_t **O_bpp) { xfs_fsblock_t firstblock; xfs_bmap_free_t flist; xfs_bmbt_irec_t map; int nmaps, error, committed; xfs_buf_t *bp; xfs_trans_t *tp = *tpp; ASSERT(tp != NULL); xfs_dqtrace_entry(dqp, "DQALLOC"); /* * Initialize the bmap freelist prior to calling bmapi code. */ xfs_bmap_init(&flist, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ if (XFS_IS_THIS_QUOTA_OFF(dqp)) { xfs_iunlock(quotip, XFS_ILOCK_EXCL); return (ESRCH); } /* * xfs_trans_commit normally decrements the vnode ref count * when it unlocks the inode. Since we want to keep the quota * inode around, we bump the vnode ref count now. */ IHOLD(quotip); xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; if ((error = xfs_bmapi(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), &map, &nmaps, &flist, NULL))) { goto error0; } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); /* * Keep track of the blkno to save a lookup later */ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, XFS_QI_DQCHUNKLEN(mp), 0); if (!bp || (error = XFS_BUF_GETERROR(bp))) goto error1; /* * Make a chunk of dquots out of this buffer and log * the entire thing. */ xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), dqp->dq_flags & XFS_DQ_ALLTYPES, bp); /* * xfs_bmap_finish() may commit the current transaction and * start a second transaction if the freelist is not empty. 
* * Since we still want to modify this buffer, we need to * ensure that the buffer is not released on commit of * the first transaction and ensure the buffer is added to the * second transaction. * * If there is only one transaction then don't stop the buffer * from being released when it commits later on. */ xfs_trans_bhold(tp, bp); if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { goto error1; } if (committed) { tp = *tpp; xfs_trans_bjoin(tp, bp); } else { xfs_trans_bhold_release(tp, bp); } *O_bpp = bp; return 0; error1: xfs_bmap_cancel(&flist); error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); return (error); }
/*
 * Called by xfs_trans_push_ail() when IOP_TRYLOCK could not get the inode
 * flush lock but did get the inode locked SHARED.  We look for the inode
 * buffer incore and, if it is marked delayed write, start a bawrite on it
 * to move things along.
 *
 * Neither the AIL_LOCK nor the flush lock is held while this runs, so
 * every check made here is inherently racy.
 *
 * On every path the pushbuf flag is cleared and the SHARED inode lock is
 * dropped before returning.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip = iip->ili_inode;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		push_it;

	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));

	/*
	 * ili_pushbuf_flag is what stops other threads from duplicating
	 * the work done here.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == get_thread_id());

	/*
	 * A flush lock that is no longer held, or an item that has left
	 * the AIL, most likely means the inode flush already completed.
	 * Nothing to do in that case.
	 */
	if (valusema(&(ip->i_flock)) > 0 ||
	    !(iip->ili_item.li_flags & XFS_LI_IN_AIL))
		goto out_unlock;

	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targ, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	/*
	 * No buffer was obtained.  The pushbuf flag is deliberately kept
	 * set up to this point: in theory it could be cleared as soon as
	 * the buffer lock is held, but clearing it earlier would only
	 * invite others to come in here and find the buffer still locked
	 * by someone doing the same job.
	 */
	if (bp == NULL)
		goto out_unlock;

	if (!XFS_BUF_ISDELAYWRITE(bp)) {
		/* Buffer is not delayed write: just let go of everything. */
		iip->ili_pushbuf_flag = 0;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		xfs_buf_relse(bp);
		return;
	}

	/*
	 * We hold the buffer and know it is dirty, but iflush may have
	 * raced with us (no AIL_LOCK, no flush lock) and already removed
	 * this item from the AIL.  Decide whether to push while the inode
	 * lock is still held; only write the buffer async if the item is
	 * still in the AIL and the flush lock is still taken.
	 */
	push_it = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
		   (valusema(&(ip->i_flock)) <= 0));
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	xfs_buftrace("INODE ITEM PUSH", bp);
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);

	if (push_it)
		xfs_bawrite(mp, bp);
	else
		xfs_buf_relse(bp);
	return;

out_unlock:
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return;
}