STATIC void xfs_fstrm_free_func( unsigned long ino, void *data) { fstrm_item_t *item = (fstrm_item_t *)data; xfs_inode_t *ip = item->ip; ASSERT(ip->i_ino == ino); xfs_iflags_clear(ip, XFS_IFILESTREAM); xfs_filestream_put_ag(ip->i_mount, item->ag); TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, xfs_filestream_peek_ag(ip->i_mount, item->ag)); IRELE(ip); if (item->pip) IRELE(item->pip); kmem_zone_free(item_zone, item); }
STATIC int xfs_file_fsync( struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure * to flush the write cache the device used for file data * first. This is to ensure newly written file data make * it to disk before logging the new inode size in case of * an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(mp->m_rtdev_targp); else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); } xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; } xfs_iunlock(ip, XFS_ILOCK_SHARED); if (lsn) error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); if ((mp->m_flags & XFS_MOUNT_BARRIER) && mp->m_logdev_targp == mp->m_ddev_targp && !XFS_IS_REALTIME_INODE(ip) && !log_flushed) xfs_blkdev_issue_flush(mp->m_ddev_targp); return -error; }
/* * We ignore the datasync flag here because a datasync is effectively * identical to an fsync. That is, datasync implies that we need to write * only the metadata needed to be able to access the data that is written * if we crash after the call completes. Hence if we are writing beyond * EOF we have to log the inode size change as well, which makes it a * full fsync. If we don't write beyond EOF, the inode core will be * clean in memory and so we don't need to log the inode, just like * fsync. */ STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); return -xfs_fsync(XFS_I(dentry->d_inode)); }
STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { int flags = FSYNC_WAIT; if (datasync) flags |= FSYNC_DATA; xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); return -xfs_fsync(XFS_I(dentry->d_inode), flags, (xfs_off_t)0, (xfs_off_t)-1); }
/* * Revalidate the Linux inode from the XFS inode. * Note: i_size _not_ updated; we must hold the inode * semaphore when doing that - callers responsibility. */ int vn_revalidate( bhv_vnode_t *vp) { struct inode *inode = vn_to_inode(vp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; unsigned long xflags; xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); inode->i_mode = ip->i_d.di_mode; inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xflags = xfs_ip2xflags(ip); if (xflags & XFS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; if (xflags & XFS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; if (xflags & XFS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; if (xflags & XFS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_iflags_clear(ip, XFS_IMODIFIED); return 0; }
int xfs_flushinval_pages( xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, int fiopt) { struct address_space *mapping = ip->i_vnode->i_mapping; int ret = 0; if (mapping->nrpages) { xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = filemap_write_and_wait(mapping); if (!ret) truncate_inode_pages(mapping, first); } return ret; }
int xfs_flushinval_pages( xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, int fiopt) { struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; trace_xfs_pagecache_inval(ip, first, last); xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = filemap_write_and_wait_range(mapping, first, last == -1 ? LLONG_MAX : last); if (!ret) truncate_inode_pages_range(mapping, first, last); return -ret; }
/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ STATIC void xfs_fstrm_free_func( unsigned long ino, void *data) { fstrm_item_t *item = (fstrm_item_t *)data; xfs_inode_t *ip = item->ip; int ref; ASSERT(ip->i_ino == ino); xfs_iflags_clear(ip, XFS_IFILESTREAM); /* Drop the reference taken on the AG when the item was added. */ ref = xfs_filestream_put_ag(ip->i_mount, item->ag); ASSERT(ref >= 0); TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, xfs_filestream_peek_ag(ip->i_mount, item->ag)); /* * _xfs_filestream_update_ag() always takes a reference on the inode * itself, whether it's a file or a directory. Release it here. * This can result in the inode being freed and so we must * not hold any inode locks when freeing filesstreams objects * otherwise we can deadlock here. */ IRELE(ip); /* * In the case of a regular file, _xfs_filestream_update_ag() also * takes a ref on the parent inode to keep it in-core. Release that * too. */ if (item->pip) IRELE(item->pip); /* Finally, free the memory allocated for the item. */ kmem_zone_free(item_zone, item); }
int xfs_flush_pages( xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt) { struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; int ret2; xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = -filemap_fdatawrite_range(mapping, first, last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); if (!ret) ret = ret2; return ret; }
int xfs_flush_pages( xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt) { struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; int ret2; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = -filemap_fdatawrite(mapping); } if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); if (!ret) ret = ret2; return ret; }
/* * Initialize the Linux inode, set up the operation vectors and * unlock the inode. * * When reading existing inodes from disk this is called directly * from xfs_iget, when creating a new inode it is called from * xfs_ialloc after setting up the inode. * * We are always called with an uninitialised linux inode here. * We need to initialise the necessary fields and take a reference * on it. */ void xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; inode->i_state = I_NEW; inode_sb_list_add(inode); insert_inode_hash(inode); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: inode->i_rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, sysv_minor(ip->i_df.if_u2.if_rdev)); break; default: inode->i_rdev = 0; break; } inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; if (!(ip->i_df.if_flags & XFS_IFINLINE)) inode->i_mapping->a_ops = &xfs_address_space_operations; break; default: inode->i_op = &xfs_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(inode); }
/* * Initialize the Linux inode, set up the operation vectors and * unlock the inode. * * When reading existing inodes from disk this is called directly * from xfs_iget, when creating a new inode it is called from * xfs_ialloc after setting up the inode. * * We are always called with an uninitialised linux inode here. * We need to initialise the necessary fields and take a reference * on it. */ void xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; gfp_t gfp_mask; inode->i_ino = ip->i_ino; inode->i_state = I_NEW; inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: inode->i_rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, sysv_minor(ip->i_df.if_u2.if_rdev)); break; default: inode->i_rdev = 0; break; } inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); ip->d_ops = ip->i_mount->m_nondir_inode_ops; lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; ip->d_ops = ip->i_mount->m_dir_inode_ops; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; if (!(ip->i_df.if_flags & XFS_IFINLINE)) inode->i_mapping->a_ops = &xfs_address_space_operations; break; default: inode->i_op = &xfs_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } /* * Ensure all page cache allocations are done from GFP_NOFS context to * prevent direct reclaim recursion back into the filesystem and blowing * stacks or deadlocking. */ gfp_mask = mapping_gfp_mask(inode->i_mapping); mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ if (!XFS_IFORK_Q(ip)) { inode_has_no_xattr(inode); cache_no_acl(inode); } xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(inode); }
STATIC int xfs_file_fsync( struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure * to flush the write cache the device used for file data * first. This is to ensure newly written file data make * it to disk before logging the new inode size in case of * an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(mp->m_rtdev_targp); else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); } /* * All metadata updates are logged, which means that we just have * to flush the log up to the latest LSN that touched the inode. */ xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; } xfs_iunlock(ip, XFS_ILOCK_SHARED); if (lsn) error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. * This can only happen for fdatasync/O_DSYNC if we were overwriting * an already allocated file and thus do not have any metadata to * commit. */ if ((mp->m_flags & XFS_MOUNT_BARRIER) && mp->m_logdev_targp == mp->m_ddev_targp && !XFS_IS_REALTIME_INODE(ip) && !log_flushed) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; }
STATIC int xfs_file_fsync( struct file *file, struct dentry *dentry, int datasync) { struct xfs_inode *ip = XFS_I(dentry->d_inode); struct xfs_trans *tp; int error = 0; int log_flushed = 0; xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the * log because of committed transactions that haven't hit the disk yet. * Likewise, there could be unflushed non-transactional changes to the * inode core that have to go to disk and this requires us to issue * a synchronous transaction to capture these changes correctly. * * This code relies on the assumption that if the i_update_core field * of the inode is clear and the inode is unpinned then it is clean * and no action is required. */ xfs_ilock(ip, XFS_ILOCK_SHARED); /* * First check if the VFS inode is marked dirty. All the dirtying * of non-transactional updates no goes through mark_inode_dirty*, * which allows us to distinguish beteeen pure timestamp updates * and i_size updates which need to be caught for fdatasync. * After that also theck for the dirty state in the XFS inode, which * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; } xfs_ilock(ip, XFS_ILOCK_EXCL); /* * Note - it's possible that we might have pushed ourselves out * of the way during trans_reserve which would flush the inode. * But there's no guarantee that the inode buffer has actually * gone out yet (it's delwri). Plus the buffer could be pinned * anyway if it's part of an inode in another recent * transaction. So we play it safe and fire off the * transaction anyway. */ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* * Timestamps/size haven't changed since last inode flush or * inode transaction commit. That means either nothing got * written or a transaction committed which caught the updates. * If the latter happened and the transaction hasn't hit the * disk yet, the inode will be still be pinned. If it is, * force the log. */ if (xfs_ipincount(ip)) { error = _xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, XFS_LOG_SYNC, &log_flushed); } xfs_iunlock(ip, XFS_ILOCK_SHARED); } if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { /* * If the log write didn't issue an ordered tag we need * to flush the disk cache for the data device now. */ if (!log_flushed) xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); /* * If this inode is on the RT dev we need to flush that * cache as well. */ if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); } return -error; }
/* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table * lists the inode states and the reclaim actions necessary for non-blocking * reclaim: * * * inode state iflush ret required action * --------------- ---------- --------------- * bad - reclaim * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue * dirty, delwri ok 0 requeue * dirty, delwri blocked EAGAIN requeue * dirty, sync flush 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * * As can be seen from the table, the return value of xfs_iflush() is not * sufficient to correctly decide the reclaim action here. The checks in * xfs_iflush() might look like duplicates, but they are not. * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that * the inode is clean. The clean inode check needs to be done before flushing * the inode delwri otherwise we would loop forever requeuing clean inodes as * we cannot tell apart a successful delwri flush and a clean inode from the * return value of xfs_iflush(). * * Note that because the inode is flushed delayed write by background * writeback, the flush lock may already be held here and waiting on it can * result in very long latencies. Hence for sync reclaims, where we wait on the * flush lock, the caller should push out delayed write inodes first before * trying to reclaim them to minimise the amount of time spent waiting. For * background relaim, we just requeue the inode for the next pass. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim * pinned, delwri => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim * dirty, delwri => flush and requeue * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; /* * If we only have a single dirty inode in a cluster there is * a fair chance that the AIL push may have pushed it into * the buffer, but xfsbufd won't touch it until 30 seconds * from now, and thus we will lock up here. * * Promote the inode buffer to the front of the delwri list * and wake up xfsbufd now. */ xfs_promote_inode(ip); xfs_iflock(ip); } if (is_bad_inode(VFS_I(ip))) goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) { xfs_ifunlock(ip); goto out; } xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) goto reclaim; /* * Now we have an inode that needs flushing. * * We do a nonblocking flush here even if we are doing a SYNC_WAIT * reclaim as we can deadlock with inode cluster removal. * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, * doing a blocking xfs_itobp() to get the cluster buffer will result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, * just unlock the inode, back off and try again. Hopefully the next * pass through will see the stale flag set on the inode. */ error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); if (sync_mode & SYNC_WAIT) { if (error == EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); goto restart; } xfs_iflock(ip); goto reclaim; } /* * When we have to flush an inode but don't have SYNC_WAIT set, we * flush the inode out using a delwri buffer and wait for the next * call into reclaim to find it in a clean state instead of waiting for * it now. We also don't return errors here - if the error is transient * then the next reclaim pass will flush the inode, and if the error * is permanent then the next sync reclaim will reclaim the inode and * pass on the error. */ if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_warn(ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); } out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and xfssyncd never goes back to the idle * state. Instead, return 0 to let the next scheduled background reclaim * attempt to reclaim the inode again. */ return 0; reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * * Because radix_tree_delete won't complain even if the item was never * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate * with inode cache radix tree lookups. This is because the lookup * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is * unlocked after the lookup before we go ahead and free it. We get * both the ilock and the iolock because the code may need to drop the * ilock one but will still hold the iolock. */ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_inode_free(ip); return error; }
/* * Inodes in different states need to be treated differently. The following * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- * bad - reclaim * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue * dirty, async - requeue * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that * the inode is clean. * * Note that because the inode is flushed delayed write by AIL pushing, the * flush lock may already be held here and waiting on it can result in very * long latencies. Hence for sync reclaims, where we wait on the flush lock, * the caller should push the AIL first before trying to reclaim inodes to * minimise the amount of time spent waiting. For background relaim, we only * bother to reclaim clean inodes anyway. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { struct xfs_buf *bp = NULL; xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; xfs_iflock(ip); } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { xfs_ifunlock(ip); goto reclaim; } /* * Never flush out dirty data during non-blocking reclaim, as it would * just contend with AIL pushing trying to do the same job. */ if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; /* * Now we have an inode that needs flushing. * * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, * doing a blocking xfs_imap_to_bp() to get the cluster buffer would * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply * reclaim it. Hence if we get an EAGAIN error here, just unlock the * inode, back off and try again. Hopefully the next pass through will * see the stale flag set on the inode. */ error = xfs_iflush(ip, &bp); if (error == -EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); goto restart; } if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } reclaim: ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that * xfs_iflush_cluster() can be guaranteed to detect races with us here. * By doing this, we guarantee that once xfs_iflush_cluster has locked * XFS_ILOCK that it will see either a valid, flushable inode that will * serialise correctly, or it will see a clean (and invalid) inode that * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * * Because radix_tree_delete won't complain even if the item was never * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate * with inode cache radix tree lookups. This is because the lookup * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is * unlocked after the lookup before we go ahead and free it. */ xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); __xfs_inode_free(ip); return error; out_ifunlock: xfs_ifunlock(ip); out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * We could return -EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and the reclaim work never goes back to * the idle state. Instead, return 0 to let the next scheduled * background reclaim attempt to reclaim the inode again. */ return 0; }
/* * Initialize the Linux inode, set up the operation vectors and * unlock the inode. * * When reading existing inodes from disk this is called directly * from xfs_iget, when creating a new inode it is called from * xfs_ialloc after setting up the inode. * * We are always called with an uninitialised linux inode here. * We need to initialise the necessary fields and take a reference * on it. */ void xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; gfp_t gfp_mask; inode->i_ino = ip->i_ino; inode->i_state = I_NEW|I_LOCK; inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: inode->i_rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, sysv_minor(ip->i_df.if_u2.if_rdev)); break; default: inode->i_rdev = 0; break; } inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; if (!(ip->i_df.if_flags & XFS_IFINLINE)) inode->i_mapping->a_ops = &xfs_address_space_operations; break; default: inode->i_op = &xfs_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } /* * Ensure all page cache allocations are done from GFP_NOFS context to * prevent direct reclaim recursion back into the filesystem and blowing * stacks or deadlocking. */ gfp_mask = mapping_gfp_mask(inode->i_mapping); mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(inode); }
/* * Initialize the Linux inode, set up the operation vectors and * unlock the inode. * * When reading existing inodes from disk this is called directly * from xfs_iget, when creating a new inode it is called from * xfs_ialloc after setting up the inode. * * We are always called with an uninitialised linux inode here. * We need to initialise the necessary fields and take a reference * on it. */ void xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; inode->i_state = I_NEW; inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: inode->i_rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, sysv_minor(ip->i_df.if_u2.if_rdev)); break; default: inode->i_rdev = 0; break; } inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; if (!(ip->i_df.if_flags & XFS_IFINLINE)) inode->i_mapping->a_ops = &xfs_address_space_operations; break; default: inode->i_op = &xfs_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ if (!XFS_IFORK_Q(ip)) { inode_has_no_xattr(inode); cache_no_acl(inode); } xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(inode); }
STATIC int xfs_file_fsync( struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); error = file_write_and_wait_range(file, start, end); if (error) return error; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); /* * If we have an RT and/or log subvolume we need to make sure to flush * the write cache the device used for file data first. This is to * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(mp->m_rtdev_targp); else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to * flush the log up to the latest LSN that touched the inode. If we have * concurrent fsync/fdatasync() calls, we need them to all block on the * log force before we clear the ili_fsync_fields field. This ensures * that we don't get a racing sync operation that does not wait for the * metadata to hit the journal before returning. If we race with * clearing the ili_fsync_fields, then all that will happen is the log * force will do nothing as the lsn will already be on disk. We can't * race with setting ili_fsync_fields because that is done under * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared * until after the ili_fsync_fields is cleared. */ xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; } if (lsn) { error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); ip->i_itemp->ili_fsync_fields = 0; } xfs_iunlock(ip, XFS_ILOCK_SHARED); /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. * This can only happen for fdatasync/O_DSYNC if we were overwriting * an already allocated file and thus do not have any metadata to * commit. */ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; }
STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; xfs_promote_inode(ip); xfs_iflock(ip); } if (is_bad_inode(VFS_I(ip))) goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) { xfs_ifunlock(ip); goto out; } xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) goto reclaim; error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); if (sync_mode & SYNC_WAIT) { if (error == EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); delay(2); goto restart; } xfs_iflock(ip); goto reclaim; } if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_warn(ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); } out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(xs_ig_reclaims); spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); return error; }
STATIC int xfs_file_fsync( struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = 0; int log_flushed = 0; trace_xfs_file_fsync(ip); error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_ioend_wait(ip); xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure * to flush the write cache the device used for file data * first. This is to ensure newly written file data make * it to disk before logging the new inode size in case of * an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(mp->m_rtdev_targp); else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); } /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the * log because of committed transactions that haven't hit the disk yet. * Likewise, there could be unflushed non-transactional changes to the * inode core that have to go to disk and this requires us to issue * a synchronous transaction to capture these changes correctly. * * This code relies on the assumption that if the i_update_core field * of the inode is clear and the inode is unpinned then it is clean * and no action is required. */ xfs_ilock(ip, XFS_ILOCK_SHARED); /* * First check if the VFS inode is marked dirty. All the dirtying * of non-transactional updates no goes through mark_inode_dirty*, * which allows us to distinguish beteeen pure timestamp updates * and i_size updates which need to be caught for fdatasync. * After that also theck for the dirty state in the XFS inode, which * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ if (((inode->i_state & I_DIRTY_DATASYNC) || ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; } xfs_ilock(ip, XFS_ILOCK_EXCL); /* * Note - it's possible that we might have pushed ourselves out * of the way during trans_reserve which would flush the inode. * But there's no guarantee that the inode buffer has actually * gone out yet (it's delwri). Plus the buffer could be pinned * anyway if it's part of an inode in another recent * transaction. So we play it safe and fire off the * transaction anyway. */ xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* * Timestamps/size haven't changed since last inode flush or * inode transaction commit. That means either nothing got * written or a transaction committed which caught the updates. * If the latter happened and the transaction hasn't hit the * disk yet, the inode will be still be pinned. If it is, * force the log. */ if (xfs_ipincount(ip)) { error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, XFS_LOG_SYNC, &log_flushed); } xfs_iunlock(ip, XFS_ILOCK_SHARED); } /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. * This can only happen for fdatasync/O_DSYNC if we were overwriting * an already allocated file and thus do not have any metadata to * commit. */ if ((mp->m_flags & XFS_MOUNT_BARRIER) && mp->m_logdev_targp == mp->m_ddev_targp && !XFS_IS_REALTIME_INODE(ip) && !log_flushed) xfs_blkdev_issue_flush(mp->m_ddev_targp); return -error; }