/* * Ensure the reflink bit is set in both inodes. */ STATIC int xfs_reflink_set_inode_flag( struct xfs_inode *src, struct xfs_inode *dest) { struct xfs_mount *mp = src->i_mount; int error; struct xfs_trans *tp; if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) goto out_error; /* Lock both files against IO */ if (src->i_ino == dest->i_ino) xfs_ilock(src, XFS_ILOCK_EXCL); else xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); if (!xfs_is_reflink_inode(src)) { trace_xfs_reflink_set_inode_flag(src); xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); xfs_ifork_init_cow(src); } else xfs_iunlock(src, XFS_ILOCK_EXCL); if (src->i_ino == dest->i_ino) goto commit_flags; if (!xfs_is_reflink_inode(dest)) { trace_xfs_reflink_set_inode_flag(dest); xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); xfs_ifork_init_cow(dest); } else xfs_iunlock(dest, XFS_ILOCK_EXCL); commit_flags: error = xfs_trans_commit(tp); if (error) goto out_error; return error; out_error: trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); return error; }
/* Clear the inode reflink flag if there are no shared extents. */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, struct xfs_trans **tpp) { bool needs_flag; int error = 0; ASSERT(xfs_is_reflink_inode(ip)); error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); if (error || needs_flag) return error; /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; /* Clear the inode flag. */ trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; }
/* * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec *imap) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); xfs_fileoff_t offset_fsb; struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, &got); *imap = got; return true; }
/* * Trim an extent to end at the next CoW reservation past offset_fsb. */ void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) return; /* Find the extent in the CoW fork. */ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return; /* This is the extent before; try sliding up one. */ if (got.br_startoff < offset_fsb) { if (!xfs_iext_get_extent(ifp, idx + 1, &got)) return; } if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) return; imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); }
STATIC void xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { uint16_t flags = ip->i_d.di_flags; inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | S_DAX); if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && !xfs_is_reflink_inode(ip) && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; }
/* * Set ourselves up to free CoW blocks from this file. If it's already clean * then we can bail out quickly, but otherwise we must back off if the file * is undergoing some kind of write. */ static bool xfs_prep_free_cowblocks( struct xfs_inode *ip, struct xfs_ifork *ifp) { /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. */ if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return false; } /* * If the mapping is dirty or under writeback we cannot touch the * CoW fork. Leave it alone if we're in the midst of a directio. */ if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) return false; return true; }
/* * Trim the mapping to the next block where there's a change in the * shared/unshared status. More specifically, this means that we * find the lowest-numbered extent of shared blocks that coincides with * the given block mapping. If the shared extent overlaps the start of * the mapping, trim the mapping to the end of the shared extent. If * the shared region intersects the mapping, trim the mapping to the * start of the shared extent. If there are no shared regions that * overlap, just return the original extent. */ int xfs_reflink_trim_around_shared( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed) { xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t fbno; xfs_extlen_t flen; int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } trace_xfs_reflink_trim_around_shared(ip, irec); agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); aglen = irec->br_blockcount; error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, aglen, &fbno, &flen, true); if (error) return error; *shared = *trimmed = false; if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; } else if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a * subsequent iteration starts at the start of the * unshared region. */ irec->br_blockcount = flen; *shared = true; if (flen != aglen) *trimmed = true; return 0; } else { /* * There's a shared extent midway through this extent. * Truncate the mapping at the start of the shared * extent so that a subsequent iteration starts at the * start of the shared region. */ irec->br_blockcount = fbno - agbno; *trimmed = true; return 0; } }
static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; } /* Clear reflink if we are actually able to set the rt flag. */ if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; /* Don't allow us to set DAX mode for a reflinked file for now. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) return -EINVAL; /* * Can't modify an immutable/append-only file unless * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); return 0; }
/* * Automatic CoW Reservation Freeing * * These functions automatically garbage collect leftover CoW reservations * that were made on behalf of a cowextsize hint when we start to run out * of quota or when the reservations sit around for too long. If the file * has dirty pages or is undergoing writeback, its CoW reservations will * be retained. * * The actual garbage collection piggybacks off the same code that runs * the speculative EOF preallocation garbage collector. */ STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, int flags, void *args) { int ret; struct xfs_eofblocks *eofb = args; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. */ if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; } /* * If the mapping is dirty or under writeback we cannot touch the * CoW fork. Leave it alone if we're in the midst of a directio. */ if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) return 0; if (eofb) { if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) match = xfs_inode_match_id_union(ip, eofb); else match = xfs_inode_match_id(ip, eofb); if (!match) return 0; /* skip the inode if the file size is too small */ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; } /* Free the CoW blocks */ xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; }
/* * If the reflink iflag disagrees with a scan for shared data fork extents, * either flag an error (shared extents w/ no flag) or a preen (flag set w/o * any shared extents). We already checked for reflink iflag set on a non * reflink filesystem. */ static void xchk_inode_check_reflink_iflag( struct xfs_scrub *sc, xfs_ino_t ino) { struct xfs_mount *mp = sc->mp; bool has_shared; int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return; error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, &has_shared); if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), XFS_INO_TO_AGBNO(mp, ino), &error)) return; if (xfs_is_reflink_inode(sc->ip) && !has_shared) xchk_ino_set_preen(sc, ino); else if (!xfs_is_reflink_inode(sc->ip) && has_shared) xchk_ino_set_corrupt(sc, ino); }
/* * Cancel CoW reservations for some byte range of an inode. * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); ASSERT(xfs_is_reflink_inode(ip)); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) end_fsb = NULLFILEOFF; else end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 0, 0, 0, &tp); if (error) goto out; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, cancel_real); if (error) goto out_cancel; error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); return error; }
/* * Pre-COW all shared blocks within a given byte range of a file and turn off * the reflink flag if we unshare all of the file's blocks. */ int xfs_reflink_unshare( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t fbno; xfs_filblks_t end; xfs_off_t isize; int error; if (!xfs_is_reflink_inode(ip)) return 0; trace_xfs_reflink_unshare(ip, offset, len); inode_dio_wait(VFS_I(ip)); /* Try to CoW the selected ranges */ xfs_ilock(ip, XFS_ILOCK_EXCL); fbno = XFS_B_TO_FSBT(mp, offset); isize = i_size_read(VFS_I(ip)); end = XFS_B_TO_FSB(mp, offset + len); error = xfs_reflink_dirty_extents(ip, fbno, end, isize); if (error) goto out_unlock; xfs_iunlock(ip, XFS_ILOCK_EXCL); /* Wait for the IO to finish */ error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out; /* Turn off the reflink flag if possible. */ error = xfs_reflink_try_clear_inode_flag(ip); if (error) goto out; return 0; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; }
/* * Cancel CoW reservations for some block range of an inode. * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &idx, &got, &del); if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ error = xfs_refcount_free_cow_extent(ip->i_mount, &dfops, del.br_startblock, del.br_blockcount); if (error) break; xfs_bmap_add_free(ip->i_mount, &dfops, del.br_startblock, del.br_blockcount, NULL); /* Update quota accounting */ xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, -(long)del.br_blockcount); /* Roll the transaction */ xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; } /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; } /* clear tag if cow fork is emptied */ if (!ifp->if_bytes) xfs_inode_clear_cowblocks_tag(ip); return error; }
/* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_bmbt_irec got; struct xfs_defer_ops dfops; struct xfs_trans *tp = NULL; xfs_fsblock_t first_block; int nimaps, error = 0; bool trimmed; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; xfs_extnum_t idx; retry: ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); /* * Even if the extent is not shared we might have a preallocation for * it in the COW fork. If so use it. */ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) && got.br_startoff <= offset_fsb) { *shared = true; /* If we have a real allocation in the COW fork we're done. */ if (!isnullstartblock(got.br_startblock)) { xfs_trim_extent(&got, offset_fsb, count_fsb); *imap = got; goto convert; } xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); } else { error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); if (error || !*shared) goto out; } if (!tp) { resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); xfs_iunlock(ip, *lockmode); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); *lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, *lockmode); if (error) return error; error = xfs_qm_dqattach_locked(ip, 0); if (error) goto out; goto retry; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out; xfs_trans_ijoin(tp, ip, 0); xfs_defer_init(&dfops, &first_block); nimaps = 1; /* Allocate the entire reservation as unwritten blocks. */ error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, resblks, imap, &nimaps, &dfops); if (error) goto out_bmap_cancel; /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); if (error) return error; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb, &dfops); out_bmap_cancel: xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); out: if (tp) xfs_trans_cancel(tp); return error; }
/* * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * * If there are cached pages or we're extending the file, we need IOLOCK_EXCL * until we're sure the bytes at the new EOF have been zeroed and/or the cached * pages are flushed out. * * In most cases the direct IO writes will be done holding IOLOCK_SHARED * allowing them to be done in parallel with reads and other direct IO writes. * However, if the IO is not aligned to filesystem blocks, the direct IO layer * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); loff_t end; struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) unaligned_io = 1; /* * We don't need to take an exclusive lock unless there page cache needs * to be invalidated or unaligned IO is being executed. We don't need to * consider the EOF extension case here because * xfs_file_aio_write_checks() will relock the inode as necessary for * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) iolock = XFS_IOLOCK_EXCL; else iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, iolock); iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); end = iocb->ki_pos + count - 1; /* * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) goto out; /* * Invalidate whole pages. This can return an error if we fail * to invalidate a page, but this should never happen on XFS. * Warn if it does fail. */ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } /* * If we are doing unaligned IO, wait for all other IO to drain, * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); /* If this is a block-aligned directio CoW, remap immediately. */ if (xfs_is_reflink_inode(ip) && !unaligned_io) { ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); if (ret) goto out; } data = *from; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); }
/* * Cancel CoW reservations for some block range of an inode. * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) return 0; /* Walk backwards until we're out of the I/O range... */ while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); /* Extent delete may have bumped ext forward */ if (!del.br_blockcount) { xfs_iext_prev(ifp, &icur); goto next_extent; } trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, &del); if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ error = xfs_refcount_free_cow_extent(ip->i_mount, &dfops, del.br_startblock, del.br_blockcount); if (error) break; xfs_bmap_add_free(ip->i_mount, &dfops, del.br_startblock, del.br_blockcount, NULL); /* Update quota accounting */ xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, -(long)del.br_blockcount); /* Roll the transaction */ xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; } /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); } else { /* Didn't do anything, push cursor back. */ xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } /* clear tag if cow fork is emptied */ if (!ifp->if_bytes) xfs_inode_clear_cowblocks_tag(ip); return error; }
/* Clear the inode reflink flag if there are no shared extents. */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, struct xfs_trans **tpp) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t fbno; xfs_filblks_t end; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t rbno; xfs_extlen_t rlen; struct xfs_bmbt_irec map; int nmaps; int error = 0; ASSERT(xfs_is_reflink_inode(ip)); fbno = 0; end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); while (end - fbno > 0) { nmaps = 1; /* * Look for extents in the file. Skip holes, delalloc, or * unwritten extents; they can't be reflinked. */ error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); if (error) return error; if (nmaps == 0) break; if (!xfs_bmap_is_real_extent(&map)) goto next; agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); aglen = map.br_blockcount; error = xfs_reflink_find_shared(mp, agno, agbno, aglen, &rbno, &rlen, false); if (error) return error; /* Is there still a shared block here? */ if (rbno != NULLAGBLOCK) return 0; next: fbno = map.br_startoff + map.br_blockcount; } /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; /* Clear the inode flag. */ trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; }
/* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_trans *tp; int nimaps, error = 0; bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (!ip->i_cowfp) { ASSERT(!xfs_is_reflink_inode(ip)); xfs_ifork_init_cow(ip); } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) return error; if (found) goto convert; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); xfs_iunlock(ip, *lockmode); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); *lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, *lockmode); if (error) return error; error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_trans_cancel; /* * Check for an overlapping extent again now that we dropped the ilock. */ error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) goto out_trans_cancel; if (found) { xfs_trans_cancel(tp); goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); if (error) return error; /* * Allocation succeeded but the requested range was not even partially * satisfied? Bail out! */ if (nimaps == 0) return -ENOSPC; convert: xfs_trim_extent(imap, offset_fsb, count_fsb); /* * COW fork extents are supposed to remain unwritten until we're ready * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. */ if (!convert_now || imap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, imap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); out_trans_cancel: xfs_trans_cancel(tp); return error; }
/* * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * * If there are cached pages or we're extending the file, we need IOLOCK_EXCL * until we're sure the bytes at the new EOF have been zeroed and/or the cached * pages are flushed out. * * In most cases the direct IO writes will be done holding IOLOCK_SHARED * allowing them to be done in parallel with reads and other direct IO writes. * However, if the IO is not aligned to filesystem blocks, the direct IO layer * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* * Don't take the exclusive iolock here unless the I/O is unaligned to * the file system block size. We don't need to consider the EOF * extension case here because xfs_file_aio_write_checks() will relock * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; /* * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ if (xfs_is_reflink_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; } if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; } else { xfs_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); /* * If we are doing unaligned IO, wait for all other IO to drain, * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ if (unaligned_io) { /* If we are going to wait for other DIO to finish, bail */ if (iocb->ki_flags & IOCB_NOWAIT) { if (atomic_read(&inode->i_dio_count)) return -EAGAIN; } else { inode_dio_wait(inode); } } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either * complete fully or fail. */ ASSERT(ret < 0 || ret == count); return ret; }
/* * Get a layout for the pNFS client. */ int xfs_fs_map_blocks( struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; loff_t limit; int bmapi_flags = XFS_BMAPI_ENTIRE; int nimaps = 1; uint lock_flags; int error = 0; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; /* * We can't export inodes residing on the realtime device. The realtime * device doesn't have a UUID to identify it, so the client has no way * to find it. */ if (XFS_IS_REALTIME_INODE(ip)) return -ENXIO; /* * The pNFS block layout spec actually supports reflink like * functionality, but the Linux pNFS server doesn't implement it yet. */ if (xfs_is_reflink_inode(ip)) return -ENXIO; /* * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more * complicated. See the comment near xfs_break_layouts for a detailed * explanation. */ xfs_ilock(ip, XFS_IOLOCK_EXCL); error = -EINVAL; limit = mp->m_super->s_maxbytes; if (!write) limit = max(limit, round_up(i_size_read(inode), inode->i_sb->s_blocksize)); if (offset > limit) goto out_unlock; if (offset > limit - length) length = limit - offset; error = filemap_write_and_wait(inode->i_mapping); if (error) goto out_unlock; error = invalidate_inode_pages2(inode->i_mapping); if (WARN_ON_ONCE(error)) return error; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); offset_fsb = XFS_B_TO_FSBT(mp, offset); lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); xfs_iunlock(ip, lock_flags); if (error) goto out_unlock; if (write) { enum xfs_prealloc_flags flags = 0; ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { /* * xfs_iomap_write_direct() expects to take ownership of * the shared ilock. */ xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_iomap_write_direct(ip, offset, length, &imap, nimaps); if (error) goto out_unlock; /* * Ensure the next transaction is committed * synchronously so that the blocks allocated and * handed out to the client are guaranteed to be * present even after a server crash. */ flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; } error = xfs_update_prealloc_flags(ip, flags); if (error) goto out_unlock; } xfs_iunlock(ip, XFS_IOLOCK_EXCL); xfs_bmbt_to_iomap(ip, iomap, &imap); *device_generation = mp->m_generation; return error; out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; }