/* * Inodes in different states need to be treated differently. The following * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- * bad - reclaim * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue * dirty, async - requeue * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that * the inode is clean. * * Note that because the inode is flushed delayed write by AIL pushing, the * flush lock may already be held here and waiting on it can result in very * long latencies. Hence for sync reclaims, where we wait on the flush lock, * the caller should push the AIL first before trying to reclaim inodes to * minimise the amount of time spent waiting. For background relaim, we only * bother to reclaim clean inodes anyway. 
* * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { struct xfs_buf *bp = NULL; xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; xfs_iflock(ip); } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) goto reclaim; /* * Never flush out dirty data during non-blocking reclaim, as it would * just contend with AIL pushing trying to do the same job. */ if (!(sync_mode & SYNC_WAIT)) goto out_ifunlock; /* * Now we have an inode that needs flushing. * * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, * doing a blocking xfs_imap_to_bp() to get the cluster buffer would * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply * reclaim it. Hence if we get an EAGAIN error here, just unlock the * inode, back off and try again. Hopefully the next pass through will * see the stale flag set on the inode. 
*/ error = xfs_iflush(ip, &bp); if (error == -EAGAIN) { xfs_iunlock(ip, XFS_ILOCK_EXCL); /* backoff longer than in xfs_ifree_cluster */ delay(2); goto restart; } if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } xfs_iflock(ip); reclaim: /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK and flush lock so * that xfs_iflush_cluster() can be guaranteed to detect races with us * here. By doing this, we guarantee that once xfs_iflush_cluster has * locked both the XFS_ILOCK and the flush lock that it will see either * a valid, flushable inode that will serialise correctly against the * locks below, or it will see a clean (and invalid) inode that it can * skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * * Because radix_tree_delete won't complain even if the item was never * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate * with inode cache radix tree lookups. This is because the lookup * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is * unlocked after the lookup before we go ahead and free it. 
*/ xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); __xfs_inode_free(ip); return error; out_ifunlock: xfs_ifunlock(ip); out: xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * We could return -EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree * waiting for IO to complete and the reclaim work never goes back to * the idle state. Instead, return 0 to let the next scheduled * background reclaim attempt to reclaim the inode again. */ return 0; }
/* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. */ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; int done; trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; struct xfs_buf *bp; xfs_daddr_t dblkno; int dblkcnt; int nmap; /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } /* * Keep de-allocating extents until the remote-value region is gone. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; done = 0; while (!done) { int committed; xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1, args->firstblock, args->flist, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); } if (error) { ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ if (committed) xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. 
*/ error = xfs_trans_roll(&args->trans, args->dp); if (error) return error; } return 0; }
STATIC int xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; struct xfs_perag *pag; int error; int i; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error || !agbp) goto out_put_perag; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Force out the log. This means any transactions that might have freed * space before we took the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); /* * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, XFS_BUF_TO_AGF(agbp)->agf_longest, &i); if (error) goto out_del_cursor; /* * Loop until we are done with all extents that are large * enough to be worth discarding. */ while (i) { xfs_agblock_t fbno; xfs_extlen_t flen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); /* * Too small? Give up. */ if (flen < minlen) { trace_xfs_discard_toosmall(mp, agno, fbno, flen); goto out_del_cursor; } /* * If the extent is entirely outside of the range we are * supposed to discard skip it. Do not bother to trim * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } /* * If any blocks in the range are still busy, skip the * discard and try again the next time. 
*/ if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } trace_xfs_discard_extent(mp, agno, fbno, flen); error = -blkdev_issue_discard(bdev, XFS_AGB_TO_DADDR(mp, agno, fbno), XFS_FSB_TO_BB(mp, flen), GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) goto out_del_cursor; } out_del_cursor: xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); out_put_perag: xfs_perag_put(pag); return error; }
/* ----- Kernel only functions below ----- */ STATIC int xfs_readlink_bmap( struct xfs_inode *ip, char *link) { struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; struct xfs_buf *bp; xfs_daddr_t d; char *cur_chunk; int pathlen = ip->i_d.di_size; int nmaps = XFS_SYMLINK_MAPS; int byte_cnt; int n; int error = 0; int fsblocks = 0; int offset; fsblocks = xfs_symlink_blocks(mp, pathlen); error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); if (error) goto out; offset = 0; for (n = 0; n < nmaps; n++) { d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, &xfs_symlink_buf_ops); if (!bp) return -ENOMEM; error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; goto out; } byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); if (pathlen < byte_cnt) byte_cnt = pathlen; cur_chunk = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { error = -EFSCORRUPTED; xfs_alert(mp, "symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", offset, byte_cnt, ip->i_ino); xfs_buf_relse(bp); goto out; } cur_chunk += sizeof(struct xfs_dsymlink_hdr); } memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; xfs_buf_relse(bp); } ASSERT(pathlen == 0); link[ip->i_d.di_size] = '\0'; error = 0; out: return error; }
/* * Write the value associated with an attribute into the out-of-line buffer * that we have defined for it. */ int xfs_attr_rmtval_set( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; __uint8_t *src = args->value; int blkcnt; int valuelen; int nmap; int error; int offset = 0; trace_xfs_attr_rmtval_set(args); /* * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; /* * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { int committed; /* * Allocate a single extent, up to the size of the value. */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); } if (error) { ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; /* * Start the next trans in the chain. 
*/ error = xfs_trans_roll(&args->trans, dp); if (error) return error; } /* * Roll through the "value", copying the attribute value to the * already-allocated blocks. Blocks are written synchronously * so that we can know they are all on disk before we turn off * the INCOMPLETE flag. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; int dblkcnt; ASSERT(blkcnt > 0); xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return -ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; /* roll attribute extent map forwards */ lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); return 0; }
/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. * * This call ignores the recursion count. It is only called when the buffer * should REALLY be unlocked, regardless of the recursion count. * * We unconditionally drop the transaction's reference to the log item. If the * item was logged, then another reference was taken when it was pinned, so we * can safely drop the transaction reference now. This also allows us to avoid * potential races with the unpin code freeing the bli by not referencing the * bli after we've dropped the reference count. * * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. */ STATIC void xfs_buf_item_unlock( struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; int aborted, clean, i; uint hold; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* * If this is a transaction abort, don't return early. Instead, allow * the brelse to happen. Normally it would be done for stale * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; /* * Before possibly freeing the buf item, determine if we should * release the buffer at the end of this routine. */ hold = bip->bli_flags & XFS_BLI_HOLD; /* Clear the per transaction state. */ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. 
*/ if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; } } trace_xfs_buf_item_unlock(bip); /* * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. */ clean = 1; for (i = 0; i < bip->bli_format_count; i++) { if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, bip->bli_formats[i].blf_map_size)) { clean = 0; break; } } if (clean) xfs_buf_item_relse(bp); else atomic_dec(&bip->bli_refcount); if (!hold) xfs_buf_relse(bp); }
/* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, * but the dquot is free to be unlocked and modified by the caller * in the interim. Dquot is still locked on return. This behavior is * identical to that of inodes. */ int xfs_qm_dqflush( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; struct xfs_disk_dquot *ddqp; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); trace_xfs_dqflush(dqp); *bpp = NULL; xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot * to disk, because the log record didn't make it to disk. * * We also have to remove the log item from the AIL in this case, * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; spin_lock(&mp->m_ail->xa_lock); if (lip->li_flags & XFS_LI_IN_AIL) xfs_trans_ail_delete(mp->m_ail, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&mp->m_ail->xa_lock); error = XFS_ERROR(EIO); goto out_unlock; } /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; /* * Calculate the location of the dquot inside the buffer. */ ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. 
*/ error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)"); if (error) { xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return XFS_ERROR(EIO); } /* This is the only portion of data that needs to persist */ memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); /* * Clear the dirty field and remember the flush lsn for later use. */ dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, &dqp->q_logitem.qli_item); /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } trace_xfs_dqflush_done(dqp); *bpp = bp; return 0; out_unlock: xfs_dqfunlock(dqp); return XFS_ERROR(EIO); }
/* * Allocates a new inode from disk and return a pointer to the * incore copy. This routine will internally commit the current * transaction and allocate a new one if the Space Manager needed * to do an allocation to replenish the inode free-list. * * This routine is designed to be called from xfs_create and * xfs_create_dir. * */ int xfs_dir_ialloc( xfs_trans_t **tpp, /* input: current transaction; output: may be a new transaction. */ xfs_inode_t *dp, /* directory within whose allocate the inode. */ umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, /* project id */ int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) { xfs_trans_t *tp; xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; uint log_res; uint log_count; void *dqinfo; uint tflags; tp = *tpp; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); /* * xfs_ialloc will return a pointer to an incore inode if * the Space Manager has an available inode on the free * list. Otherwise, it will do an allocation and replenish * the freelist. Since we can only do one allocation per * transaction without deadlocks, we will need to commit the * current transaction and start a new one. We will then * need to call xfs_ialloc again to get the inode. * * If xfs_ialloc did an allocation to replenish the freelist, * it returns the bp containing the head of the freelist as * ialloc_context. We will hold a lock on it across the * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, &ialloc_context, &ip); /* * Return an error if we were unable to allocate a new inode. * This should only happen if we run out of space on disk or * encounter a disk error. 
*/ if (code) { *ipp = NULL; return code; } if (!ialloc_context && !ip) { *ipp = NULL; return XFS_ERROR(ENOSPC); } /* * If the AGI buffer is non-NULL, then we were unable to get an * inode in one operation. We need to commit the current * transaction and call xfs_ialloc() again. It is guaranteed * to succeed the second time. */ if (ialloc_context) { /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across * the commit. Holding this buffer prevents any other * processes from doing any allocations in this * allocation group. */ xfs_trans_bhold(tp, ialloc_context); /* * Save the log reservation so we can use * them in the next transaction. */ log_res = xfs_trans_get_log_res(tp); log_count = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next * transaction, NOT this one. So, detach the dqinfo from this * and attach it to the next transaction. */ dqinfo = NULL; tflags = 0; if (tp->t_dqinfo) { dqinfo = (void *)tp->t_dqinfo; tp->t_dqinfo = NULL; tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } ntp = xfs_trans_dup(tp); code = xfs_trans_commit(tp, 0); tp = ntp; if (committed != NULL) { *committed = 1; } /* * If we get an error during the commit processing, * release the buffer that is still held and return * to the caller. */ if (code) { xfs_buf_relse(ialloc_context); if (dqinfo) { tp->t_dqinfo = dqinfo; xfs_trans_free_dqinfo(tp); } *tpp = ntp; *ipp = NULL; return code; } /* * transaction commit worked ok so we can drop the extra ticket * reference that we gained in xfs_trans_dup() */ xfs_log_ticket_put(tp->t_ticket); code = xfs_trans_reserve(tp, 0, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); /* * Re-attach the quota info that we detached from prev trx. 
*/ if (dqinfo) { tp->t_dqinfo = dqinfo; tp->t_flags |= tflags; } if (code) { xfs_buf_relse(ialloc_context); *tpp = ntp; *ipp = NULL; return code; } xfs_trans_bjoin(tp, ialloc_context); /* * Call ialloc again. Since we've locked out all * other allocations in this allocation group, * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, &ialloc_context, &ip); /* * If we get an error at this point, return to the caller * so that the current transaction can be aborted. */ if (code) { *tpp = tp; *ipp = NULL; return code; } ASSERT(!ialloc_context && ip); } else { if (committed != NULL) *committed = 0; } *ipp = ip; *tpp = tp; return 0; }
/*
 * Try to expedite writeback of the buffer backing a flush-locked inode.
 *
 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
 * failed to get the inode flush lock but did get the inode locked SHARED.
 * Here we're trying to see if the inode buffer is incore, and if so whether
 * it's marked delayed write. If that's the case, we'll initiate a bawrite on
 * that buffer to expedite the process.
 *
 * We aren't holding the AIL lock (or the flush lock) when this gets called,
 * so it is inherently race-y.
 *
 * On every path ili_pushbuf_flag is cleared and the shared inode lock is
 * dropped before returning.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		dopush;

	ip = iip->ili_inode;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));

	/*
	 * The ili_pushbuf_flag keeps others from
	 * trying to duplicate our effort.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == current_pid());

	/*
	 * If flushlock isn't locked anymore, chances are that the
	 * inode flush completed and the inode was taken off the AIL.
	 * So, just get out.
	 */
	if (!issemalocked(&(ip->i_flock)) ||
	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
		iip->ili_pushbuf_flag = 0;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return;
	}

	/*
	 * Look for the inode's backing buffer in the buffer cache, using a
	 * trylock. A NULL return is handled at the bottom of the function.
	 */
	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	if (bp != NULL) {
		if (XFS_BUF_ISDELAYWRITE(bp)) {
			/*
			 * We were racing with iflush because we don't hold
			 * the AIL lock or the flush lock. However, at this point,
			 * we have the buffer, and we know that it's dirty.
			 * So, it's possible that iflush raced with us, and
			 * this item is already taken off the AIL.
			 * If not, we can flush it async.
			 */
			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
				  issemalocked(&(ip->i_flock)));
			/*
			 * dopush was sampled above, before the pushbuf flag
			 * is cleared and the inode lock is dropped.
			 */
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buftrace("INODE ITEM PUSH", bp);
			/* Force the log first if the buffer is pinned. */
			if (XFS_BUF_ISPINNED(bp)) {
				xfs_log_force(mp, (xfs_lsn_t)0,
					      XFS_LOG_FORCE);
			}
			if (dopush) {
				int	error;
				/*
				 * No release follows on this path, so
				 * xfs_bawrite() presumably takes over the
				 * buffer - TODO confirm.
				 */
				error = xfs_bawrite(mp, bp);
				if (error)
					xfs_fs_cmn_err(CE_WARN, mp,
		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
							error, iip, bp);
			} else {
				xfs_buf_relse(bp);
			}
		} else {
			/* Buffer is not delayed write: nothing to push. */
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buf_relse(bp);
		}
		return;
	}
	/*
	 * We have to be careful about resetting pushbuf flag too early (above).
	 * Even though in theory we can do it as soon as we have the buflock,
	 * we don't want others to be doing work needlessly. They'll come to
	 * this function thinking that pushing the buffer is their
	 * responsibility only to find that the buffer is still locked by
	 * another doing the same thing.
	 */
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return;
}
/* * Purge a dquot from all tracking data structures and free it. */ STATIC int xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { xfs_dqunlock(dqp); return -EAGAIN; } dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); } else { error = xfs_bwrite(bp); xfs_buf_relse(bp); } xfs_dqflock(dqp); } ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; /* * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(mp, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; }
/* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, * but the dquot is free to be unlocked and modified by the caller * in the interim. Dquot is still locked on return. This behavior is * identical to that of inodes. */ int xfs_qm_dqflush( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; struct xfs_disk_dquot *ddqp; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); trace_xfs_dqflush(dqp); *bpp = NULL; xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot * to disk, because the log record didn't make it to disk. * * We also have to remove the log item from the AIL in this case, * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); error = -EIO; goto out_unlock; } /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; /* * Calculate the location of the dquot inside the buffer. */ ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. */ error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)"); if (error) { xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return -EIO; } /* This is the only portion of data that needs to persist */ memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); /* * Clear the dirty field and remember the flush lsn for later use. 
*/ dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. * * We also calculate the CRC here so that the on-disk dquot in the * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, &dqp->q_logitem.qli_item); /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } trace_xfs_dqflush_done(dqp); *bpp = bp; return 0; out_unlock: xfs_dqfunlock(dqp); return -EIO; }
/* * Write the value associated with an attribute into the out-of-line buffer * that we have defined for it. */ int xfs_attr_rmtval_set( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; uint8_t *src = args->value; int blkcnt; int valuelen; int nmap; int error; int offset = 0; trace_xfs_attr_rmtval_set(args); /* * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; /* * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { /* * Allocate a single extent, up to the size of the value. * * Note that we have to consider this a data allocation as we * write the remote attribute without logging the contents. * Hence we must ensure that we aren't using blocks that are on * the busy list so that we don't overwrite blocks which have * recently been freed but their transactions are not yet * committed to disk. If we overwrite the contents of a busy * extent and then crash then the block may not contain the * correct metadata after log recovery occurs. 
*/ xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->dfops); if (!error) error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; xfs_defer_cancel(args->dfops); return error; } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; /* * Start the next trans in the chain. */ error = xfs_trans_roll(&args->trans, dp); if (error) return error; } /* * Roll through the "value", copying the attribute value to the * already-allocated blocks. Blocks are written synchronously * so that we can know they are all on disk before we turn off * the INCOMPLETE flag. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; int dblkcnt; ASSERT(blkcnt > 0); xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return -ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; /* roll attribute extent map forwards */ lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); return 0; }
/*
 * xfs sync routine for internal use
 *
 * This routine supports all of the flags defined for the generic VFS_SYNC
 * interface as explained above under xfs_sync.  In the interests of not
 * changing interfaces within the 6.5 family, additional internally-
 * required functions are specified within a separate xflags parameter,
 * only available by calling this routine.
 *
 * The mount's inode list (mp->m_inodes, linked through i_mnext/i_mprev) is
 * walked under the mount inode-list lock; the walk ends when it arrives back
 * at mp->m_inodes or the list becomes empty.  Whenever that lock has to be
 * dropped, a marker record (ipointer) is linked in after the current inode
 * so the position can be found again; the marker is always unlinked again
 * before the walk continues or the function returns.
 *
 * Parameters:
 *   mp       - mount whose inode list is walked.
 *   flags    - SYNC_* bits; the ones tested here are SYNC_BDFLUSH,
 *              SYNC_WAIT, SYNC_DELWRI, SYNC_CLOSE and SYNC_ATTR.
 *   xflags   - not referenced anywhere in this function body.
 *   bypassed - optional counter; zeroed on entry and incremented for each
 *              inode whose flush lock could not be taken without sleeping
 *              in the non-BDFLUSH, non-SYNC_WAIT attribute flush path.
 *
 * Returns 0 for a read-only vfs, for a forced shutdown without SYNC_CLOSE,
 * and when xfs_itobp() fails in the SYNC_BDFLUSH path.  Returns
 * XFS_ERROR(EFSCORRUPTED) as soon as that error is seen; otherwise returns
 * XFS_ERROR() of the last non-zero error recorded during the walk.
 */
STATIC int
xfs_sync_inodes(
	xfs_mount_t	*mp,
	int		flags,
	int		xflags,
	int		*bypassed)
{
	xfs_inode_t	*ip = NULL;
	xfs_inode_t	*ip_next;	/* only written by IPOINTER_REMOVE, never read */
	xfs_buf_t	*bp;
	vnode_t		*vp = NULL;
	vmap_t		vmap;
	int		error;
	int		last_error;
	uint64_t	fflag;
	uint		lock_flags;
	uint		base_lock_flags;
	boolean_t	mount_locked;	/* do we hold the mount inode-list lock? */
	boolean_t	vnode_refed;	/* did vn_get() give us a vnode reference? */
	int		preempt;	/* iterations since the list lock was dropped */
	xfs_dinode_t	*dip;
	xfs_iptr_t	*ipointer;	/* list position marker */
	/* ipointer_in tracks (DEBUG only) whether the marker is linked in. */
#ifdef DEBUG
	boolean_t	ipointer_in = B_FALSE;

#define IPOINTER_SET	ipointer_in = B_TRUE
#define IPOINTER_CLR	ipointer_in = B_FALSE
#else
#define IPOINTER_SET
#define IPOINTER_CLR
#endif


	/* Insert a marker record into the inode list after inode ip. The list
	 * must be locked when this is called. After the call the list will no
	 * longer be locked.  Also resets the preempt counter.
	 */
#define IPOINTER_INSERT(ip, mp)	{ \
		ASSERT(ipointer_in == B_FALSE); \
		ipointer->ip_mnext = ip->i_mnext; \
		ipointer->ip_mprev = ip; \
		ip->i_mnext = (xfs_inode_t *)ipointer; \
		ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
		preempt = 0; \
		XFS_MOUNT_IUNLOCK(mp); \
		mount_locked = B_FALSE; \
		IPOINTER_SET; \
	}

	/* Remove the marker from the inode list. If the marker was the only item
	 * in the list then there are no remaining inodes and we should zero out
	 * the whole list. If we are the current head of the list then move the head
	 * past us.  The first argument is overwritten with the inode that
	 * followed the marker, or NULL when the list is now empty.
	 */
#define IPOINTER_REMOVE(ip, mp)	{ \
		ASSERT(ipointer_in == B_TRUE); \
		if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
			ip = ipointer->ip_mnext; \
			ip->i_mprev = ipointer->ip_mprev; \
			ipointer->ip_mprev->i_mnext = ip; \
			if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
				mp->m_inodes = ip; \
			} \
		} else { \
			ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
			mp->m_inodes = NULL; \
			ip = NULL; \
		} \
		IPOINTER_CLR; \
	}

	/* The list lock is yielded once every XFS_PREEMPT_MASK + 1 iterations. */
#define XFS_PREEMPT_MASK	0x7f

	if (bypassed)
		*bypassed = 0;
	/* Nothing to sync on a read-only filesystem. */
	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
		return 0;
	error = 0;
	last_error = 0;
	preempt = 0;

	/* Allocate a reference marker */
	ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);

	fflag = XFS_B_ASYNC;		/* default is don't wait */
	if (flags & SYNC_BDFLUSH)
		fflag = XFS_B_DELWRI;
	if (flags & SYNC_WAIT)
		fflag = 0;		/* synchronous overrides all */

	base_lock_flags = XFS_ILOCK_SHARED;
	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
		/*
		 * We need the I/O lock if we're going to call any of
		 * the flush/inval routines.
		 */
		base_lock_flags |= XFS_IOLOCK_SHARED;
	}

	XFS_MOUNT_ILOCK(mp);

	ip = mp->m_inodes;

	mount_locked = B_TRUE;
	vnode_refed  = B_FALSE;

	IPOINTER_CLR;

	do {
		ASSERT(ipointer_in == B_FALSE);
		ASSERT(vnode_refed == B_FALSE);

		/* Start each inode with the full set of locks we intend to take. */
		lock_flags = base_lock_flags;

		/*
		 * There were no inodes in the list, just break out
		 * of the loop.
		 */
		if (ip == NULL) {
			break;
		}

		/*
		 * We found another sync thread marker - skip it
		 */
		if (ip->i_mount == NULL) {
			ip = ip->i_mnext;
			continue;
		}

		vp = XFS_ITOV_NULL(ip);

		/*
		 * If the vnode is gone then this is being torn down,
		 * call reclaim if it is flushed, else let regular flush
		 * code deal with it later in the loop.
		 */

		if (vp == NULL) {
			/* Skip ones already in reclaim */
			if (ip->i_flags & XFS_IRECLAIM) {
				ip = ip->i_mnext;
				continue;
			}
			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
				/* Could not lock it without sleeping; move on. */
				ip = ip->i_mnext;
			} else if ((xfs_ipincount(ip) == 0) &&
				    xfs_iflock_nowait(ip)) {
				/*
				 * Unpinned and flush-locked: drop the list
				 * lock (leaving the marker) and reclaim it.
				 */
				IPOINTER_INSERT(ip, mp);

				xfs_finish_reclaim(ip, 1,
						XFS_IFLUSH_DELWRI_ELSE_ASYNC);

				XFS_MOUNT_ILOCK(mp);
				mount_locked = B_TRUE;
				IPOINTER_REMOVE(ip, mp);
			} else {
				/* Pinned or already being flushed; skip it. */
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				ip = ip->i_mnext;
			}
			continue;
		}

		/* After a forced shutdown only the SYNC_CLOSE caller carries on. */
		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
			XFS_MOUNT_IUNLOCK(mp);
			kmem_free(ipointer, sizeof(xfs_iptr_t));
			return 0;
		}

		/*
		 * If this is just vfs_sync() or pflushd() calling
		 * then we can skip inodes for which it looks like
		 * there is nothing to do.  Since we don't have the
		 * inode locked this is racy, but these are periodic
		 * calls so it doesn't matter.  For the others we want
		 * to know for sure, so we at least try to lock them.
		 */
		if (flags & SYNC_BDFLUSH) {
			if (((ip->i_itemp == NULL) ||
			     !(ip->i_itemp->ili_format.ilf_fields &
			       XFS_ILOG_ALL)) &&
			    (ip->i_update_core == 0)) {
				ip = ip->i_mnext;
				continue;
			}
		}

		/*
		 * Try to lock without sleeping.  We're out of order with
		 * the inode list lock here, so if we fail we need to drop
		 * the mount lock and try again.  If we're called from
		 * bdflush() here, then don't bother.
		 *
		 * The inode lock here actually coordinates with the
		 * almost spurious inode lock in xfs_ireclaim() to prevent
		 * the vnode we handle here without a reference from
		 * being freed while we reference it.  If we lock the inode
		 * while it's on the mount list here, then the spurious inode
		 * lock in xfs_ireclaim() after the inode is pulled from
		 * the mount list will sleep until we release it here.
		 * This keeps the vnode from being freed while we reference
		 * it.  It is also cheaper and simpler than actually doing
		 * a vn_get() for every inode we touch here.
		 */
		if (xfs_ilock_nowait(ip, lock_flags) == 0) {

			if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
				ip = ip->i_mnext;
				continue;
			}

			/*
			 * We need to unlock the inode list lock in order
			 * to lock the inode. Insert a marker record into
			 * the inode list to remember our position, dropping
			 * the lock is now done inside the IPOINTER_INSERT
			 * macro.
			 *
			 * We also use the inode list lock to protect us
			 * in taking a snapshot of the vnode version number
			 * for use in calling vn_get().
			 */
			VMAP(vp, vmap);
			IPOINTER_INSERT(ip, mp);

			vp = vn_get(vp, &vmap);
			if (vp == NULL) {
				/*
				 * The vnode was reclaimed once we let go
				 * of the inode list lock.  Skip to the
				 * next list entry. Remove the marker.
				 */

				XFS_MOUNT_ILOCK(mp);

				mount_locked = B_TRUE;
				vnode_refed  = B_FALSE;

				IPOINTER_REMOVE(ip, mp);

				continue;
			}

			/* We hold a vnode reference now, so a blocking lock is fine. */
			xfs_ilock(ip, lock_flags);

			ASSERT(vp == XFS_ITOV(ip));
			ASSERT(ip->i_mount == mp);

			vnode_refed = B_TRUE;
		}

		/* From here on in the loop we may have a marker record
		 * in the inode list.
		 */

		if ((flags & SYNC_CLOSE)  && (vp != NULL)) {
			/*
			 * This is the shutdown case.  We just need to
			 * flush and invalidate all the pages associated
			 * with the inode.  Drop the inode lock since
			 * we can't hold it across calls to the buffer
			 * cache.
			 *
			 * We don't set the VREMAPPING bit in the vnode
			 * here, because we don't hold the vnode lock
			 * exclusively.  It doesn't really matter, though,
			 * because we only come here when we're shutting
			 * down anyway.
			 */
			xfs_iunlock(ip, XFS_ILOCK_SHARED);

			if (XFS_FORCED_SHUTDOWN(mp)) {
				VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
			} else {
				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
			}

			xfs_ilock(ip, XFS_ILOCK_SHARED);

		} else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
			if (VN_DIRTY(vp)) {
				/* We need to have dropped the lock here,
				 * so insert a marker if we have not already
				 * done so.
				 */
				if (mount_locked) {
					IPOINTER_INSERT(ip, mp);
				}

				/*
				 * Drop the inode lock since we can't hold it
				 * across calls to the buffer cache.
				 */
				xfs_iunlock(ip, XFS_ILOCK_SHARED);
				VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
							fflag, FI_NONE, error);
				xfs_ilock(ip, XFS_ILOCK_SHARED);
			}

		}

		if (flags & SYNC_BDFLUSH) {
			if ((flags & SYNC_ATTR) &&
			    ((ip->i_update_core) ||
			     ((ip->i_itemp != NULL) &&
			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {

				/* Insert marker and drop lock if not already
				 * done.
				 */
				if (mount_locked) {
					IPOINTER_INSERT(ip, mp);
				}

				/*
				 * We don't want the periodic flushing of the
				 * inodes by vfs_sync() to interfere with
				 * I/O to the file, especially read I/O
				 * where it is only the access time stamp
				 * that is being flushed out.  To prevent
				 * long periods where we have both inode
				 * locks held shared here while reading the
				 * inode's buffer in from disk, we drop the
				 * inode lock while reading in the inode
				 * buffer.  We have to release the buffer
				 * and reacquire the inode lock so that they
				 * are acquired in the proper order (inode
				 * locks first).  The buffer will go at the
				 * end of the lru chain, though, so we can
				 * expect it to still be there when we go
				 * for it again in xfs_iflush().
				 */
				if ((xfs_ipincount(ip) == 0) &&
				    xfs_iflock_nowait(ip)) {

					xfs_ifunlock(ip);
					xfs_iunlock(ip, XFS_ILOCK_SHARED);

					error = xfs_itobp(mp, NULL, ip,
							  &dip, &bp, 0);
					if (!error) {
						xfs_buf_relse(bp);
					} else {
						/* Bailing out, remove the
						 * marker and free it.
						 * Note that 0, not the
						 * xfs_itobp() error, is
						 * returned here.
						 */
						XFS_MOUNT_ILOCK(mp);

						IPOINTER_REMOVE(ip, mp);

						XFS_MOUNT_IUNLOCK(mp);

						ASSERT(!(lock_flags &
							XFS_IOLOCK_SHARED));

						kmem_free(ipointer,
							sizeof(xfs_iptr_t));
						return (0);
					}

					/*
					 * Since we dropped the inode lock,
					 * the inode may have been reclaimed.
					 * Therefore, we reacquire the mount
					 * lock and check to see if we were the
					 * inode reclaimed. If this happened
					 * then the ipointer marker will no
					 * longer point back at us. In this
					 * case, move ip along to the inode
					 * after the marker, remove the marker
					 * and continue.
					 */
					XFS_MOUNT_ILOCK(mp);
					mount_locked = B_TRUE;

					if (ip != ipointer->ip_mprev) {
						IPOINTER_REMOVE(ip, mp);

						ASSERT(!vnode_refed);
						ASSERT(!(lock_flags &
							XFS_IOLOCK_SHARED));
						continue;
					}

					ASSERT(ip->i_mount == mp);

					if (xfs_ilock_nowait(ip,
						    XFS_ILOCK_SHARED) == 0) {
						ASSERT(ip->i_mount == mp);
						/*
						 * We failed to reacquire
						 * the inode lock without
						 * sleeping, so just skip
						 * the inode for now.  We
						 * clear the ILOCK bit from
						 * the lock_flags so that we
						 * won't try to drop a lock
						 * we don't hold below.
						 *
						 * The marker is unlinked
						 * into the scratch ip_next
						 * so that ip itself is left
						 * untouched.
						 */
						lock_flags &= ~XFS_ILOCK_SHARED;
						IPOINTER_REMOVE(ip_next, mp);
					} else if ((xfs_ipincount(ip) == 0) &&
						   xfs_iflock_nowait(ip)) {
						ASSERT(ip->i_mount == mp);
						/*
						 * Since this is vfs_sync()
						 * calling we only flush the
						 * inode out if we can lock
						 * it without sleeping and
						 * it is not pinned.  Drop
						 * the mount lock here so
						 * that we don't hold it for
						 * too long. We already have
						 * a marker in the list here.
						 */
						XFS_MOUNT_IUNLOCK(mp);
						mount_locked = B_FALSE;
						error = xfs_iflush(ip,
							   XFS_IFLUSH_DELWRI);
					} else {
						/* Pinned or flush-locked: leave it for later. */
						ASSERT(ip->i_mount == mp);
						IPOINTER_REMOVE(ip_next, mp);
					}
				}

			}

		} else {
			if ((flags & SYNC_ATTR) &&
			    ((ip->i_update_core) ||
			     ((ip->i_itemp != NULL) &&
			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {
				if (mount_locked) {
					IPOINTER_INSERT(ip, mp);
				}

				if (flags & SYNC_WAIT) {
					/* Synchronous: wait for the flush lock. */
					xfs_iflock(ip);
					error = xfs_iflush(ip,
							   XFS_IFLUSH_SYNC);
				} else {
					/*
					 * If we can't acquire the flush
					 * lock, then the inode is already
					 * being flushed so don't bother
					 * waiting.  If we can lock it then
					 * do a delwri flush so we can
					 * combine multiple inode flushes
					 * in each disk write.
					 */
					if (xfs_iflock_nowait(ip)) {
						error = xfs_iflush(ip,
							   XFS_IFLUSH_DELWRI);
					}
					else if (bypassed)
						(*bypassed)++;
				}
			}
		}

		/* lock_flags may have had bits cleared above; drop what remains. */
		if (lock_flags != 0) {
			xfs_iunlock(ip, lock_flags);
		}

		if (vnode_refed) {
			/*
			 * If we had to take a reference on the vnode
			 * above, then wait until after we've unlocked
			 * the inode to release the reference.  This is
			 * because we can be already holding the inode
			 * lock when VN_RELE() calls xfs_inactive().
			 *
			 * Make sure to drop the mount lock before calling
			 * VN_RELE() so that we don't trip over ourselves if
			 * we have to go for the mount lock again in the
			 * inactive code.
			 */
			if (mount_locked) {
				IPOINTER_INSERT(ip, mp);
			}

			VN_RELE(vp);

			vnode_refed = B_FALSE;
		}

		/* Remember the most recent failure for the final return value. */
		if (error) {
			last_error = error;
		}

		/*
		 * bail out if the filesystem is corrupted.
		 */
		if (error == EFSCORRUPTED)  {
			if (!mount_locked) {
				XFS_MOUNT_ILOCK(mp);
				IPOINTER_REMOVE(ip, mp);
			}
			XFS_MOUNT_IUNLOCK(mp);
			ASSERT(ipointer_in == B_FALSE);
			kmem_free(ipointer, sizeof(xfs_iptr_t));
			return XFS_ERROR(error);
		}

		/* Let other threads have a chance at the mount lock
		 * if we have looped many times without dropping the
		 * lock.
		 */
		if ((++preempt & XFS_PREEMPT_MASK) == 0) {
			if (mount_locked) {
				IPOINTER_INSERT(ip, mp);
			}
		}

		/*
		 * If the list lock was dropped at any point, retake it and
		 * resume from the inode that now follows the marker.
		 */
		if (mount_locked == B_FALSE) {
			XFS_MOUNT_ILOCK(mp);
			mount_locked = B_TRUE;
			IPOINTER_REMOVE(ip, mp);
			continue;
		}

		ASSERT(ipointer_in == B_FALSE);
		ip = ip->i_mnext;

	} while (ip != mp->m_inodes);

	XFS_MOUNT_IUNLOCK(mp);

	ASSERT(ipointer_in == B_FALSE);

	kmem_free(ipointer, sizeof(xfs_iptr_t));
	return XFS_ERROR(last_error);
}
/*
 * Try to push an inode log item: flush the inode to its backing buffer
 * and queue that buffer on the caller's delayed-write list.
 *
 * Every lock is taken with a trylock.  The result is one of:
 *   XFS_ITEM_PINNED   - the inode is pinned, or it is stale;
 *   XFS_ITEM_LOCKED   - the shared ilock could not be taken;
 *   XFS_ITEM_FLUSHING - the flush lock is already held elsewhere, or the
 *                       buffer could not be added to buffer_list;
 *   XFS_ITEM_SUCCESS  - otherwise (including when xfs_iflush() fails).
 *
 * The AIL lock is released around the xfs_iflush() call and retaken
 * afterwards, so it is expected to be held on entry.
 */
STATIC uint
xfs_inode_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_inode_log_item *item = INODE_ITEM(lip);
	struct xfs_inode	*inode = item->ili_inode;
	struct xfs_buf		*flush_bp = NULL;
	uint			status;
	int			err;

	/* Cheap unlocked check first; it is repeated under the ilock. */
	if (xfs_ipincount(inode) > 0)
		return XFS_ITEM_PINNED;

	if (!xfs_ilock_nowait(inode, XFS_ILOCK_SHARED))
		return XFS_ITEM_LOCKED;

	if (xfs_ipincount(inode) > 0 || (inode->i_flags & XFS_ISTALE)) {
		/*
		 * The pin count is stable now that the ilock is held.
		 * Stale inode items are reported as pinned as well so
		 * that the iclog gets forced out.
		 */
		status = XFS_ITEM_PINNED;
	} else if (!xfs_iflock_nowait(inode)) {
		/*
		 * A flush is already in progress; all we can do is let
		 * it finish and take the item off the AIL.
		 */
		status = XFS_ITEM_FLUSHING;
	} else {
		status = XFS_ITEM_SUCCESS;

		ASSERT(item->ili_fields != 0 ||
		       XFS_FORCED_SHUTDOWN(inode->i_mount));
		ASSERT(item->ili_logged == 0 ||
		       XFS_FORCED_SHUTDOWN(inode->i_mount));

		spin_unlock(&lip->li_ailp->xa_lock);

		err = xfs_iflush(inode, &flush_bp);
		if (!err) {
			if (!xfs_buf_delwri_queue(flush_bp, buffer_list))
				status = XFS_ITEM_FLUSHING;
			xfs_buf_relse(flush_bp);
		}

		spin_lock(&lip->li_ailp->xa_lock);
	}

	xfs_iunlock(inode, XFS_ILOCK_SHARED);
	return status;
}
/* * This is the iodone() function for buffers which have had callbacks * attached to them by xfs_buf_attach_iodone(). It should remove each * log item from the buffer's list and call the callback of each in turn. * When done, the buffer's fsprivate field is set to NULL and the buffer * is unlocked with a call to iodone(). */ void xfs_buf_iodone_callbacks( struct xfs_buf *bp) { struct xfs_log_item *lip = bp->b_fspriv; struct xfs_mount *mp = lip->li_mountp; static ulong lasttime; static xfs_buftarg_t *lasttarg; if (likely(!xfs_buf_geterror(bp))) goto do_callbacks; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_stale(bp); XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_buf_ioerror_alert(bp, __func__); } lasttarg = bp->b_target; /* * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * * XXX: This helps against transient write errors, but we need to find * a way to shut the filesystem down if the writes keep failing. * * In practice we'll shut the filesystem down soon as non-transient * erorrs tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); } return; } /* * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). 
*/ xfs_buf_stale(bp); XFS_BUF_DONE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); }
/*
 * Called after IOP_TRYLOCK returned XFS_ITEM_PUSHBUF, which means that we
 * hold the dquot lock but not its flush lock.  Look for the dquot's buffer
 * in the buffer cache; if it is there and waiting on a delayed write, push
 * it out so that this item can leave the AIL sooner.
 *
 * The AIL_LOCK must not be held here: searching the buffer cache can take a
 * while and AIL_LOCK is a spinlock.
 *
 * On every path the pushbuf flag is cleared and the dquot is unlocked
 * before returning.
 */
STATIC void
xfs_qm_dquot_logitem_pushbuf(
	xfs_dq_logitem_t    *qip)
{
	xfs_dquot_t	*dquot = qip->qli_dquot;
	xfs_mount_t	*mount;
	xfs_buf_t	*dqbuf;
	uint		want_push;

	ASSERT(XFS_DQ_IS_LOCKED(dquot));

	/*
	 * qli_pushbuf_flag stops anyone else from duplicating this work,
	 * and we must be the thread that set it.
	 */
	ASSERT(qip->qli_pushbuf_flag != 0);
	ASSERT(qip->qli_push_owner == get_thread_id());

	/*
	 * If the flush lock is free again, or the item has left the AIL,
	 * the flush most likely completed already and nothing is left to do.
	 */
	if ((valusema(&(dquot->q_flock)) > 0) ||
	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
		goto out_unlock;
	}

	mount = dquot->q_mount;
	dqbuf = xfs_incore(mount->m_ddev_targ, qip->qli_format.qlf_blkno,
		    XFS_QI_DQCHUNKLEN(mount),
		    XFS_INCORE_TRYLOCK);
	if (dqbuf == NULL) {
		goto out_unlock;
	}

	if (!XFS_BUF_ISDELAYWRITE(dqbuf)) {
		/* The buffer is not waiting on a delayed write; let it go. */
		qip->qli_pushbuf_flag = 0;
		xfs_dqunlock(dquot);
		xfs_buf_relse(dqbuf);
		return;
	}

	/* Decide whether to write while the dquot lock is still held. */
	want_push = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
		     (valusema(&(dquot->q_flock)) <= 0));
	qip->qli_pushbuf_flag = 0;
	xfs_dqunlock(dquot);

	/* A pinned buffer needs the log forced first. */
	if (XFS_BUF_ISPINNED(dqbuf)) {
		xfs_log_force(mount, (xfs_lsn_t)0,
			      XFS_LOG_FORCE);
	}
	if (want_push) {
#ifdef XFSRACEDEBUG
		delay_for_intr();
		delay(300);
#endif
		xfs_bawrite(mount, dqbuf);
	} else {
		xfs_buf_relse(dqbuf);
	}
	return;

out_unlock:
	qip->qli_pushbuf_flag = 0;
	xfs_dqunlock(dquot);
}
/* * This is called to unpin the buffer associated with the buf log * item which was previously pinned with a call to xfs_buf_item_pin(). * * Also drop the reference to the buf item for the current transaction. * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. * * If the remove flag is set we are called from uncommit in the * forced-shutdown path. If that is true and the reference count on * the log item is going to drop to zero we need to free the item's * descriptor in the transaction. */ STATIC void xfs_buf_item_unpin( struct xfs_log_item *lip, int remove) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); xfs_buf_t *bp = bip->bli_buf; struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; ASSERT(bp->b_fspriv == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); if (remove) { /* * If we are in a transaction context, we have to * remove the log item from the transaction as we are * about to release our reference to the buffer. If we * don't, the unlock that occurs later in * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ if (lip->li_desc) xfs_trans_del_item(lip); /* * Since the transaction no longer refers to the buffer, * the buffer should no longer refer to the transaction. */ bp->b_transp = NULL; } /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_ail_delete() * will take care of that situation. * xfs_trans_ail_delete() drops the AIL lock. 
*/ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { /* * There are currently two references to the buffer - the active * LRU reference and the buf log item. What we are about to do * here - simulate a failed IO completion - requires 3 * references. * * The LRU reference is removed by the xfs_buf_stale() call. The * buf item reference is removed by the xfs_buf_iodone() * callback that is run by xfs_buf_do_callbacks() during ioend * processing (via the bp->b_iodone callback), and then finally * the ioend processing will drop the IO reference if the buffer * is marked XBF_ASYNC. * * Hence we need to take an additional reference here so that IO * completion processing doesn't free the buffer prematurely. */ xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); } }
/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. * * This call ignores the recursion count. It is only called when the buffer * should REALLY be unlocked, regardless of the recursion count. * * We unconditionally drop the transaction's reference to the log item. If the * item was logged, then another reference was taken when it was pinned, so we * can safely drop the transaction reference now. This also allows us to avoid * potential races with the unpin code freeing the bli by not referencing the * bli after we've dropped the reference count. * * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. */ STATIC void xfs_buf_item_unlock( struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; bool clean; bool aborted; int flags; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* * If this is a transaction abort, don't return early. Instead, allow * the brelse to happen. Normally it would be done for stale * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* * Before possibly freeing the buf item, copy the per-transaction state * so we can reference it safely later after clearing it from the * buffer log item. */ flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. 
*/ if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; } } trace_xfs_buf_item_unlock(bip); /* * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. If we are aborting the transaction, this may * be the only reference to the buf item, so we free it anyway * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * * Ordered buffers are dirty but may have no recorded changes, so ensure * we only release clean items here. */ clean = (flags & XFS_BLI_DIRTY) ? false : true; if (clean) { int i; for (i = 0; i < bip->bli_format_count; i++) { if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, bip->bli_formats[i].blf_map_size)) { clean = false; break; } } } /* * Clean buffers, by definition, cannot be in the AIL. However, aborted * buffers may be dirty and hence in the AIL. Therefore if we are * aborting a buffer and we've just taken the last refernce away, we * have to check if it is in the AIL before freeing it. We need to free * it in this case, because an aborted transaction has already shut the * filesystem down and this is the last chance we will have to do so. */ if (atomic_dec_and_test(&bip->bli_refcount)) { if (clean) xfs_buf_item_relse(bp); else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); if (lip->li_flags & XFS_LI_IN_AIL) { spin_lock(&lip->li_ailp->xa_lock); xfs_trans_ail_delete(lip->li_ailp, lip, SHUTDOWN_LOG_IO_ERROR); } xfs_buf_item_relse(bp); } } if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); }
/* * This is called to unpin the buffer associated with the buf log * item which was previously pinned with a call to xfs_buf_item_pin(). * * Also drop the reference to the buf item for the current transaction. * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. * * If the remove flag is set we are called from uncommit in the * forced-shutdown path. If that is true and the reference count on * the log item is going to drop to zero we need to free the item's * descriptor in the transaction. */ STATIC void xfs_buf_item_unpin( struct xfs_log_item *lip, int remove) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); xfs_buf_t *bp = bip->bli_buf; struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); if (remove) { /* * We have to remove the log item from the transaction * as we are about to release our reference to the * buffer. If we don't, the unlock that occurs later * in xfs_trans_uncommit() will ry to reference the * buffer which we no longer have a hold on. */ xfs_trans_del_item(lip); /* * Since the transaction no longer refers to the buffer, * the buffer should no longer refer to the transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); } /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_ail_delete() * will take care of that situation. * xfs_trans_ail_delete() drops the AIL lock. 
*/ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); xfs_buf_item_relse(bp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); } xfs_buf_relse(bp); }
/* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. */ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; int done; trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; struct xfs_buf *bp; xfs_daddr_t dblkno; int dblkcnt; int nmap; /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } /* * Keep de-allocating extents until the remote-value region is gone. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; done = 0; while (!done) { error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); if (error) goto out_defer_cancel; error = xfs_defer_finish(&args->trans); if (error) return error; /* * Close out trans and start the next one in the chain. */ error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; } return 0; out_defer_cancel: xfs_defer_cancel(args->trans); return error; }