/*
 * Return nonzero on i/o error.
 * Return new buf to write out in *bufp.
 */
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
    dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb,
    dmu_tx_t *tx, arc_buf_t **bufp)
{
	dsl_pool_t *dp = scn->scn_dp;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
	int err;

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			dsl_scan_visitbp(cbp, &czb, dnp, *bufp, ds, scn,
			    ostype, tx);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
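/*
 * Worked example (annotation, not part of the original source): epb in
 * dsl_scan_recurse() above is the number of child block pointers in an
 * indirect block.  A blkptr_t is 1 << SPA_BLKPTRSHIFT == 128 bytes, so a
 * 16K indirect block holds 16384 >> 7 == 128 children, and child i of
 * indirect block zb_blkid maps to logical blkid zb_blkid * epb + i at
 * the level below -- exactly the bookmark computed for each czb.
 */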
/* ARGSUSED */
static void
ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_impl_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

	ASSERT(bp == os->os_rootbp);
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
	ASSERT(BP_GET_LEVEL(bp) == 0);

	/*
	 * Update rootbp fill count.
	 */
	bp->blk_fill = 1;	/* count the meta-dnode */
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
	} else {
		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio->io_bp_orig, zio, os->os_synctx);
		dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
	}
}
/* ARGSUSED */
static void
ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	int i;
	blkptr_t *bp = zio->io_bp;
	objset_impl_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
	ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);

	ASSERT(bp == os->os_rootbp);
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
	ASSERT(BP_GET_LEVEL(bp) == 0);

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group accounting objects).
	 */
	bp->blk_fill = 0;
	for (i = 0; i < dnp->dn_nblkptr; i++)
		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
	} else {
		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio->io_bp_orig, zio, os->os_synctx);
		dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
	}
}
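/*
 * Annotation (not part of the original source): after this ready
 * callback runs, blk_fill on the objset's root block pointer equals the
 * number of allocated objects, since each meta-dnode block pointer
 * carries the fill count (allocated dnodes) of its subtree.  For
 * example, a dataset with 1000 allocated objects would sync with
 * os_rootbp->blk_fill == 1000 in this variant, or 1001 in the older
 * variant above, which starts the count at 1 to include the meta-dnode.
 */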
/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
    uint64_t objset, uint64_t object, uint64_t blkid)
{
	zbookmark_t czb;
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}
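/*
 * Usage note (annotation, not part of the original source): this is a
 * fire-and-forget read -- ARC_NOWAIT | ARC_PREFETCH with a NULL done
 * callback and private data -- so its only effect is to warm the ARC.
 * The subsequent ARC_WAIT read of the same bp in dsl_scan_recurse()
 * then completes from cache rather than stalling the scan thread on
 * disk I/O; zfs_no_scrub_prefetch disables this optimization.
 */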
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	uint64_t bytesfreed = 0;
	int i;

	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

	for (i = 0; i < num; i++, bp++) {
		uint64_t lsize, lvl;
		dmu_object_type_t type;

		if (BP_IS_HOLE(bp))
			continue;

		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

		/*
		 * Save some useful information on the holes being
		 * punched, including logical size, type, and indirection
		 * level.  Retaining birth time enables detection of when
		 * holes are punched for reducing the number of free
		 * records transmitted during a zfs send.
		 */
		lsize = BP_GET_LSIZE(bp);
		type = BP_GET_TYPE(bp);
		lvl = BP_GET_LEVEL(bp);
		bzero(bp, sizeof (blkptr_t));

		if (spa_feature_is_active(dn->dn_objset->os_spa,
		    SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, type);
			BP_SET_LEVEL(bp, lvl);
			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
		}
	}
	dnode_diduse_space(dn, -bytesfreed);
}
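/*
 * Hedged sketch (not in the original source): what a hole-birth hole
 * looks like after free_blocks().  With SPA_FEATURE_HOLE_BIRTH active,
 * the bp is zeroed and then re-tagged:
 *
 *	BP_IS_HOLE(bp)			still true (no DVAs are set)
 *	BP_GET_LSIZE(bp)		preserved logical size
 *	BP_GET_TYPE(bp) / BP_GET_LEVEL(bp)	preserved type and level
 *	bp->blk_birth			txg in which the hole was punched
 *
 * An incremental zfs send from a snapshot taken before that txg can
 * therefore recognize the hole as new and emit a FREE record for just
 * this range, instead of transmitting records for every existing hole.
 */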
/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

	ASSERT3P(bp, ==, os->os_rootbp);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
	ASSERT0(BP_GET_LEVEL(bp));

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group accounting objects).
	 */
	bp->blk_fill = 0;
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
}
/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg.  If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer().  zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	if (zio->io_error)
		zilog->zl_log_error = B_TRUE;

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.  We still have the
	 * zl_lock to ensure zil_sync doesn't kmem free the lwb.
	 */
	txg_rele_to_sync(&lwb->lwb_txgh);
	mutex_exit(&zilog->zl_lock);
}
static void
traverse_prefetch_metadata(traverse_data_t *td,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return;
	/*
	 * If we are in the process of resuming, don't prefetch, because
	 * some children will not be needed (and in fact may have already
	 * been freed).
	 */
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
		return;
	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
		return;
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
}
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	zbookmark_t czb;
	int err = 0, lasterr = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	if (bp->blk_birth == 0) {
		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
		    td->td_arg);
		return (err);
	}

	if (bp->blk_birth <= td->td_min_txg)
		return (0);

	if (pd && !pd->pd_exited &&
	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err)
			return (err);
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = dsl_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		/* recursively visitbp() blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	zbookmark_t czb;
	int err = 0, lasterr = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;
	boolean_t pause = B_FALSE;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (BP_IS_HOLE(bp)) {
		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
		return (err);
	}

	if (bp->blk_birth <= td->td_min_txg)
		return (0);

	if (pd && !pd->pd_exited &&
	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err == ERESTART)
			pause = B_TRUE; /* handle pausing at a common point */
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);
		cbp = buf->b_data;

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	zbookmark_phys_t czb;
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		cbp = buf->b_data;

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
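/*
 * Illustrative summary (annotation, not part of the original source) of
 * the skip logic in the function above, for a traversal with min_txg = T:
 *
 *	blk_birth == 0, hole birth enabled at txg <= T, and no object
 *	    reallocation possible	-> skip (hole has always existed)
 *	blk_birth == 0 otherwise	-> visit (cannot prove it is old)
 *	0 < blk_birth <= T		-> skip (predates the traversal)
 *	blk_birth > T			-> visit
 */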
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	zbookmark_phys_t czb;
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be a
		 * hole created before the SPA_FEATURE_HOLE_BIRTH
		 * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
		 * was enabled before the min_txg for this traversal we
		 * know the hole must have been created before the
		 * min_txg for this traversal, so we can skip it.  If
		 * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
		 * for this traversal we cannot tell if the hole was
		 * created before or after the min_txg for this
		 * traversal, so we cannot skip it.
		 */
		if (td->td_hole_birth_enabled_txg < td->td_min_txg)
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		cbp = buf->b_data;

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {