static void trim_thread(void *arg) { spa_t *spa = arg; zio_t *zio; #ifdef _KERNEL (void) snprintf(curthread->td_name, sizeof(curthread->td_name), "trim %s", spa_name(spa)); #endif for (;;) { mutex_enter(&spa->spa_trim_lock); if (spa->spa_trim_thread == NULL) { spa->spa_trim_thread = curthread; cv_signal(&spa->spa_trim_cv); mutex_exit(&spa->spa_trim_lock); thread_exit(); } (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, hz * trim_max_interval); mutex_exit(&spa->spa_trim_lock); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); trim_map_commit(spa, zio, spa->spa_root_vdev); (void) zio_wait(zio); trim_map_commit_done(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_STATE, FTAG); } }
static void process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) { struct process_old_arg poa = { 0 }; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t deadlist_obj; ASSERT(ds->ds_deadlist.dl_oldfmt); ASSERT(ds_next->ds_deadlist.dl_oldfmt); poa.ds = ds; poa.ds_prev = ds_prev; poa.after_branch_point = after_branch_point; poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, process_old_cb, &poa, tx)); VERIFY0(zio_wait(poa.pio)); ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); /* change snapused */ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -poa.used, -poa.comp, -poa.uncomp, tx); /* swap next's deadlist to our deadlist */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_close(&ds_next->ds_deadlist); deadlist_obj = ds->ds_phys->ds_deadlist_obj; ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&ds_next->ds_deadlist, mos, ds_next->ds_phys->ds_deadlist_obj); }
static void trim_thread(void *arg) { spa_t *spa = arg; zio_t *zio; for (;;) { mutex_enter(&spa->spa_trim_lock); if (spa->spa_trim_thread == NULL) { spa->spa_trim_thread = curthread; cv_signal(&spa->spa_trim_cv); mutex_exit(&spa->spa_trim_lock); thread_exit(); } cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); mutex_exit(&spa->spa_trim_lock); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); trim_map_commit(spa, zio, spa->spa_root_vdev); (void) zio_wait(zio); trim_map_commit_done(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_STATE, FTAG); } }
/* ARGSUSED */ static void zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { spa_t *spa = zilog->zl_spa; int err; /* * Claim log block if not already committed and not already claimed. */ if (bp->blk_birth >= first_txg && zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, ZIO_FLAG_MUSTSUCCEED)); ASSERT(err == 0); } }
/* * Read in the data for the dmu_sync()ed block, and change the log * record to write this whole block. */ void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr) { blkptr_t *wbp = &lr->lr_blkptr; char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t blksz; if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ blksz = BP_GET_LSIZE(&lr->lr_blkptr); /* * If the blksz is zero then we must be replaying a log * from an version prior to setting the blksize of null blocks. * So we just zero the actual write size reqeusted. */ if (blksz == 0) { bzero(wbuf, lr->lr_length); return; } bzero(wbuf, blksz); } else { /* * A subsequent write may have overwritten this block, in which * case wbp may have been been freed and reallocated, and our * read of wbp may fail with a checksum error. We can safely * ignore this because the later write will provide the * correct data. */ zbookmark_t zb; zb.zb_objset = dmu_objset_id(zilog->zl_os); zb.zb_object = lr->lr_foid; zb.zb_level = 0; zb.zb_blkid = -1; /* unknown */ blksz = BP_GET_LSIZE(&lr->lr_blkptr); (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); } lr->lr_offset -= lr->lr_offset % blksz; lr->lr_length = blksz; }
void zil_flush_vdevs(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; avl_tree_t *t = &zilog->zl_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; zio_t *zio; ASSERT(zilog->zl_writer); /* * We don't need zl_vdev_lock here because we're the zl_writer, * and all zl_get_data() callbacks are done. */ if (avl_numnodes(t) == 0) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) zio_flush(zio, vd); kmem_free(zv, sizeof (*zv)); } /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. */ (void) zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); }
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; if (len == 0) return; min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; if (dn) { uint64_t history[DN_MAX_LEVELS]; int nlvls = dn->dn_nlevels; int delta; /* * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { delta = dn->dn_datablksz; start = (off < dn->dn_datablksz) ? 0 : 1; end = (off+len <= dn->dn_datablksz) ? 0 : 1; if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err) goto out; } /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) goto out; } /* level-1 blocks */ if (nlvls > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; } } err = zio_wait(zio); if (err) goto out; delta = P2NPHASE(off, dn->dn_datablksz); } min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, * so we can make a more precise estimate. */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; } /* * If this write is not off the end of the file * we need to account for overwrites/unref. */ if (start <= dn->dn_maxblkid) { for (int l = 0; l < DN_MAX_LEVELS; l++) history[l] = -1ULL; } while (start <= dn->dn_maxblkid) { dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { txh->txh_tx->tx_err = err; return; } dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, history); dbuf_rele(db, FTAG); if (++start > end) { /* * Account for new indirects appearing * before this IO gets assigned into a txg. */ bits = 64 - min_bs; epbs = min_ibs - SPA_BLKPTRSHIFT; for (bits -= epbs * (nlvls - 1); bits >= 0; bits -= epbs) txh->txh_fudge += 1ULL << max_ibs; goto out; } off += delta; if (len >= delta) len -= delta; delta = dn->dn_datablksz; } }
/* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { char *data, *dlimit; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; char buf[SPA_MAXBLOCKSIZE]; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); if (txtype == TX_WRITE2 || verbose < 5) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, !BP_IS_HOLE(bp) && bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, prefix); if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); bzero(buf, sizeof (buf)); (void) printf("%s<hole>\n", prefix); return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s<block already committed>\n", prefix); return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) return; data = buf; } else { data = (char *)(lr + 1); } dlimit = data + MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); (void) printf("%s", prefix); while (data < dlimit) { if (isprint(*data)) (void) printf("%c ", *data); else (void) printf("%2hhX", *data); data++; } (void) printf("\n"); }
static void mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; mmp_thread_t *mmp = &spa->spa_mmp; boolean_t last_spa_suspended = spa_suspended(spa); boolean_t last_spa_multihost = spa_multihost(spa); callb_cpr_t cpr; hrtime_t max_fail_ns = zfs_multihost_fail_intervals * MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)); mmp_thread_enter(mmp, &cpr); /* * The mmp_write_done() function calculates mmp_delay based on the * prior value of mmp_delay and the elapsed time since the last write. * For the first mmp write, there is no "last write", so we start * with fake, but reasonable, default non-zero values. */ mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1); mmp->mmp_last_write = gethrtime() - mmp->mmp_delay; while (!mmp->mmp_thread_exiting) { uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals; uint64_t mmp_interval = MSEC2NSEC( MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)); boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); hrtime_t start, next_time; start = gethrtime(); if (multihost) { next_time = start + mmp_interval / MAX(vdev_count_leaves(spa), 1); } else { next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL); } /* * When MMP goes off => on, or spa goes suspended => * !suspended, we know no writes occurred recently. We * update mmp_last_write to give us some time to try. */ if ((!last_spa_multihost && multihost) || (last_spa_suspended && !suspended)) { mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mutex_exit(&mmp->mmp_io_lock); } else if (last_spa_multihost && !multihost) { mutex_enter(&mmp->mmp_io_lock); mmp->mmp_delay = 0; mutex_exit(&mmp->mmp_io_lock); } last_spa_multihost = multihost; last_spa_suspended = suspended; /* * Smooth max_fail_ns when its factors are decreased, because * making (max_fail_ns < mmp_interval) results in the pool being * immediately suspended before writes can occur at the new * higher frequency. */ if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) { max_fail_ns = ((31 * max_fail_ns) + (mmp_interval * mmp_fail_intervals)) / 32; } else { max_fail_ns = mmp_interval * mmp_fail_intervals; } /* * Suspend the pool if no MMP write has succeeded in over * mmp_interval * mmp_fail_intervals nanoseconds. */ if (!suspended && mmp_fail_intervals && multihost && (start - mmp->mmp_last_write) > max_fail_ns) { zio_suspend(spa, NULL); } if (multihost) mmp_write_uberblock(spa); CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock, ddi_get_lbolt() + ((next_time - gethrtime()) / (NANOSEC / hz))); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); } /* Outstanding writes are allowed to complete. */ if (mmp->mmp_zio_root) zio_wait(mmp->mmp_zio_root); mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); }
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; if (len == 0) return; min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; /* * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ if (dn) { if (dn->dn_maxblkid == 0) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err) goto out; } /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; if (end != start && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) goto out; } /* level-1 blocks */ if (dn->dn_nlevels > 1) { start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (i = start+1; i < end; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; } } err = zio_wait(zio); if (err) goto out; } }
static void zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; char *name; int pass, error; if (!zilog->zl_replay) /* giving up */ return; if (lr->lrc_txg < claim_txg) /* already committed */ return; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return; /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) { error = EINVAL; goto bad; } /* * Make a copy of the data so we can revise and extend it. */ bcopy(lr, zr->zr_lrbuf, reclen); /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different data types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lrbuf, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { lr_write_t *lrw = (lr_write_t *)lr; blkptr_t *wbp = &lrw->lr_blkptr; uint64_t wlen = lrw->lr_length; char *wbuf = zr->zr_lrbuf + reclen; if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ bzero(wbuf, wlen); } else { /* * A subsequent write may have overwritten this block, * in which case wbp may have been been freed and * reallocated, and our read of wbp may fail with a * checksum error. We can safely ignore this because * the later write will provide the correct data. */ zbookmark_t zb; zb.zb_objset = dmu_objset_id(zilog->zl_os); zb.zb_object = lrw->lr_foid; zb.zb_level = -1; zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); } } /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ for (pass = 1; pass <= 2; pass++) { zilog->zl_replaying_seq = lr->lrc_seq; /* Only byteswap (if needed) on the 1st pass. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, zr->zr_byteswap && pass == 1); if (!error) return; /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. */ if (pass == 1) txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); } bad: ASSERT(error); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, (lr->lrc_txtype & TX_CI) ? "CI" : ""); zilog->zl_replay = B_FALSE; kmem_free(name, MAXNAMELEN); }
static void zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; uint64_t commit_seq = 0; itx_t *itx, *itx_next = (itx_t *)-1; lwb_t *lwb; spa_t *spa; zilog->zl_writer = B_TRUE; ASSERT(zilog->zl_root_zio == NULL); spa = zilog->zl_spa; if (zilog->zl_suspend) { lwb = NULL; } else { lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { /* * Return if there's nothing to flush before we * dirty the fs by calling zil_create() */ if (list_is_empty(&zilog->zl_itx_list)) { zilog->zl_writer = B_FALSE; return; } mutex_exit(&zilog->zl_lock); zil_create(zilog); mutex_enter(&zilog->zl_lock); lwb = list_tail(&zilog->zl_lwb_list); } } /* Loop through in-memory log transactions filling log blocks. */ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); for (;;) { /* * Find the next itx to push: * Push all transactions related to specified foid and all * other transactions except TX_WRITE, TX_TRUNCATE, * TX_SETATTR and TX_ACL for all other files. */ if (itx_next != (itx_t *)-1) itx = itx_next; else itx = list_head(&zilog->zl_itx_list); for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { if (foid == 0) /* push all foids? */ break; if (itx->itx_sync) /* push all O_[D]SYNC */ break; switch (itx->itx_lr.lrc_txtype) { case TX_SETATTR: case TX_WRITE: case TX_TRUNCATE: case TX_ACL: /* lr_foid is same offset for these records */ if (((lr_write_t *)&itx->itx_lr)->lr_foid != foid) { continue; /* skip this record */ } } break; } if (itx == NULL) break; if ((itx->itx_lr.lrc_seq > seq) && ((lwb == NULL) || (lwb->lwb_nused == 0) || (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { break; } /* * Save the next pointer. Even though we soon drop * zl_lock all threads that may change the list * (another writer or zil_itx_clean) can't do so until * they have zl_writer. */ itx_next = list_next(&zilog->zl_itx_list, itx); list_remove(&zilog->zl_itx_list, itx); zilog->zl_itx_list_sz -= itx->itx_sod; mutex_exit(&zilog->zl_lock); txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); mutex_enter(&zilog->zl_lock); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* determine commit sequence number */ itx = list_head(&zilog->zl_itx_list); if (itx) commit_seq = itx->itx_lr.lrc_seq; else commit_seq = zilog->zl_itx_seq; mutex_exit(&zilog->zl_lock); /* write the last block out */ if (lwb != NULL && lwb->lwb_zio != NULL) lwb = zil_lwb_write_start(zilog, lwb); zilog->zl_prev_used = zilog->zl_cur_used; zilog->zl_cur_used = 0; /* * Wait if necessary for the log blocks to be on stable storage. */ if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); (void) zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); zil_flush_vdevs(zilog); } if (zilog->zl_log_error || lwb == NULL) { zilog->zl_log_error = 0; txg_wait_synced(zilog->zl_dmu_pool, 0); } mutex_enter(&zilog->zl_lock); zilog->zl_writer = B_FALSE; ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); zilog->zl_commit_seq = commit_seq; }
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } }