static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
        trim_map_t *tm = vd->vdev_trimmap;
        trim_seg_t *ts;
        uint64_t size, offset, txgtarget, txgsafe;
        int64_t hard, soft;
        hrtime_t timelimit;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (tm == NULL)
                return;

        timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
        if (vd->vdev_isl2cache) {
                txgsafe = UINT64_MAX;
                txgtarget = UINT64_MAX;
        } else {
                txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
                if (txgsafe > trim_txg_delay)
                        txgtarget = txgsafe - trim_txg_delay;
                else
                        txgtarget = 0;
        }

        mutex_enter(&tm->tm_lock);
        hard = 0;
        if (tm->tm_pending > trim_vdev_max_pending)
                hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
        soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
        /* Loop until we have sent all outstanding frees */
        while (soft > 0 &&
            (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit,
            hard > 0)) != NULL) {
                TRIM_MAP_REM(tm, ts);
                avl_remove(&tm->tm_queued_frees, ts);
                avl_add(&tm->tm_inflight_frees, ts);
                size = ts->ts_end - ts->ts_start;
                offset = ts->ts_start;
                /*
                 * We drop the lock while we call zio_nowait as the IO
                 * scheduler can result in a different IO being run e.g.
                 * a write which would result in a recursive lock.
                 */
                mutex_exit(&tm->tm_lock);
                zio_nowait(zio_trim(zio, spa, vd, offset, size));
                soft -= TRIM_MAP_SEGS(size);
                hard -= TRIM_MAP_SEGS(size);
                mutex_enter(&tm->tm_lock);
        }
        mutex_exit(&tm->tm_lock);
}
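/*
 * Illustrative sketch (not part of the original source): the "hard"/"soft"
 * throttling arithmetic used above, reduced to a standalone program.  The
 * hard limit only kicks in once the number of pending TRIM segments exceeds
 * trim_vdev_max_pending, and the soft limit is rounded up to a multiple of
 * 64 segments per commit pass.  The tunable values below are assumptions
 * chosen purely for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

int
main(void)
{
        int64_t trim_vdev_max_pending = 10000;  /* assumed tunable value */
        int64_t trim_timeout = 30;              /* assumed tunable value */
        int64_t tm_pending = 12000;             /* queued TRIM segments */
        int64_t hard, soft;

        hard = 0;
        if (tm_pending > trim_vdev_max_pending)
                hard = (tm_pending - trim_vdev_max_pending) / 4;
        soft = P2ROUNDUP(hard + tm_pending / trim_timeout + 1, 64);

        /* hard = 500, soft = 960: issue at most ~960 segments this pass */
        printf("hard=%lld soft=%lld\n", (long long)hard, (long long)soft);
        return (0);
}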
int
zil_claim(char *osname, void *txarg)
{
        dmu_tx_t *tx = txarg;
        uint64_t first_txg = dmu_tx_get_txg(tx);
        zilog_t *zilog;
        zil_header_t *zh;
        objset_t *os;
        int error;

        error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
        if (error) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }

        zilog = dmu_objset_zil(os);
        zh = zil_header_in_syncing_context(zilog);

        /*
         * Record here whether the zil has any records to replay.
         * If the header block pointer is null or the block points
         * to the stubby then we know there are no valid log records.
         * We use the header to store this state as the zilog gets
         * freed later in dmu_objset_close().
         * The flags (and the rest of the header fields) are cleared in
         * zil_sync() as a result of a zil_destroy(), after replaying the log.
         *
         * Note, the intent log can be empty but still need the
         * stubby to be claimed.
         */
        if (!zil_empty(zilog))
                zh->zh_flags |= ZIL_REPLAY_NEEDED;

        /*
         * Claim all log blocks if we haven't already done so, and remember
         * the highest claimed sequence number.  This ensures that if we can
         * read only part of the log now (e.g. due to a missing device),
         * but we can read the entire log later, we will not try to replay
         * or destroy beyond the last block we successfully claimed.
         */
        ASSERT3U(zh->zh_claim_txg, <=, first_txg);
        if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
                zh->zh_claim_txg = first_txg;
                zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
                    zil_claim_log_record, tx, first_txg);
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
        }

        ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
        dmu_objset_close(os);
        return (0);
}
/*
 * If there are any in-memory intent log transactions which have now been
 * synced then start up a taskq to free them.
 */
void
zil_clean(zilog_t *zilog)
{
        itx_t *itx;

        mutex_enter(&zilog->zl_lock);
        itx = list_head(&zilog->zl_itx_list);
        if ((itx != NULL) &&
            (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
                (void) taskq_dispatch(zilog->zl_clean_taskq,
                    (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
        }
        mutex_exit(&zilog->zl_lock);
}
/*
 * Update all disk labels, generate a fresh config based on the current
 * in-core state, and sync the global config cache (do not sync the config
 * cache if this is a booting rootpool).
 */
void
spa_config_update(spa_t *spa, int what)
{
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t txg;
        int c;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        txg = spa_last_synced_txg(spa) + 1;
        if (what == SPA_CONFIG_UPDATE_POOL) {
                vdev_config_dirty(rvd);
        } else {
                /*
                 * If we have top-level vdevs that were added but have
                 * not yet been prepared for allocation, do that now.
                 * (It's safe now because the config cache is up to date,
                 * so it will be able to translate the new DVAs.)
                 * See comments in spa_vdev_add() for full details.
                 */
                for (c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *tvd = rvd->vdev_child[c];

                        if (tvd->vdev_ms_array == 0)
                                vdev_metaslab_set_size(tvd);
                        vdev_expand(tvd, txg);
                }
        }
        spa_config_exit(spa, SCL_ALL, FTAG);

        /*
         * Wait for the mosconfig to be regenerated and synced.
         */
        txg_wait_synced(spa->spa_dsl_pool, txg);

        /*
         * Update the global config cache to reflect the new mosconfig.
         */
        if (!spa->spa_is_root) {
                spa_write_cachefile(spa, B_FALSE,
                    what != SPA_CONFIG_UPDATE_POOL);
        }

        if (what == SPA_CONFIG_UPDATE_POOL)
                spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
        trim_map_t *tm = vd->vdev_trimmap;
        trim_seg_t *ts;
        uint64_t size, txgtarget, txgsafe;
        hrtime_t timelimit;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (tm == NULL)
                return;

        timelimit = gethrtime() - trim_timeout * NANOSEC;
        if (vd->vdev_isl2cache) {
                txgsafe = UINT64_MAX;
                txgtarget = UINT64_MAX;
        } else {
                txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
                if (txgsafe > trim_txg_delay)
                        txgtarget = txgsafe - trim_txg_delay;
                else
                        txgtarget = 0;
        }

        mutex_enter(&tm->tm_lock);
        /* Loop until we have sent all outstanding frees */
        while ((ts = trim_map_first(tm, txgtarget, txgsafe,
            timelimit)) != NULL) {
                list_remove(&tm->tm_head, ts);
                avl_remove(&tm->tm_queued_frees, ts);
                avl_add(&tm->tm_inflight_frees, ts);
                size = ts->ts_end - ts->ts_start;
                zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
                TRIM_MAP_SDEC(tm, size);
                TRIM_MAP_QDEC(tm);
        }
        mutex_exit(&tm->tm_lock);
}
/*
 * Free up all in-memory intent log transactions that have now been synced.
 */
static void
zil_itx_clean(zilog_t *zilog)
{
        uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
        uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
        list_t clean_list;
        itx_t *itx;

        list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

        mutex_enter(&zilog->zl_lock);
        /* wait for a log writer to finish walking list */
        while (zilog->zl_writer) {
                cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
        }

        /*
         * Move the sync'd log transactions to a separate list so we can call
         * kmem_free without holding the zl_lock.
         *
         * There is no need to set zl_writer as we don't drop zl_lock here.
         */
        while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
            itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
                list_remove(&zilog->zl_itx_list, itx);
                zilog->zl_itx_list_sz -= itx->itx_sod;
                list_insert_tail(&clean_list, itx);
        }

        cv_broadcast(&zilog->zl_cv_writer);
        mutex_exit(&zilog->zl_lock);

        /* destroy sync'd log transactions */
        while ((itx = list_head(&clean_list)) != NULL) {
                list_remove(&clean_list, itx);
                kmem_free(itx, offsetof(itx_t, itx_lr) +
                    itx->itx_lr.lrc_reclen);
        }
        list_destroy(&clean_list);
}
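/*
 * Illustrative sketch (not part of the original source): the pattern used by
 * zil_itx_clean() above, reduced to portable C.  Entries that are ready to be
 * freed are spliced onto a private list while the lock is held, and the
 * actual free happens after the lock is dropped, so the allocator is never
 * called under the mutex.  The types and names below are hypothetical; the
 * pending list is assumed to be ordered by ascending txg, as zl_itx_list is.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry    *next;
        unsigned long   txg;            /* txg this entry belongs to */
};

static pthread_mutex_t  list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry     *pending_head;  /* protected by list_lock */

static void
clean_synced(unsigned long synced_txg)
{
        struct entry *clean = NULL, *e;

        /* Move all synced entries onto a private list under the lock. */
        pthread_mutex_lock(&list_lock);
        while ((e = pending_head) != NULL && e->txg <= synced_txg) {
                pending_head = e->next;
                e->next = clean;
                clean = e;
        }
        pthread_mutex_unlock(&list_lock);

        /* Free them without holding the lock. */
        while ((e = clean) != NULL) {
                clean = e->next;
                free(e);
        }
}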
static void
zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
        uint64_t txg;
        uint64_t commit_seq = 0;
        itx_t *itx, *itx_next = (itx_t *)-1;
        lwb_t *lwb;
        spa_t *spa;

        zilog->zl_writer = B_TRUE;
        ASSERT(zilog->zl_root_zio == NULL);
        spa = zilog->zl_spa;

        if (zilog->zl_suspend) {
                lwb = NULL;
        } else {
                lwb = list_tail(&zilog->zl_lwb_list);
                if (lwb == NULL) {
                        /*
                         * Return if there's nothing to flush before we
                         * dirty the fs by calling zil_create()
                         */
                        if (list_is_empty(&zilog->zl_itx_list)) {
                                zilog->zl_writer = B_FALSE;
                                return;
                        }
                        mutex_exit(&zilog->zl_lock);
                        zil_create(zilog);
                        mutex_enter(&zilog->zl_lock);
                        lwb = list_tail(&zilog->zl_lwb_list);
                }
        }

        /* Loop through in-memory log transactions filling log blocks. */
        DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
        for (;;) {
                /*
                 * Find the next itx to push:
                 * Push all transactions related to specified foid and all
                 * other transactions except TX_WRITE, TX_TRUNCATE,
                 * TX_SETATTR and TX_ACL for all other files.
                 */
                if (itx_next != (itx_t *)-1)
                        itx = itx_next;
                else
                        itx = list_head(&zilog->zl_itx_list);
                for (; itx != NULL;
                    itx = list_next(&zilog->zl_itx_list, itx)) {
                        if (foid == 0) /* push all foids? */
                                break;
                        if (itx->itx_sync) /* push all O_[D]SYNC */
                                break;
                        switch (itx->itx_lr.lrc_txtype) {
                        case TX_SETATTR:
                        case TX_WRITE:
                        case TX_TRUNCATE:
                        case TX_ACL:
                                /* lr_foid is same offset for these records */
                                if (((lr_write_t *)&itx->itx_lr)->lr_foid !=
                                    foid) {
                                        continue; /* skip this record */
                                }
                        }
                        break;
                }
                if (itx == NULL)
                        break;

                if ((itx->itx_lr.lrc_seq > seq) &&
                    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
                    (lwb->lwb_nused + itx->itx_sod >
                    ZIL_BLK_DATA_SZ(lwb)))) {
                        break;
                }

                /*
                 * Save the next pointer.  Even though we soon drop
                 * zl_lock all threads that may change the list
                 * (another writer or zil_itx_clean) can't do so until
                 * they have zl_writer.
                 */
                itx_next = list_next(&zilog->zl_itx_list, itx);
                list_remove(&zilog->zl_itx_list, itx);
                zilog->zl_itx_list_sz -= itx->itx_sod;
                mutex_exit(&zilog->zl_lock);
                txg = itx->itx_lr.lrc_txg;
                ASSERT(txg);

                if (txg > spa_last_synced_txg(spa) ||
                    txg > spa_freeze_txg(spa))
                        lwb = zil_lwb_commit(zilog, itx, lwb);
                kmem_free(itx, offsetof(itx_t, itx_lr) +
                    itx->itx_lr.lrc_reclen);
                mutex_enter(&zilog->zl_lock);
        }
        DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
        /* determine commit sequence number */
        itx = list_head(&zilog->zl_itx_list);
        if (itx)
                commit_seq = itx->itx_lr.lrc_seq;
        else
                commit_seq = zilog->zl_itx_seq;
        mutex_exit(&zilog->zl_lock);

        /* write the last block out */
        if (lwb != NULL && lwb->lwb_zio != NULL)
                lwb = zil_lwb_write_start(zilog, lwb);

        zilog->zl_prev_used = zilog->zl_cur_used;
        zilog->zl_cur_used = 0;

        /*
         * Wait if necessary for the log blocks to be on stable storage.
         */
        if (zilog->zl_root_zio) {
                DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
                (void) zio_wait(zilog->zl_root_zio);
                zilog->zl_root_zio = NULL;
                DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
                zil_flush_vdevs(zilog);
        }

        if (zilog->zl_log_error || lwb == NULL) {
                zilog->zl_log_error = 0;
                txg_wait_synced(zilog->zl_dmu_pool, 0);
        }

        mutex_enter(&zilog->zl_lock);
        zilog->zl_writer = B_FALSE;

        ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
        zilog->zl_commit_seq = commit_seq;
}
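/*
 * Illustrative sketch (not part of the original source): the itx selection
 * rule from the inner loop of zil_commit_writer() above, written as a
 * standalone predicate.  The stripped-down type, the helper name
 * push_this_itx, and the enum values are hypothetical placeholders (the real
 * TX_* constants live in the ZIL headers); the real code open-codes this
 * logic directly in the loop.
 */
#include <stdint.h>

enum { TX_WRITE = 1, TX_TRUNCATE, TX_SETATTR, TX_ACL }; /* placeholder values */

typedef struct {
        uint64_t        lrc_txtype;     /* record type */
        uint64_t        lr_foid;        /* object the record applies to */
        int             sync;           /* O_SYNC / O_DSYNC record? */
} itx_sketch_t;

/*
 * Returns nonzero when the record must be pushed for a commit of "foid":
 * everything is pushed when foid is 0 or the record is synchronous, and
 * per-file record types are pushed only when they match the target file.
 */
static int
push_this_itx(const itx_sketch_t *itx, uint64_t foid)
{
        if (foid == 0 || itx->sync)
                return (1);

        switch (itx->lrc_txtype) {
        case TX_SETATTR:
        case TX_WRITE:
        case TX_TRUNCATE:
        case TX_ACL:
                return (itx->lr_foid == foid);
        default:
                return (1);
        }
}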