static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c, d; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; c = BP_GET_NDVAS(zio->io_bp); mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); mm->mm_children = c; mm->mm_replacing = B_FALSE; mm->mm_preferred = spa_get_random(c); mm->mm_root = B_TRUE; /* * Check the other, lower-index DVAs to see if they're on * the same vdev as the child we picked. If they are, use * them since they are likely to have been allocated from * the primary metaslab in use at the time, and hence are * more likely to have locality with single-copy data. */ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) mm->mm_preferred = d; } for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { c = vd->vdev_children; mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); mm->mm_children = c; mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); mm->mm_preferred = mm->mm_replacing ? 0 : (zio->io_offset >> vdev_mirror_shift) % c; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); }
static mirror_map_t * vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { mm = vdev_mirror_map_alloc(vd->vdev_children, (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops), B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); }
/* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, new_count, old_count); vdev_config_dirty(spa->spa_root_vdev); }
/* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); }
void zil_flush_vdevs(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; avl_tree_t *t = &zilog->zl_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; zio_t *zio; ASSERT(zilog->zl_writer); /* * We don't need zl_vdev_lock here because we're the zl_writer, * and all zl_get_data() callbacks are done. */ if (avl_numnodes(t) == 0) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) zio_flush(zio, vd); kmem_free(zv, sizeof (*zv)); } /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. */ (void) zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); }
/* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c, d; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; c = BP_GET_NDVAS(zio->io_bp); mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = B_FALSE; mm->mm_preferred = spa_get_random(c); mm->mm_root = B_TRUE; /* * Check the other, lower-index DVAs to see if they're on * the same vdev as the child we picked. If they are, use * them since they are likely to have been allocated from * the primary metaslab in use at the time, and hence are * more likely to have locality with single-copy data. */ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) mm->mm_preferred = d; } for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { int lowest_pending = INT_MAX; int lowest_nr = 1; c = vd->vdev_children; mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); mm->mm_preferred = 0; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; if (mm->mm_replacing) continue; if (!vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; mc->mc_skipped = 1; mc->mc_pending = INT_MAX; continue; } mc->mc_pending = vdev_mirror_pending(mc->mc_vd); if (mc->mc_pending < lowest_pending) { lowest_pending = mc->mc_pending; lowest_nr = 1; } else if (mc->mc_pending == lowest_pending) { lowest_nr++; } } d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us); d = (d % lowest_nr) + 1; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mm->mm_child[c].mc_pending == lowest_pending) { if (--d == 0) { mm->mm_preferred = c; break; } } } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); }
static mirror_map_t * vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering * device because it might not have those blocks. * * We are resilvering iff: * 1) We are a replacing vdev (ie our name is "replacing-1" or * "spare-1" or something like that), and * 2) The pool is currently being resilvered. * * We cannot simply check vd->vdev_resilver_txg, because it's * not set in this path. * * Nor can we just check our vdev_ops; there are cases (such as * when a user types "zpool replace pool odev spare_dev" and * spare_dev is in the spare list, or when a spare device is * automatically used to replace a DEGRADED device) when * resilvering is complete but both the original vdev and the * spare vdev remain in the pool. That behavior is intentional. * It helps implement the policy that a spare should be * automatically removed from the pool after the user replaces * the device that originally failed. * * If a spa load is in progress, then spa_dsl_pool may be * uninitialized. But we shouldn't be resilvering during a spa * load anyway. */ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops) && spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); }
/* ARGSUSED */ static int spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); space_map_update(prev_obsolete_sm); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return (0); VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); return (0); thread_exit(); }
static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c, d; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; c = BP_GET_NDVAS(zio->io_bp); mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); mm->mm_children = c; mm->mm_resilvering = B_FALSE; mm->mm_preferred = spa_get_random(c); mm->mm_root = B_TRUE; /* * Check the other, lower-index DVAs to see if they're on * the same vdev as the child we picked. If they are, use * them since they are likely to have been allocated from * the primary metaslab in use at the time, and hence are * more likely to have locality with single-copy data. */ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) mm->mm_preferred = d; } for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { int replacing; c = vd->vdev_children; mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); mm->mm_children = c; /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering * device because it might not have those blocks. * * We are resilvering iff: * 1) We are a replacing vdev (ie our name is "replacing-1" or * "spare-1" or something like that), and * 2) The pool is currently being resilvered. * * We cannot simply check vd->vdev_resilver_txg, because it's * not set in this path. * * Nor can we just check our vdev_ops; there are cases (such as * when a user types "zpool replace pool odev spare_dev" and * spare_dev is in the spare list, or when a spare device is * automatically used to replace a DEGRADED device) when * resilvering is complete but both the original vdev and the * spare vdev remain in the pool. That behavior is intentional. * It helps implement the policy that a spare should be * automatically removed from the pool after the user replaces * the device that originally failed. */ replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); /* * If a spa load is in progress, then spa_dsl_pool may be * uninitialized. But we shouldn't be resilvering during a spa * load anyway. */ if (replacing && (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) && dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) { mm->mm_resilvering = B_TRUE; } else { mm->mm_resilvering = B_FALSE; } mm->mm_preferred = mm->mm_resilvering ? 0 : (zio->io_offset >> vdev_mirror_shift) % c; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); }