Example #1
0
/*
 * Build the mirror map for a zio and hang it off the zio as its
 * vdev-specific data (io_vsd / io_vsd_ops).
 *
 * When the zio has no vdev (zio->io_vd == NULL) the map gets one child per
 * DVA of the zio's block pointer and is flagged as a root map.  Otherwise
 * the map gets one child per child of the zio's vdev, and every child uses
 * the zio's own offset.
 *
 * Returns the newly allocated map.
 */
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *map;
	vdev_t *vd = zio->io_vd;
	int nchildren, i;

	if (vd != NULL) {
		nchildren = vd->vdev_children;

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
		    KM_SLEEP);
		map->mm_children = nchildren;
		map->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);

		/*
		 * Replacing/spare vdevs always start with child 0; everything
		 * else derives the starting child from the I/O offset.
		 */
		if (map->mm_replacing) {
			map->mm_preferred = 0;
		} else {
			map->mm_preferred =
			    (zio->io_offset >> vdev_mirror_shift) % nchildren;
		}
		map->mm_root = B_FALSE;

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vd->vdev_child[i];
			child->mc_offset = zio->io_offset;
		}
	} else {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		int picked, lower;

		nchildren = BP_GET_NDVAS(zio->io_bp);

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
		    KM_SLEEP);
		map->mm_children = nchildren;
		map->mm_replacing = B_FALSE;
		map->mm_preferred = spa_get_random(nchildren);
		map->mm_root = B_TRUE;

		/*
		 * Walk the DVAs below the randomly picked one.  Any that live
		 * on the same vdev as the pick take over as the preferred
		 * child: they were likely allocated from the primary metaslab
		 * in use at the time and so are more likely to have locality
		 * with single-copy data.  The comparison is always against
		 * the original pick, so the lowest matching index wins.
		 */
		picked = map->mm_preferred;
		lower = picked - 1;
		while (lower >= 0) {
			if (DVA_GET_VDEV(&dva[lower]) ==
			    DVA_GET_VDEV(&dva[picked]))
				map->mm_preferred = lower;
			lower--;
		}

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[i]));
			child->mc_offset = DVA_GET_OFFSET(&dva[i]);
		}
	}

	zio->io_vsd = map;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (map);
}
Example #2
0
/*
 * Allocate a mirror map for the zio, fill in each child's vdev and offset,
 * and attach the map to the zio as its vdev-specific data.
 *
 * With no vdev on the zio, the children come from the block pointer's DVAs;
 * otherwise they come from the vdev's children and share the zio's offset.
 *
 * Returns the initialized map.
 */
static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *map;
	vdev_t *vd = zio->io_vd;
	int i;

	if (vd != NULL) {
		/* Second argument: is this a replacing or spare vdev? */
		map = vdev_mirror_map_alloc(vd->vdev_children,
		    (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops), B_FALSE);

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vd->vdev_child[i];
			child->mc_offset = zio->io_offset;
		}
	} else {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		map = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[i]));
			child->mc_offset = DVA_GET_OFFSET(&dva[i]);
		}
	}

	zio->io_vsd = map;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (map);
}
Example #3
0
/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 *
 * arg is the spa_condensing_indirect_t being completed; it must be the
 * pool's current spa_condensing_indirect.  tx must be a syncing tx.
 *
 * On return the vdev refers to the new mapping, the old mapping object and
 * the previous obsolete space map object have been freed, the on-disk
 * condensing state has been removed from the pool directory, and
 * spa->spa_condensing_indirect is NULL.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	/* Capture both counts now; the old mapping is closed below. */
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	/*
	 * The vdev now owns the new mapping; clear the condensing state's
	 * pointer to it before that state is destroyed below.
	 */
	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	/*
	 * Cast every 64-bit argument explicitly so it matches %llu no
	 * matter which underlying type uint64_t is on this platform.
	 */
	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)dmu_tx_get_txg(tx),
	    (unsigned long long)vic->vic_mapping_object,
	    (unsigned long long)new_count,
	    (unsigned long long)old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}
Example #4
0
/*
 * DMU-facing wrapper around vdev_indirect_mark_obsolete().
 *
 * Marks the range offset:size on the top-level vdev identified by vdev_id
 * as obsolete.  It exists because the DMU does not know about vdev_t's, so
 * it cannot call vdev_indirect_mark_obsolete() itself.  tx must be a
 * syncing tx.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *top_vd;

	top_vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* Only indirect vdevs can be remapped by the DMU. */
	ASSERT3P(top_vd->vdev_ops, ==, &vdev_indirect_ops);

	vdev_indirect_mark_obsolete(top_vd, offset, size);
}
Example #5
0
File: zil.c Project: harshada/zfs
/*
 * Issue a flush to every top-level vdev recorded in the zilog's vdev tree,
 * emptying the tree as it goes, and wait for the flushes to finish.
 *
 * The caller must be the zl_writer.  Does nothing if the tree is empty.
 */
void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *vdev_tree = &zilog->zl_vdev_tree;
	void *walk_cookie = NULL;
	zil_vdev_node_t *node;
	zio_t *root_zio;

	ASSERT(zilog->zl_writer);

	/*
	 * zl_vdev_lock is not needed here: we are the zl_writer, and all
	 * zl_get_data() callbacks have completed.
	 */
	if (avl_numnodes(vdev_tree) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	root_zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	/* Drain the tree, flushing each vdev that can still be looked up. */
	for (;;) {
		vdev_t *vd;

		node = avl_destroy_nodes(vdev_tree, &walk_cookie);
		if (node == NULL)
			break;

		vd = vdev_lookup_top(spa, node->zv_vdev);
		if (vd != NULL)
			zio_flush(root_zio, vd);
		kmem_free(node, sizeof (*node));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so a failure here is
	 * deliberately ignored.
	 */
	(void) zio_wait(root_zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
Example #6
0
/*
 * Build the mirror map for a zio and attach it to the zio as its
 * vdev-specific data (io_vsd / io_vsd_ops).
 *
 * When the zio has no vdev, the map gets one child per DVA of the block
 * pointer and the preferred child is picked at random (biased toward
 * lower-index DVAs on the same vdev).  Otherwise the map gets one child per
 * child of the zio's vdev, and the preferred child is chosen among the
 * readable children with the fewest pending I/Os, rotating between equally
 * loaded children as time passes.
 *
 * Returns the newly allocated map.
 *
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		int lowest_pending = INT_MAX;
		int lowest_nr = 1;
		unsigned int slot;

		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		mm->mm_preferred = 0;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			/*
			 * Replacing/spare vdevs skip load balancing; the
			 * preferred child stays at 0.
			 */
			if (mm->mm_replacing)
				continue;

			/*
			 * Mark unreadable children as already tried and
			 * failed, and give them the worst possible pending
			 * count so they are only chosen when nothing
			 * readable beats them.
			 */
			if (!vdev_readable(mc->mc_vd)) {
				mc->mc_error = SET_ERROR(ENXIO);
				mc->mc_tried = 1;
				mc->mc_skipped = 1;
				mc->mc_pending = INT_MAX;
				continue;
			}

			/*
			 * Track the smallest pending count seen so far and
			 * how many children share it.
			 */
			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
			if (mc->mc_pending < lowest_pending) {
				lowest_pending = mc->mc_pending;
				lowest_nr = 1;
			} else if (mc->mc_pending == lowest_pending) {
				lowest_nr++;
			}
		}

		/*
		 * Pick which of the equally least-busy children to prefer,
		 * based on the current time slot.  The slot number is
		 * reduced in unsigned arithmetic: narrowing it to a signed
		 * int could go negative once it no longer fits, making the
		 * 1-based rank below non-positive so that no child would
		 * ever be selected.  A non-positive switch interval is
		 * treated as 1 to avoid dividing by zero.
		 */
		slot = (unsigned int)(gethrtime() / (NSEC_PER_USEC *
		    (zfs_vdev_mirror_switch_us > 0 ?
		    zfs_vdev_mirror_switch_us : 1)));
		d = (int)(slot % (unsigned int)lowest_nr) + 1;

		/* Select the d-th child whose pending count is the lowest. */
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			if (mc->mc_pending == lowest_pending) {
				if (--d == 0) {
					mm->mm_preferred = c;
					break;
				}
			}
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}
Example #7
0
/*
 * Allocate a mirror map for the zio, fill in each child's vdev and offset,
 * and attach the map to the zio as its vdev-specific data.
 *
 * With no vdev on the zio, the children come from the block pointer's DVAs.
 * Otherwise they come from the vdev's children, share the zio's offset, and
 * the map is told whether the vdev is currently being resilvered.
 *
 * Returns the initialized map.
 */
static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *map;
	vdev_t *vd = zio->io_vd;
	int i;

	if (vd != NULL) {
		boolean_t resilvering = B_FALSE;

		/*
		 * Scrub reads must be handled differently while we are
		 * resilvering: they should not be issued to the resilvering
		 * device, which might not have those blocks yet.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (i.e. our name is "replacing-1"
		 *    or "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * vd->vdev_resilver_txg cannot be used for this, because it
		 * is not set in this path.
		 *
		 * Checking vdev_ops alone is not enough either: there are
		 * cases (such as when a user types "zpool replace pool odev
		 * spare_dev" and spare_dev is in the spare list, or when a
		 * spare device is automatically used to replace a DEGRADED
		 * device) when resilvering is complete but both the original
		 * vdev and the spare vdev remain in the pool.  That behavior
		 * is intentional; it helps implement the policy that a spare
		 * should be automatically removed from the pool after the
		 * user replaces the device that originally failed.
		 *
		 * While a spa load is in progress spa_dsl_pool may be
		 * uninitialized, so the load state is checked before the
		 * pool is touched.  We shouldn't be resilvering during a spa
		 * load anyway.
		 */
		if (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) {
			if (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
			    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool))
				resilvering = B_TRUE;
		}

		map = vdev_mirror_map_alloc(vd->vdev_children, resilvering,
		    B_FALSE);

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vd->vdev_child[i];
			child->mc_offset = zio->io_offset;
		}
	} else {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		map = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[i]));
			child->mc_offset = DVA_GET_OFFSET(&dva[i]);
		}
	}

	zio->io_vsd = map;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (map);
}
Example #8
0
/*
 * zthr callback that performs (or resumes) the condense of the pool's
 * condensing indirect vdev.
 *
 * arg is the spa_t; zthr is the thread handle, consulted for cancellation.
 *
 * The function loads the obsolete counts for the old mapping, generates the
 * new mapping starting from wherever a previous run left off, and then,
 * unless the zthr was cancelled, runs the sync task that completes the
 * condense.  Always returns 0.
 */
/* ARGSUSED */
static int
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	/*
	 * Load the obsolete counts for the old mapping, folding in the
	 * previous obsolete space map, which is only needed while loading.
	 */
	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off.
		 * _entry_for_offset_or_next() returns a pointer into the
		 * vim_entries array.  A NULL return is taken to mean that
		 * max_offset is beyond every entry in the old mapping,
		 * i.e. we've exhausted our iteration of old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			/* Convert the entry pointer back to an array index. */
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return (0);

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));

	return (0);
}
Example #9
0
/*
 * Build the mirror map for a zio and attach it to the zio as its
 * vdev-specific data (io_vsd / io_vsd_ops).
 *
 * When the zio has no vdev, the map gets one child per DVA of the block
 * pointer and is flagged as a root map.  Otherwise the map gets one child
 * per child of the zio's vdev, records whether that vdev is currently being
 * resilvered, and every child uses the zio's own offset.
 *
 * Returns the newly allocated map.
 */
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *map;
	vdev_t *vd = zio->io_vd;
	int nchildren, i;

	if (vd != NULL) {
		nchildren = vd->vdev_children;

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
		    KM_SLEEP);
		map->mm_children = nchildren;

		/*
		 * Scrub reads must be handled differently while we are
		 * resilvering: they should not be issued to the resilvering
		 * device, which might not have those blocks yet.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (i.e. our name is "replacing-1"
		 *    or "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * vd->vdev_resilver_txg cannot be used for this, because it
		 * is not set in this path.
		 *
		 * Checking vdev_ops alone is not enough either: there are
		 * cases (such as when a user types "zpool replace pool odev
		 * spare_dev" and spare_dev is in the spare list, or when a
		 * spare device is automatically used to replace a DEGRADED
		 * device) when resilvering is complete but both the original
		 * vdev and the spare vdev remain in the pool.  That behavior
		 * is intentional; it helps implement the policy that a spare
		 * should be automatically removed from the pool after the
		 * user replaces the device that originally failed.
		 *
		 * While a spa load is in progress spa_dsl_pool may be
		 * uninitialized, so the load state is checked before the
		 * pool is touched.  We shouldn't be resilvering during a spa
		 * load anyway.
		 */
		map->mm_resilvering = B_FALSE;
		if ((vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) &&
		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool))
			map->mm_resilvering = B_TRUE;

		/*
		 * While resilvering always start with child 0; otherwise
		 * derive the starting child from the I/O offset.
		 */
		if (map->mm_resilvering) {
			map->mm_preferred = 0;
		} else {
			map->mm_preferred =
			    (zio->io_offset >> vdev_mirror_shift) % nchildren;
		}
		map->mm_root = B_FALSE;

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vd->vdev_child[i];
			child->mc_offset = zio->io_offset;
		}
	} else {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		int picked, lower;

		nchildren = BP_GET_NDVAS(zio->io_bp);

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
		    KM_SLEEP);
		map->mm_children = nchildren;
		map->mm_resilvering = B_FALSE;
		map->mm_preferred = spa_get_random(nchildren);
		map->mm_root = B_TRUE;

		/*
		 * Walk the DVAs below the randomly picked one.  Any that live
		 * on the same vdev as the pick take over as the preferred
		 * child: they were likely allocated from the primary metaslab
		 * in use at the time and so are more likely to have locality
		 * with single-copy data.  The comparison is always against
		 * the original pick, so the lowest matching index wins.
		 */
		picked = map->mm_preferred;
		lower = picked - 1;
		while (lower >= 0) {
			if (DVA_GET_VDEV(&dva[lower]) ==
			    DVA_GET_VDEV(&dva[picked]))
				map->mm_preferred = lower;
			lower--;
		}

		for (i = 0; i < map->mm_children; i++) {
			mirror_child_t *child = &map->mm_child[i];

			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[i]));
			child->mc_offset = DVA_GET_OFFSET(&dva[i]);
		}
	}

	zio->io_vsd = map;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (map);
}