Example #1
/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(zbookmark_t *zb, uint64_t type,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (record->zi_freq == 0 ||
			    spa_get_random(100) < record->zi_freq);
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    error == record->zi_error)
		return (record->zi_freq == 0 ||
		    spa_get_random(100) < record->zi_freq);

	return (B_FALSE);
}
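The zi_freq check above is a percentage gate: a record with zi_freq == 0 always matches, otherwise it matches roughly zi_freq percent of the time. A minimal standalone sketch of the same idiom, with rand() used purely as an illustrative stand-in for spa_get_random():

#include <stdlib.h>

/*
 * Fire "freq" percent of the time; freq == 0 means always.  Mirrors the
 * zi_freq gate in zio_match_handler() above.  rand() here only stands in
 * for spa_get_random(100), which yields a uniform value in [0, 100).
 */
static int
freq_gate(unsigned int freq)
{
	return (freq == 0 || (unsigned int)(rand() % 100) < freq);
}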
Example #2
File: zil.c Project: harshada/zfs
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
Example #3
/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
	}

	if (guid == 0) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			while (guid == 0 ||
			    spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();

	return (vd);
}
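The guid selection above is simple rejection sampling: keep drawing 64-bit random values until one is nonzero and not already claimed. A hedged sketch of that pattern; exists() and the rand()-based generator are illustrative stand-ins, not ZFS APIs:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative 64-bit random source standing in for spa_get_random(-1ULL). */
static uint64_t
example_random64(void)
{
	return (((uint64_t)rand() << 32) | (uint64_t)rand());
}

/*
 * Draw random guids until one is nonzero and not already in use, as
 * vdev_alloc_common() above does for both pool and vdev guids.
 */
static uint64_t
example_unique_guid(int (*exists)(uint64_t))
{
	uint64_t guid = 0;

	while (guid == 0 || exists(guid))
		guid = example_random64();
	return (guid);
}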
Example #4
File: mmp.c Project: LLNL/zfs
/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd;
	int label;
	uint64_t offset;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = mmp_random_leaf(spa->spa_root_vdev);
	if (vd == NULL) {
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	mutex_enter(&mmp->mmp_io_lock);

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();

	zio_t *zio  = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mutex_exit(&mmp->mmp_io_lock);

	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	spa_mmp_history_add(ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd,
	    label);

	zio_nowait(zio);
}
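The offset calculation above targets one of the last MMP_BLOCKS_PER_LABEL uberblock slots in a randomly chosen label. A rough sketch of that slot selection, with illustrative constants in place of the real VDEV_* macros:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative values only; the real ones come from the VDEV_* macros. */
#define	EXAMPLE_UB_COUNT	128	/* uberblock slots per label */
#define	EXAMPLE_UB_SIZE		1024	/* bytes per uberblock slot */
#define	EXAMPLE_MMP_BLOCKS	4	/* trailing slots reserved for MMP */
#define	EXAMPLE_LABELS		4	/* labels per vdev */

/*
 * Pick a random label and a random slot among the last EXAMPLE_MMP_BLOCKS
 * uberblock slots, as mmp_write_uberblock() above does.  rand() stands in
 * for spa_get_random().
 */
static void
example_pick_mmp_slot(int *labelp, uint64_t *offsetp)
{
	int slot = EXAMPLE_UB_COUNT - EXAMPLE_MMP_BLOCKS +
	    (rand() % EXAMPLE_MMP_BLOCKS);

	*labelp = rand() % EXAMPLE_LABELS;
	*offsetp = (uint64_t)slot * EXAMPLE_UB_SIZE;
}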
Example #5
/*
 * Determine if the underlying device is accessible by reading and writing
 * to a known location. We must be able to do this during syncing context
 * and thus we cannot set the vdev state directly.
 */
static int
vdev_file_probe(vdev_t *vd)
{
	vdev_t *nvd;
	char *vl_boot;
	uint64_t offset;
	int l, error = 0, retries = 0;

	if (vd == NULL)
		return (EINVAL);

	/* Hijack the current vdev */
	nvd = vd;

	/*
	 * Pick a random label to rewrite.
	 */
	l = spa_get_random(VDEV_LABELS);
	ASSERT(l < VDEV_LABELS);

	offset = vdev_label_offset(vd->vdev_psize, l,
	    offsetof(vdev_label_t, vl_boot_header));

	vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);

	while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
	    offset, UIO_READ)) != 0 && retries == 0) {

		/*
		 * If we failed with the vdev that was passed in then
		 * try allocating a new one and try again.
		 */
		nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
		if (vd->vdev_path)
			nvd->vdev_path = spa_strdup(vd->vdev_path);
		nvd->vdev_guid = vd->vdev_guid;
		retries++;

		if (vdev_file_open_common(nvd) != 0)
			break;
	}

	if ((spa_mode & FWRITE) && !error) {
		error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
		    offset, UIO_WRITE);
	}

	if (retries) {
		vdev_file_close(nvd);
		if (nvd->vdev_path)
			spa_strfree(nvd->vdev_path);
		kmem_free(nvd, sizeof (vdev_t));
	}
	kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);

	if (!error)
		vd->vdev_is_failing = B_FALSE;

	return (error);
}
Example #6
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		mm->mm_replacing = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		mm->mm_preferred = mm->mm_replacing ? 0 :
		    (zio->io_offset >> vdev_mirror_shift) % c;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}
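The mm_preferred logic above picks the starting child for reads: a random DVA when the zio carries no vdev, and an offset-derived rotation on a real mirror so sequential reads fan out across children. A sketch of the rotation rule; EXAMPLE_MIRROR_SHIFT is an illustrative constant standing in for vdev_mirror_shift:

#include <stdint.h>

/* Illustrative shift; vdev_mirror_shift is the real ZFS tunable. */
#define	EXAMPLE_MIRROR_SHIFT	21

/*
 * Rotate the preferred mirror child with the I/O offset so that large
 * sequential reads are spread across all children, as in
 * vdev_mirror_map_alloc() above.
 */
static int
example_preferred_child(uint64_t io_offset, int children)
{
	return ((int)((io_offset >> EXAMPLE_MIRROR_SHIFT) %
	    (uint64_t)children));
}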
Example #7
/*
 * Simulate hardware that ignores cache flushes.  For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}
Example #8
int
dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
{
	dmu_tx_t *tx;
	uint64_t txg;
	dsl_sync_task_t *dst;

top:
	tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
	VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));

	txg = dmu_tx_get_txg(tx);

	/* Do a preliminary error check. */
	dstg->dstg_err = 0;
#ifdef ZFS_DEBUG
	/*
	 * Only check half the time, otherwise, the sync-context
	 * check will almost never fail.
	 */
	if (spa_get_random(2) == 0)
		goto skip;
#endif
	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
		dst->dst_err =
		    dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
		if (dst->dst_err)
			dstg->dstg_err = dst->dst_err;
	}
	rw_exit(&dstg->dstg_pool->dp_config_rwlock);

	if (dstg->dstg_err) {
		dmu_tx_commit(tx);
		return (dstg->dstg_err);
	}
skip:

	/*
	 * We don't generally have many sync tasks, so pay the price of
	 * add_tail to get the tasks executed in the right order.
	 */
	VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
	    dstg, txg));

	dmu_tx_commit(tx);

	txg_wait_synced(dstg->dstg_pool, txg);

	if (dstg->dstg_err == EAGAIN) {
		txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
		goto top;
	}

	return (dstg->dstg_err);
}
Example #9
int
dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
{
	dmu_tx_t *tx;
	uint64_t txg;
	dsl_sync_task_t *dst;

top:
	tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
	VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));

	txg = dmu_tx_get_txg(tx);

	/* Do a preliminary error check. */
	dstg->dstg_err = 0;
	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
#ifdef ZFS_DEBUG
		/*
		 * Only check half the time, otherwise, the sync-context
		 * check will almost never fail.
		 */
		if (spa_get_random(2) == 0)
			continue;
#endif
		dst->dst_err =
		    dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
		if (dst->dst_err)
			dstg->dstg_err = dst->dst_err;
	}
	rw_exit(&dstg->dstg_pool->dp_config_rwlock);

	if (dstg->dstg_err) {
		dmu_tx_commit(tx);
		return (dstg->dstg_err);
	}

	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));

	dmu_tx_commit(tx);

	txg_wait_synced(dstg->dstg_pool, txg);

	if (dstg->dstg_err == EAGAIN)
		goto top;

	return (dstg->dstg_err);
}
Example #10
File: mmp.c Project: LLNL/zfs
/*
 * Choose a leaf vdev to write an MMP block to.  It must not have an
 * outstanding mmp write (if so then there is a problem, and a new write will
 * also block).  If there is no usable leaf in this subtree return NULL,
 * otherwise return a pointer to the leaf.
 *
 * When walking the subtree, a random child is chosen as the starting point so
 * that when the tree is healthy, the leaf chosen will be random with even
 * distribution.  If there are unhealthy vdevs in the tree, the distribution
 * will be really poor only if a large proportion of the vdevs are unhealthy,
 * in which case there are other more pressing problems.
 */
static vdev_t *
mmp_random_leaf(vdev_t *vd)
{
	int child_idx;

	if (!vdev_writeable(vd))
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd->vdev_mmp_pending == 0 ? vd : NULL);

	child_idx = spa_get_random(vd->vdev_children);
	for (int offset = vd->vdev_children; offset > 0; offset--) {
		vdev_t *leaf;
		vdev_t *child = vd->vdev_child[(child_idx + offset) %
		    vd->vdev_children];

		leaf = mmp_random_leaf(child);
		if (leaf)
			return (leaf);
	}

	return (NULL);
}
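The loop above visits every child exactly once but starts at a random index, so a healthy tree sees a roughly uniform choice of leaf. A minimal sketch of that traversal pattern in isolation; the names and the rand()-based index are illustrative, not ZFS APIs:

#include <stddef.h>
#include <stdlib.h>

/*
 * Visit every slot of cand[0..n-1] exactly once, starting from a random
 * index and wrapping around, returning the first usable entry or NULL.
 * Mirrors the (child_idx + offset) % n walk in mmp_random_leaf() above.
 */
static void *
example_pick_first_usable(void **cand, int n, int (*usable)(void *))
{
	int start = rand() % n;	/* stand-in for spa_get_random(n) */

	for (int offset = n; offset > 0; offset--) {
		void *c = cand[(start + offset) % n];

		if (usable(c))
			return (c);
	}
	return (NULL);
}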
Example #11
/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		int lowest_pending = INT_MAX;
		int lowest_nr = 1;

		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		mm->mm_preferred = 0;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			if (mm->mm_replacing)
				continue;

			if (!vdev_readable(mc->mc_vd)) {
				mc->mc_error = SET_ERROR(ENXIO);
				mc->mc_tried = 1;
				mc->mc_skipped = 1;
				mc->mc_pending = INT_MAX;
				continue;
			}

			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
			if (mc->mc_pending < lowest_pending) {
				lowest_pending = mc->mc_pending;
				lowest_nr = 1;
			} else if (mc->mc_pending == lowest_pending) {
				lowest_nr++;
			}
		}

		d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us);
		d = (d % lowest_nr) + 1;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			if (mm->mm_child[c].mc_pending == lowest_pending) {
				if (--d == 0) {
					mm->mm_preferred = c;
					break;
				}
			}
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}
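The tie-break above chooses among the children tied at the lowest pending-I/O count, and the choice rotates every zfs_vdev_mirror_switch_us microseconds so ties don't always resolve to the same child. A hedged sketch of that selection; the parameters here are illustrative, not ZFS APIs:

#include <stdint.h>

/*
 * Among n candidates, "ties" of which share the lowest pending-I/O count,
 * pick the k-th tied candidate, where k rotates with time, as
 * vdev_mirror_map_alloc() above does with zfs_vdev_mirror_switch_us.
 */
static int
example_pick_tied_child(const int *pending, int n, int lowest, int ties,
    uint64_t now_us, uint64_t switch_us)
{
	int k = (int)((now_us / switch_us) % (uint64_t)ties) + 1;

	for (int c = 0; c < n; c++) {
		if (pending[c] == lowest && --k == 0)
			return (c);
	}
	return (0);	/* not reached when the inputs are consistent */
}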
Example #12
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		mm->mm_resilvering = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		int replacing;

		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		/*
		 * If we are resilvering, then we should handle scrub reads
		 * differently; we shouldn't issue them to the resilvering
		 * device because it might not have those blocks.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
		 *    "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * We cannot simply check vd->vdev_resilver_txg, because it's
		 * not set in this path.
		 *
		 * Nor can we just check our vdev_ops; there are cases (such as
		 * when a user types "zpool replace pool odev spare_dev" and
		 * spare_dev is in the spare list, or when a spare device is
		 * automatically used to replace a DEGRADED device) when
		 * resilvering is complete but both the original vdev and the
		 * spare vdev remain in the pool.  That behavior is intentional.
		 * It helps implement the policy that a spare should be
		 * automatically removed from the pool after the user replaces
		 * the device that originally failed.
		 */
		replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		/*
		 * If a spa load is in progress, then spa_dsl_pool may be
		 * uninitialized.  But we shouldn't be resilvering during a spa
		 * load anyway.
		 */
		if (replacing &&
		    (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
			mm->mm_resilvering = B_TRUE;
		} else {
			mm->mm_resilvering = B_FALSE;
		}

		mm->mm_preferred = mm->mm_resilvering ? 0 :
		    (zio->io_offset >> vdev_mirror_shift) % c;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}